diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c069d5c1dc267c07e8c57e9e6b2403406c9793d..72ceab9a911a4bedf6fb9853784af799c1436b34 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ if (POLICY CMP0074) cmake_policy(SET CMP0074 OLD) endif () -list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/) +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/ /opt/rocm/hip/cmake) set(BOOST_INCLUDE ${Boost_INCLUDE_DIR} CACHE PATH "Include directory for BOOST") set(LIBHILBERT_ROOT CACHE PATH "LibHilbert root path") @@ -14,13 +14,10 @@ set(SE_CLASS3 CACHE BOOL "Activate compilation with SE_CLASS3") set(ENABLE_GPU CACHE BOOL "Disable the GPU code independently that a cuda compiler is found") set(TEST_PERFORMANCE CACHE BOOL "Enable test performance") set(ALPAKA_ROOT CACHE PATH "Alpaka root path") -set(CUDA_ON_CPU CACHE STRING "Make Cuda work on heap") +set(HIP_ENABLE CACHE BOOL "Enable HIP compiler") +set(AMD_ARCH_COMPILE "gfx900" CACHE STRING "AMD gpu architecture used to compile kernels") -if (ENABLE_GPU) - set(CUDA_ON_CPU OFF) - enable_language(CUDA) - find_package(CUDA) -endif() +add_subdirectory (../openfpm_devices/ openfpm_devices) set (CMAKE_CXX_STANDARD 14) set (CMAKE_CUDA_STANDARD 14) @@ -31,7 +28,8 @@ message("Searching Vc in ${Vc_DIR}") find_package(Boost 1.72.0 REQUIRED COMPONENTS unit_test_framework iostreams program_options system filesystem OPTIONAL_COMPONENTS fiber context) find_package(LibHilbert REQUIRED) find_package(Vc REQUIRED) - +find_package(HIP) +find_package(OpenMP) ###### CONFIG.h FILE ###### @@ -47,6 +45,11 @@ if(TEST_PERFORMANCE) set(DEFINE_PERFORMANCE_TEST "#define PERFORMANCE_TEST") endif() +if(HIP_FOUND) + set(DEFINE_HIP_GPU "#define HIP_GPU") + set(DEFINE_CUDIFY_USE_HIP "#define CUDIFY_USE_HIP") +endif() + if (Boost_FOUND) set(DEFINE_HAVE_BOOST "#define HAVE_BOOST") set(DEFINE_HAVE_BOOST_IOSTREAMS "#define HAVE_BOOST_IOSTREAMS") @@ -73,6 +76,10 @@ if(CUDA_ON_CPU) set(DEFINE_CUDA_GPU "#define 
CUDA_GPU") endif() +if(HIP_FOUND) + set(DEFINE_CUDA_GPU "#define CUDA_GPU") +endif() + if(LIBHILBERT_FOUND) set(DEFINE_HAVE_LIBHILBERT "#define HAVE_LIBHILBERT 1") else() @@ -82,9 +89,20 @@ endif() configure_file(${CMAKE_CURRENT_SOURCE_DIR}/src/config/config_cmake.h.in ${CMAKE_CURRENT_SOURCE_DIR}/src/config/config.h) -add_subdirectory (../openfpm_devices/ openfpm_devices) - set(WARNING_SUPPRESSION_AND_OPTION_NVCC ${WARNING_SUPPRESSION_AND_OPTION_NVCC} PARENT_SCOPE) set(WARNING_SUPPRESSION_AND_OPTION_NVCC_TEXT ${WARNING_SUPPRESSION_AND_OPTION_NVCC_TEXT} PARENT_SCOPE) add_subdirectory (src) + +set(DEFINE_HAVE_BOOST ${DEFINE_HAVE_BOOST} PARENT_SCOPE) +set(DEFINE_HAVE_BOOST_IOSTREAMS ${DEFINE_HAVE_BOOST_IOSTREAMS} PARENT_SCOPE) +set(DEFINE_HAVE_BOOST_PROGRAM_OPTIONS ${DEFINE_HAVE_BOOST_PROGRAM_OPTIONS} PARENT_SCOPE) +set(DEFINE_HAVE_BOOST_UNIT_TEST_FRAMEWORK ${DEFINE_HAVE_BOOST_UNIT_TEST_FRAMEWORK} PARENT_SCOPE) +set(DEFINE_HAVE_BOOST_CONTEXT ${DEFINE_HAVE_BOOST_CONTEXT} PARENT_SCOPE) +set(DEFINE_HAVE_BOOST_FIBER ${DEFINE_HAVE_BOOST_FIBER} PARENT_SCOPE) +set(DEFINE_HAVE_OPENMP ${DEFINE_HAVE_OPENMP} PARENT_SCOPE) +set(DEFINE_HAVE_ALPAKA ${DEFINE_HAVE_ALPAKA} PARENT_SCOPE) +set(DEFINE_CUDA_GPU ${DEFINE_CUDA_GPU} PARENT_SCOPE) +set(DEFINE_CUDIFY_BACKEND ${DEFINE_CUDIFY_BACKEND} PARENT_SCOPE) +set(OPTIONAL_BOOST_LIBS ${OPTIONAL_BOOST_LIBS} PARENT_SCOPE) + diff --git a/build.sh b/build.sh index 004afcdee2119b29e69f6ca8a4609c1355002ca2..6a9b7ec3c44ddf6eb63477a83fd0b8d842e8c7b0 100755 --- a/build.sh +++ b/build.sh @@ -7,7 +7,6 @@ hostname=$(hostname) type_compile=$3 branch=$4 - echo "Build on: $hostname with $type_compile branch: $branch" if [ x"$hostname" == x"cifarm-centos-node.mpi-cbg.de" ]; then @@ -30,6 +29,8 @@ if [ ! -d $HOME/openfpm_dependencies/openfpm_data/VCDEVEL ]; then ./install_VCDEVEL.sh $HOME/openfpm_dependencies/openfpm_data/ 4 fi +rm -rf $HOME/openfpm_dependencies/openfpm_data/BOOST + if [ ! 
-d $HOME/openfpm_dependencies/openfpm_data/BOOST ]; then if [ x"$hostname" == x"cifarm-mac-node" ]; then echo "Compiling for OSX" @@ -40,6 +41,8 @@ if [ ! -d $HOME/openfpm_dependencies/openfpm_data/BOOST ]; then fi fi +./install_CMAKE_on_CI.sh $HOME/openfpm_dependencies/openfpm_data 4 + mkdir /tmp/openfpm_data_$type_compile mv * .[^.]* /tmp/openfpm_data_$type_compile mv /tmp/openfpm_data_$type_compile openfpm_data @@ -56,7 +59,13 @@ cd "$workspace/openfpm_data" pre_command="" sh ./autogen.sh options="$options --disable-gpu " -options="$options --with-vcdevel=$HOME/openfpm_dependencies/openfpm_data/VCDEVEL --with-boost=$HOME/openfpm_dependencies/openfpm_data/BOOST --with-libhilbert=$HOME/openfpm_dependencies/openfpm_data/LIBHILBERT --enable-cuda_on_cpu" +options="$options --with-vcdevel=$HOME/openfpm_dependencies/openfpm_data/VCDEVEL --with-boost=$HOME/openfpm_dependencies/openfpm_data/BOOST --with-libhilbert=$HOME/openfpm_dependencies/openfpm_data/LIBHILBERT" + +if [ x"$hostname" == x"cifarm-mac-node" ]; then + options="$options --enable-cuda-on-cpu" +else + options="$options --with-cuda-on-backend=OpenMP" +fi if [ x"$3" == x"SE" ]; then options="$options --enable-se-class1 --enable-se-class2 --enable-se-class3 --with-action-on-error=throw --enable-test-coverage" diff --git a/configure b/configure index 878cf86f1a54d6824e530ffe53afc8a1882ac1c2..29474677fd8291e1521bd90ee5887a112929ccfe 100755 --- a/configure +++ b/configure @@ -100,6 +100,7 @@ enable_debug with_metis with_hdf5 with_libhilbert +with_cuda_on_backend enable_cuda_on_cpu enable_scan_coverty enable_test_performance @@ -123,6 +124,8 @@ with_eigen with_vcdevel enable_gpu enable_asan +enable_garbageinj +enable_garbageinjv ' rm -rf build @@ -240,7 +243,7 @@ do conf_options="$conf_options -DSCAN_COVERTY=ON" ;; cuda_on_cpu) - conf_options="$conf_options -DCUDA_ON_CPU=ON" + conf_options="$conf_options -DCUDA_ON_BACKEND=SEQUENTIAL" ;; test_performance) conf_options="$conf_options -DTEST_PERFORMANCE=ON" @@ -261,6 
+264,12 @@ do asan) conf_options="$conf_options -DENABLE_ASAN=ON" ;; + garbageinj) + conf_options="$conf_options -DENABLE_GARBAGE_INJECTOR=ON" + ;; + garbageinjv) + conf_options="$conf_options -DENABLE_VCLUSTER_GARBAGE_INJECTOR=ON" + ;; *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig" ac_unrecognized_sep=', ' ;; @@ -466,6 +475,9 @@ do ac_useropt_orig=$ac_useropt ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_useropt in + cuda_on_backend) + conf_options="$conf_options -DCUDA_ON_BACKEND=$ac_optarg" + ;; libhilbert) conf_options="$conf_options -DLIBHILBERT_ROOT=$ac_optarg" ;; @@ -578,6 +590,7 @@ cd build ## remove enerything echo "Calling cmake ../. $conf_options" +printf "cmake ../. $conf_options" > cmake_build_options rm ../error_code DYLD_LIBRARY_PATH=$ld_lib_pathopt cmake ../. $conf_options if [ $? != 0 ]; then @@ -606,6 +619,7 @@ clean: install: \$(MAKE) -C build \$@ + script/install_parallel_debugger pdata: \$(MAKE) -C build \$@ diff --git a/discover_os b/discover_os new file mode 100755 index 0000000000000000000000000000000000000000..0a5742ed9bba0f52c0fd1d39226159b793e61762 --- /dev/null +++ b/discover_os @@ -0,0 +1,39 @@ +#! 
/bin/bash + +function discover_os() { +platform=unknown +arch=$(uname -m) + + if [[ "$OSTYPE" == "linux-gnu" ]]; then + echo -e "We are on\033[1;34m LINUX \033[0m, with architecture \033[1;34m$arch\033[0m" + platform=linux + elif [[ "$OSTYPE" == "linux" ]]; then + echo -e "We are on\033[1;34m LINUX \033[0m, with architecture \033[1;34m$arch\033[0m" + platform=linux + elif [[ "$OSTYPE" == "darwin"* ]]; then + echo -e "We are on\033[1;34m MAC OSX \033[0m, with architecture \033[1;34m$arch\033[0m" + platform=osx + elif [[ "$OSTYPE" == "cygwin" ]]; then + echo -e "We are on\033[1;34m CYGWIN \033[0m, with architecture \033[1;34m$arch\033[0m" + platform=cygwin + elif [[ "$OSTYPE" == "msys" ]]; then + echo -e "We are on\033[1;34m Microsoft Windows \033[0m, with architecture \033[1;34m$arch\033[0m" + echo "This platform is not supported" + exit 1 + elif [[ "$OSTYPE" == "win32" ]]; then + echo -e "We are on\033[1;34m Microsoft Windows \033[0m, with architecture \033[1;34m$arch\033[0m" + echo "This platform is not supported" + exit 1 + elif [[ "$OSTYPE" == "freebsd"* ]]; then + echo -e "We are on\033[1;34m FREEBSD \033[0m, with architecture \033[1;34m$arch\033[0m" + echo "This platform is not supported" + exit 1 + else + echo -e "We are on an\033[1;34m unknown OS \033[0m, with architecture \033[1;34m$arch\033[0m" + echo "This platform is not supported" + exit 1 + fi + + +} + diff --git a/install_BOOST.sh b/install_BOOST.sh index 723da2c4458ce197211398b6687db9e0bad50675..d080b347e8710c4caa5098d820ec6b15796827d0 100755 --- a/install_BOOST.sh +++ b/install_BOOST.sh @@ -1,5 +1,8 @@ #!/bin/bash +source discover_os +discover_os + # check if the directory $1/BOOST exist if [ -d "$1/BOOST" ]; then @@ -7,9 +10,10 @@ if [ -d "$1/BOOST" ]; then exit 0 fi -wget http://ppmcore.mpi-cbg.de/upload/boost_1_72_0.tar.bz2 -tar -xvf boost_1_72_0.tar.bz2 -cd boost_1_72_0 +rm boost_1_75_0.tar.bz2 +wget http://ppmcore.mpi-cbg.de/upload/boost_1_75_0.tar.bz2 +tar -xvf boost_1_75_0.tar.bz2 +cd 
boost_1_75_0 if [ x"$4" != x"" ]; then if [ -f $HOME/user-config.jam ]; then mv $HOME/user-config.jam $HOME/user-config.jam_bck @@ -22,11 +26,25 @@ if [ x"$4" != x"" ]; then fi ./bootstrap.sh --with-toolset=$3 mkdir $1/BOOST -./b2 -j $2 install --prefix=$1/BOOST -rm -rf boost_1_72_0 +# Several flavours +if [ x"$platform" == x"osx" ]; then + if [ x"$arch" == x"arm64" ]; then + if [ x"$3" == x"" ]; then + ./b2 -j $2 install --prefix=$1/BOOST address-model=64 architecture=arm abi=aapcs binary-format=mach-o toolset=clang + else + ./b2 -j $2 install --prefix=$1/BOOST address-model=64 architecture=arm abi=aapcs binary-format=mach-o toolset=$3 + fi + else + ./b2 -j $2 install --prefix=$1/BOOST address-model=64 architecture=x86 abi=sysv binary-format=mach-o toolset=clang + fi +else + ./b2 -j $2 install --prefix=$1/BOOST +fi + +rm -rf boost_1_75_0 if [ -f $HOME/user-config.jam_bck ]; then mv $HOME/user-config.jam_bck $HOME/user-config.jam fi -rm -rf boost_1_72_0.tar.bz2 +rm -rf boost_1_75_0.tar.bz2 diff --git a/install_CMAKE_on_CI.sh b/install_CMAKE_on_CI.sh new file mode 100755 index 0000000000000000000000000000000000000000..d7492ad4088d8737727f255b2b042c06f6edd4a1 --- /dev/null +++ b/install_CMAKE_on_CI.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +if [ -d "$1/CMAKE" ]; then + exit 0 +fi + +wget https://github.com/Kitware/CMake/releases/download/v3.20.3/cmake-3.20.3.tar.gz +tar -xvf cmake-3.20.3.tar.gz +cd cmake-3.20.3 + +./bootstrap --prefix=$1/CMAKE +make +make install + + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 622e5cf14eab8af3008f1f3f5d3edabd3a5c50d7..4db6568e8a3fb87539b6a66c26e8dd9e28f45bca 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -21,7 +21,7 @@ if (TEST_PERFORMANCE) Vector/performance/vector_performance_test.cu) endif () -if (CUDA_FOUND OR CUDA_ON_CPU) +if (NOT CUDA_ON_BACKEND STREQUAL "None") set(CUDA_SOURCES ${CUDA_SOURCES} Vector/map_vector_sparse_unit_tests.cu Vector/vector_gpu_unit_tests.cu @@ -45,31 +45,55 @@ 
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") add_definitions("-DBOOST_MPL_CFG_HAS_TYPEOF") endif() -if (CUDA_ON_CPU) - add_definitions(-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 ) - # add_definitions(-D_GLIBCXX_PARALLEL) +if (CUDA_ON_BACKEND STREQUAL "OpenMP" OR CUDA_ON_BACKEND STREQUAL "SEQUENTIAL") + set_source_files_properties(${CUDA_SOURCES} PROPERTIES COMPILE_FLAGS "-D__NVCC__ -DCUDART_VERSION=11000") set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX) - set_source_files_properties(isolation.cu PROPERTIES LANGUAGE CXX) - if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Intel") - add_definitions("-x c++") - endif() - if (CUDA_ON_CPU STREQUAL "OpenMP") - add_definitions(-DCUDIFY_USE_OPENMP) + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + add_definitions("-x c++") endif() endif() -add_executable(mem_map ${CUDA_SOURCES} - main.cpp - data_type/aggregate_unit_tests.cpp - util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp - memory_ly/memory_conf_unit_tests.cpp - Space/tests/SpaceBox_unit_tests.cpp - Space/Shape/Sphere_unit_test.cpp +if ( HIP_ENABLE AND HIP_FOUND ) + + list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) + + if (CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND HIP_HIPCC_FLAGS -O0) + endif() + + list(APPEND HIP_HIPCC_FLAGS -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0) + set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX) + + set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) + + hip_add_executable(mem_map ${CUDA_SOURCES} + main.cpp + data_type/aggregate_unit_tests.cpp + util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp + memory_ly/memory_conf_unit_tests.cpp + 
Space/tests/SpaceBox_unit_tests.cpp + Space/Shape/Sphere_unit_test.cpp + SparseGrid/SparseGrid_unit_tests.cpp + SparseGrid/SparseGrid_chunk_copy_unit_tests.cpp + Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp + Grid/Geometry/tests/grid_smb_tests.cpp) + +else() + + add_executable(mem_map ${CUDA_SOURCES} + main.cpp + data_type/aggregate_unit_tests.cpp + util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp + memory_ly/memory_conf_unit_tests.cpp + Space/tests/SpaceBox_unit_tests.cpp + Space/Shape/Sphere_unit_test.cpp SparseGrid/SparseGrid_unit_tests.cpp SparseGrid/SparseGrid_chunk_copy_unit_tests.cpp - Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp + Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp Grid/Geometry/tests/grid_smb_tests.cpp) +endif() + set_property(TARGET mem_map PROPERTY CUDA_ARCHITECTURES 60 75) if (CUDA_FOUND) @@ -84,6 +108,11 @@ endif() add_dependencies(mem_map ofpmmemory) +if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + target_compile_options(mem_map PRIVATE "-Wno-undefined-var-template") + target_compile_options(mem_map PRIVATE "-Wno-macro-redefined") +endif() + if (CMAKE_COMPILER_IS_GNUCC) target_compile_options(mem_map PRIVATE "-Wno-deprecated-declarations") target_compile_options(mem_map PRIVATE $<$<COMPILE_LANGUAGE:CXX>: -mavx>) @@ -157,6 +186,7 @@ target_link_libraries(mem_map ${Boost_LIBRARIES}) target_link_libraries(mem_map -L${LIBHILBERT_LIBRARY_DIRS} ${LIBHILBERT_LIBRARIES}) target_link_libraries(mem_map ofpmmemory) target_link_libraries(mem_map ${Vc_LIBRARIES}) +target_link_libraries(mem_map OpenMP::OpenMP_CXX) if (CUDA_FOUND) target_link_libraries(isolation ${Boost_LIBRARIES}) @@ -405,12 +435,13 @@ install(FILES util/multi_array_openfpm/array_openfpm.hpp COMPONENT OpenFPM) -install(FILES util/cuda/scan_cuda.cuh - util/cuda/ofp_context.hxx +install(FILES util/cuda/ofp_context.hxx util/cuda/kernels.cuh util/cuda/scan_ofp.cuh util/cuda/sort_ofp.cuh util/cuda/reduce_ofp.cuh + 
util/cuda/segreduce_ofp.cuh + util/cuda/merge_ofp.cuh DESTINATION openfpm_data/include/util/cuda COMPONENT OpenFPM) diff --git a/src/Grid/cuda/cuda_grid_gpu_funcs.cuh b/src/Grid/cuda/cuda_grid_gpu_funcs.cuh index f28b58bda50106d1e5c2dd3596a142fefd67f104..b86d80d061b44504a5cc683d618bd2716b986c26 100644 --- a/src/Grid/cuda/cuda_grid_gpu_funcs.cuh +++ b/src/Grid/cuda/cuda_grid_gpu_funcs.cuh @@ -8,8 +8,11 @@ #ifndef CUDA_GRID_GPU_FUNCS_CUH_ #define CUDA_GRID_GPU_FUNCS_CUH_ +#include "config.h" +#include "util/cuda_launch.hpp" #include "map_grid_cuda_ker.cuh" + #if defined(CUDA_GPU) && defined(__NVCC__) template<unsigned int dim, typename grid_type> @@ -102,7 +105,7 @@ struct grid_toKernelImpl g.get_data_().disable_manage_memory(); g.get_data_().mem = gc.get_internal_data_().mem; // Increment the reference of mem - g.get_data_().mem->incRef(); + //g.get_data_().mem->incRef(); g.get_data_().mem_r.bind_ref(gc.get_internal_data_().mem_r); g.get_data_().switchToDevicePtr(); diff --git a/src/Grid/cuda/cuda_grid_gpu_tests.cu b/src/Grid/cuda/cuda_grid_gpu_tests.cu index d26103e01083762fc1eb377aac9c886df228bda4..bb0bb4af8e2a0cad0e04c1287d44b38785f9df65 100644 --- a/src/Grid/cuda/cuda_grid_gpu_tests.cu +++ b/src/Grid/cuda/cuda_grid_gpu_tests.cu @@ -32,6 +32,18 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func) auto gcf = c3.getGPUIterator(k1,k2); +#ifdef __HIP__ + + BOOST_REQUIRE_EQUAL(gcf.thr.x,8ul); + BOOST_REQUIRE_EQUAL(gcf.thr.y,8ul); + BOOST_REQUIRE_EQUAL(gcf.thr.z,4ul); + + BOOST_REQUIRE_EQUAL(gcf.wthr.x,8ul); + BOOST_REQUIRE_EQUAL(gcf.wthr.y,8ul); + BOOST_REQUIRE_EQUAL(gcf.wthr.z,16ul); + +#else + BOOST_REQUIRE_EQUAL(gcf.thr.x,16ul); BOOST_REQUIRE_EQUAL(gcf.thr.y,8ul); BOOST_REQUIRE_EQUAL(gcf.thr.z,8ul); @@ -40,12 +52,26 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func) BOOST_REQUIRE_EQUAL(gcf.wthr.y,8ul); BOOST_REQUIRE_EQUAL(gcf.wthr.z,8ul); +#endif + grid_key_dx<3> k3({50,50,50}); grid_key_dx<3> k4({62,62,62}); grid_key_dx<3> k5({60,61,62}); auto gcf2 = 
c3.getGPUIterator(k3,k4); +#ifdef __HIP__ + + BOOST_REQUIRE_EQUAL(gcf2.thr.x,8ul); + BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul); + BOOST_REQUIRE_EQUAL(gcf2.thr.z,4ul); + + BOOST_REQUIRE_EQUAL(gcf2.wthr.x,2ul); + BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul); + BOOST_REQUIRE_EQUAL(gcf2.wthr.z,4ul); + +#else + BOOST_REQUIRE_EQUAL(gcf2.thr.x,13ul); BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul); BOOST_REQUIRE_EQUAL(gcf2.thr.z,8ul); @@ -54,6 +80,8 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func) BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul); BOOST_REQUIRE_EQUAL(gcf2.wthr.z,2ul); +#endif + gcf2 = c3.getGPUIterator(k3,k4,511); BOOST_REQUIRE_EQUAL(gcf2.thr.x,8ul); diff --git a/src/Grid/cuda/map_grid_cuda_ker.cuh b/src/Grid/cuda/map_grid_cuda_ker.cuh index b638288614dbdf13fb3cdb6e84215133edc781f0..442507b12d7366da5ab03d77c7f027a3e125243d 100644 --- a/src/Grid/cuda/map_grid_cuda_ker.cuh +++ b/src/Grid/cuda/map_grid_cuda_ker.cuh @@ -303,7 +303,8 @@ public: * \return an encap_c that is the representation of the object (careful is not the object) * */ - __device__ inline encapc<dim,T_,layout> get_o(const grid_key_dx<dim> & v1) + template<typename Tk> + __device__ inline encapc<dim,T_,layout> get_o(const grid_key_dx<dim,Tk> & v1) { #ifdef SE_CLASS1 if (check_bound(v1) == false) @@ -323,7 +324,8 @@ public: * \return an encap_c that is the representation of the object (careful is not the object) * */ - __device__ inline const encapc<dim,T_,layout> get_o(const grid_key_dx<dim> & v1) const + template<typename Tk> + __device__ inline const encapc<dim,T_,layout> get_o(const grid_key_dx<dim,Tk> & v1) const { #ifdef SE_CLASS1 if (check_bound(v1) == false) @@ -424,7 +426,7 @@ public: * \param stop end point * */ - struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = 1024) const + struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = default_kernel_wg_threads_) const { return getGPUIterator_impl<dim>(g1,key1,key2,n_thr); } diff --git 
a/src/Grid/grid_base_implementation.hpp b/src/Grid/grid_base_implementation.hpp index 9ff639d69398fdc6012962817e54e42937294ee7..889b713237bc217fdd994c4d132ca22262c1c7f1 100644 --- a/src/Grid/grid_base_implementation.hpp +++ b/src/Grid/grid_base_implementation.hpp @@ -70,6 +70,37 @@ struct skip_init<true,T> #ifdef __NVCC__ +template<bool active> +struct copy_ndim_grid_device_active_impl + { + template<typename grid_type1, typename grid_type2, typename ite_gpu_type> + static inline void copy(grid_type1 & g1, grid_type2 & g2, ite_gpu_type & ite) + { + + } + + template<typename grid_type1, typename grid_type2, typename ite_gpu_type> + static inline void copy_block(grid_type1 & g1, grid_type2 & g2, ite_gpu_type & ite) + { + } +}; + +template<> +struct copy_ndim_grid_device_active_impl<true> +{ + template<typename grid_type1, typename grid_type2, typename ite_gpu_type> + static inline void copy(grid_type1 & g1, grid_type2 & g2, ite_gpu_type & ite) + { + CUDA_LAUNCH((copy_ndim_grid_device<grid_type1::dims,decltype(g1.toKernel())>),ite,g2.toKernel(),g1.toKernel()); + } + + template<typename grid_type1, typename grid_type2, typename ite_gpu_type> + static inline void copy_block(grid_type1 & g1, grid_type2 & g2, ite_gpu_type & ite) + { + CUDA_LAUNCH((copy_ndim_grid_block_device<grid_type1::dims,decltype(g1.toKernel())>),ite,g2.toKernel(),g1.toKernel()); + } +}; + template<unsigned int dim, typename ids_type = int> struct grid_p { @@ -177,27 +208,6 @@ struct grid_p<1,ids_type> #endif -template<unsigned int dim> -bool has_work_gpu(ite_gpu<dim> & ite) -{ - size_t tot_work = 1; - - if (dim == 1) - {tot_work *= ite.wthr.x * ite.thr.x;} - else if(dim == 2) - { - tot_work *= ite.wthr.x * ite.thr.x; - tot_work *= ite.wthr.y * ite.thr.y; - } - else - { - tot_work *= ite.wthr.x * ite.thr.x; - tot_work *= ite.wthr.y * ite.thr.y; - tot_work *= ite.wthr.z * ite.thr.z; - } - - return tot_work != 0; -} template<unsigned int dim> void move_work_to_blocks(ite_gpu<dim> & ite) @@ -459,6 +469,9 
@@ private: { #if defined(CUDA_GPU) && defined(__NVCC__) + // Compile time-cheking that make sense to call a GPU kernel to copy. + + grid_key_dx<dim> start; grid_key_dx<dim> stop; @@ -488,7 +501,7 @@ private: { if (blockSize == 1) { - CUDA_LAUNCH((copy_ndim_grid_device<dim,decltype(grid_new.toKernel())>),ite,this->toKernel(),grid_new.toKernel()); + copy_ndim_grid_device_active_impl<S::isDeviceHostSame() == false>::copy(grid_new,*this,ite); } else { @@ -496,7 +509,7 @@ private: ite.thr.x = blockSize; - CUDA_LAUNCH((copy_ndim_grid_block_device<dim,decltype(grid_new.toKernel())>),ite,this->toKernel(),grid_new.toKernel()); + copy_ndim_grid_device_active_impl<S::isDeviceHostSame() == false>::copy_block(grid_new,*this,ite); } } } @@ -514,7 +527,7 @@ private: auto ite = getGPUIterator_impl<1>(g_sm_copy,start,stop); - CUDA_LAUNCH((copy_ndim_grid_device<dim,decltype(grid_new.toKernel())>),ite,this->toKernel(),grid_new.toKernel()); + copy_ndim_grid_device_active_impl<S::isDeviceHostSame() == false>::copy(grid_new,*this,ite); } #else @@ -730,7 +743,7 @@ public: * \param stop end point * */ - struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim,long int> & key1, grid_key_dx<dim,long int> & key2, size_t n_thr = 1024) const + struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim,long int> & key1, grid_key_dx<dim,long int> & key2, size_t n_thr = default_kernel_wg_threads_) const { return getGPUIterator_impl<dim>(g1,key1,key2,n_thr); } @@ -875,6 +888,24 @@ public: return layout_base<T>::template get<p>(data_,g1,v1); } + /*! \brief No blocks here, it return 1 + * + * \return 1 + */ + int getBlockEdgeSize() + { + return 1; + } + + /*! \brief No blocks here, it does nothing + * + * \param nb unused + * \param nt unused + * + */ + void setGPUInsertBuffer(unsigned int nb, unsigned int nt) + {} + /*! 
\brief Get the const reference of the selected element * * \param v1 grid_key that identify the element in the grid diff --git a/src/Grid/grid_key.hpp b/src/Grid/grid_key.hpp index 5dd194abb53ee42534304721bf48698164d75137..2365757ba6f4b85a2ebcaab9835dab98e58431ce 100644 --- a/src/Grid/grid_key.hpp +++ b/src/Grid/grid_key.hpp @@ -441,7 +441,7 @@ public: * \return a string * */ - std::string to_string() + std::string to_string() const { return this->toPointS().toString(); } diff --git a/src/Grid/grid_sm.hpp b/src/Grid/grid_sm.hpp index ed1e1d9ea776d2e2d4b3c95cf1fb606d0c043bc5..a0588a177acc4edaedaa77916bb1f39c39ae0737 100755 --- a/src/Grid/grid_sm.hpp +++ b/src/Grid/grid_sm.hpp @@ -121,11 +121,33 @@ struct ite_gpu #endif }; +template<unsigned int dim> +bool has_work_gpu(ite_gpu<dim> & ite) +{ + size_t tot_work = 1; + + if (dim == 1) + {tot_work *= ite.wthr.x * ite.thr.x;} + else if(dim == 2) + { + tot_work *= ite.wthr.x * ite.thr.x; + tot_work *= ite.wthr.y * ite.thr.y; + } + else + { + tot_work *= ite.wthr.x * ite.thr.x; + tot_work *= ite.wthr.y * ite.thr.y; + tot_work *= ite.wthr.z * ite.thr.z; + } + + return tot_work != 0; +} + //! Declaration grid_sm template<unsigned int N, typename T> class grid_sm; -template<unsigned int dim, typename T2, typename T> -ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, size_t n_thr = 1024); +template<unsigned int dim, typename grid_sm_type, typename T> +ite_gpu<dim> getGPUIterator_impl(const grid_sm_type & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, size_t n_thr = default_kernel_wg_threads_); //! Declaration print_warning_on_adjustment template <unsigned int dim, typename linearizer> class print_warning_on_adjustment; @@ -733,7 +755,7 @@ public: return grid_key_dx_iterator_sub<N>(*this,start,stop); } -#ifdef CUDA_GPU +#if defined(CUDA_GPU) /*! 
\brief Get an iterator for the GPU * @@ -742,7 +764,7 @@ public: * */ template<typename T2> - struct ite_gpu<N> getGPUIterator(const grid_key_dx<N,T2> & key1, const grid_key_dx<N,T2> & key2, size_t n_thr = 1024) const + struct ite_gpu<N> getGPUIterator(const grid_key_dx<N,T2> & key1, const grid_key_dx<N,T2> & key2, size_t n_thr = default_kernel_wg_threads_) const { return getGPUIterator_impl<N>(*this,key1,key2,n_thr); } @@ -753,7 +775,7 @@ public: * \param stop end point * */ - struct ite_gpu<N> getGPUIterator(size_t n_thr = 1024) const + struct ite_gpu<N> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<N> k1; grid_key_dx<N> k2; @@ -812,8 +834,8 @@ public: }; -template<unsigned int dim, typename T2, typename T> -ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, const size_t n_thr) +template<unsigned int dim, typename grid_sm_type, typename T> +ite_gpu<dim> getGPUIterator_impl(const grid_sm_type & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, const size_t n_thr) { size_t tot_work = 1; for (size_t i = 0 ; i < dim ; i++) diff --git a/src/Grid/map_grid.hpp b/src/Grid/map_grid.hpp index e6965bae03acbbba006cfbde63c92b5e731a2289..b2fa40e037e74cc1b10b5ce948ada6d6cf50a06f 100755 --- a/src/Grid/map_grid.hpp +++ b/src/Grid/map_grid.hpp @@ -59,6 +59,15 @@ typedef HeapMemory CudaMemory; #endif +/*! \brief get the type of the SetBlock + * + * + */ +template<typename SGridGpu> +struct GetSetBlockType +{ + typedef typename SGridGpu::device_grid_type::container type; +}; /*! Stub grid class * @@ -185,12 +194,28 @@ public: rem_copy_opt opt = rem_copy_opt::NONE_OPT) {} +#if defined(__HIP__) + + /*! \brief It copy a grid + * + * \param g grid to copy + * + */ + __device__ grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g) + { + printf("Error grid_base operator= is not defined in device code\n"); + + return *this; + } + +#endif + /*! 
\brief It copy a grid * * \param g grid to copy * */ - grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g) + __host__ grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g) { (static_cast<grid_base_impl<dim,T,S, memory_traits_lin> *>(this))->swap(g.duplicate()); diff --git a/src/Grid/performance/grid_performance_tests.hpp b/src/Grid/performance/grid_performance_tests.hpp index e2878f28bfbc18683ae6a64d9af0e109e494dcb5..e3762c181afa51fe85af4e9b507afaad24df05c8 100644 --- a/src/Grid/performance/grid_performance_tests.hpp +++ b/src/Grid/performance/grid_performance_tests.hpp @@ -220,7 +220,8 @@ BOOST_AUTO_TEST_CASE(grid_performance_write_report) StandardXMLPerformanceGraph("grid_performance_funcs.xml",file_xml_ref,cg); - addUpdtateTime(cg,1); + addUpdateTime(cg,1,"data","grid_performance_funcs"); + createCommitFile("data"); cg.write("grid_performance_funcs.html"); } diff --git a/src/NN/CellList/CellListIterator_test.hpp b/src/NN/CellList/CellListIterator_test.hpp index 5e8266d0a98968f8f80ea02f5da1f264ffd19c7b..e20740ee174e8de948343a8d30c9b2ccc95d4f4e 100644 --- a/src/NN/CellList/CellListIterator_test.hpp +++ b/src/NN/CellList/CellListIterator_test.hpp @@ -12,6 +12,11 @@ #include "NN/CellList/ParticleIt_Cells.hpp" #include "NN/CellList/ParticleItCRS_Cells.hpp" +#ifdef OPENFPM_PDATA +#include "VCluster/VCluster.hpp" +#endif + + /*! 
\brief Fill the cell-list with particles in the box 0.0,1.0 * * \param k Number of particles @@ -96,6 +101,18 @@ BOOST_AUTO_TEST_CASE( celllist_lin_and_iterator_test ) BOOST_AUTO_TEST_CASE( celllist_hilb_and_iterator_test ) { +#ifdef OPENFPM_PDATA + + auto & v_cl = create_vcluster(); + + std::string c2 = std::string("openfpm_data/test_data/NN_hilb_keys"); + +#else + + std::string c2 = std::string("test_data/NN_hilb_keys"); + +#endif + ///////// INPUT DATA ////////// const size_t dim = 3; @@ -145,7 +162,7 @@ BOOST_AUTO_TEST_CASE( celllist_hilb_and_iterator_test ) openfpm::vector<size_t> keys_old; - keys_old.load("test_data/NN_hilb_keys"); + keys_old.load(c2); for (size_t i = 0; i < keys_old.size(); i++) { diff --git a/src/NN/CellList/CellList_gpu_test.cu b/src/NN/CellList/CellList_gpu_test.cu index 5be0523cdee78b88d2bcb249db293b2f8ff2bad2..55ad9a0517b87ecfb372da57c8b7febdac72470e 100644 --- a/src/NN/CellList/CellList_gpu_test.cu +++ b/src/NN/CellList/CellList_gpu_test.cu @@ -16,7 +16,6 @@ #include "CellList.hpp" #include "util/boost/boost_array_openfpm.hpp" #include "Point_test.hpp" -#include "util/cuda/scan_cuda.cuh" #include "util/cuda_util.hpp" BOOST_AUTO_TEST_SUITE( CellList_gpu_test ) @@ -1363,6 +1362,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props); cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m); + auto & s_t_ns = cl2.getSortToNonSort(); pl.template hostToDevice<0>(); @@ -1371,6 +1371,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di // Domain particles + auto & gdsi = cl2.getDomainSortIds(); gdsi.template deviceToHost<0>(); s_t_ns.template deviceToHost<0>(); @@ -1497,6 +1498,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di } BOOST_REQUIRE_EQUAL(check,true); + } } diff --git a/src/NN/CellList/CellList_test.hpp b/src/NN/CellList/CellList_test.hpp index 
1336823e542b51224b64ac7662070711aa3c754f..e93bb76f1cc342c05be9be9c5ceec312f320af0e 100644 --- a/src/NN/CellList/CellList_test.hpp +++ b/src/NN/CellList/CellList_test.hpp @@ -395,7 +395,7 @@ template<unsigned int dim, typename T, typename CellS> void Test_NN_iterator_rad for (size_t i = 0 ; i < dim ; i++) { - vrp.template get<0>(j)[i] = ((float)rand() / RAND_MAX)*(box.getHigh(i) - box.getLow(i)) + box.getLow(i); + vrp.template get<0>(j)[i] = ((float)rand() / (float)RAND_MAX)*(box.getHigh(i) - box.getLow(i)) + box.getLow(i); } } diff --git a/src/Packer_Unpacker/Packer.hpp b/src/Packer_Unpacker/Packer.hpp index dde5c85d1fd351f9d95173223488e7c636f26b42..f7d305066e8c82276409805c92771e7d0a5fb303 100644 --- a/src/Packer_Unpacker/Packer.hpp +++ b/src/Packer_Unpacker/Packer.hpp @@ -8,19 +8,21 @@ #ifndef SRC_PACKER_HPP_ #define SRC_PACKER_HPP_ +#include "util/cudify/cudify.hpp" + #include "util/object_util.hpp" //#include "Grid/util.hpp" #include "Vector/util.hpp" #include "memory/ExtPreAlloc.hpp" #include "util/util_debug.hpp" - #include "Grid/grid_sm.hpp" #include "util/Pack_stat.hpp" #include "Pack_selector.hpp" #include "has_pack_encap.hpp" #include "Packer_util.hpp" + template <typename> struct Debug; /*! 
\brief Packing class @@ -458,6 +460,33 @@ public: /////////////////////////////////// +template<typename Timpl> +struct copy_packer_chunk_arr_impl +{ + template<typename T, typename e_src, typename e_dst> + static inline void call(const e_src & src, size_t sub_id, e_dst & dst) + { + // Remove the reference from the type to copy + typedef typename boost::remove_reference<decltype(dst.template get< T::value >())>::type copy_rtype; + + meta_copy<copy_rtype>::meta_copy_(src.template get< T::value >()[sub_id],dst.template get< T::value >()); + } +}; + +template<unsigned int N1, typename Timpl> +struct copy_packer_chunk_arr_impl<Timpl[N1]> +{ + template<typename T, typename e_src, typename e_dst> + static inline void call(const e_src & src, size_t sub_id, e_dst & dst) + { + // Remove the reference from the type to copy + typedef typename boost::remove_reference<decltype(dst.template get< T::value >()[0])>::type copy_rtype; + + for (int i = 0 ; i < N1 ; i++) + {meta_copy<copy_rtype>::meta_copy_(src.template get< T::value >()[i][sub_id],dst.template get< T::value >()[i]);} + } +}; + //! 
It copy one element of the chunk for each property template<typename e_src, typename e_dst> struct copy_packer_chunk @@ -479,8 +508,7 @@ struct copy_packer_chunk */ inline copy_packer_chunk(const e_src & src, size_t sub_id, e_dst & dst) :src(src),dst(dst),sub_id(sub_id) - { - }; + {}; @@ -491,7 +519,9 @@ struct copy_packer_chunk // Remove the reference from the type to copy typedef typename boost::remove_reference<decltype(dst.template get< T::value >())>::type copy_rtype; - meta_copy<copy_rtype>::meta_copy_(src.template get< T::value >()[sub_id],dst.template get< T::value >()); + // meta_copy<copy_rtype>::meta_copy_(src.template get< T::value >()[sub_id],dst.template get< T::value >()); + + copy_packer_chunk_arr_impl<copy_rtype>::template call<T>(src,sub_id,dst); } }; diff --git a/src/Packer_Unpacker/Packer_util.hpp b/src/Packer_Unpacker/Packer_util.hpp index 0f0502f2f867e69b70bbe0c401c8defd6e34aba0..7c5c57c2ae9e1996bcbacacaae4484d5bcf58be2 100644 --- a/src/Packer_Unpacker/Packer_util.hpp +++ b/src/Packer_Unpacker/Packer_util.hpp @@ -21,6 +21,7 @@ template<typename T, typename Mem, int pack_type=Pack_selector<T>::value > class #include "Vector/vector_def.hpp" + /////////////////////////////////////////////////////////////////////////////////// //////////////////////////////// FUNCTORS FOR ENCAP //////////////////////////////// //////////////////////////////////////////////////////////////////////////////////// @@ -107,6 +108,27 @@ struct call_pack_enc_functor } }; +template<typename Timp> +struct call_pack_enc_functor_chunking_impl_arr +{ + template<typename T, typename Mem, typename encap> + static inline void call(ExtPreAlloc<Mem> & mem,const encap & obj, size_t sub_id, Pack_stat & sts) + { + Packer<typename Timp::value_type,Mem>::pack(mem,obj.template get<T::value>()[sub_id],sts); + } +}; + +template<unsigned int N1, typename Timp> +struct call_pack_enc_functor_chunking_impl_arr<Timp[N1]> +{ + template<typename T, typename Mem, typename encap> + static inline 
void call(ExtPreAlloc<Mem> & mem,const encap & obj, size_t sub_id, Pack_stat & sts) + { + for (int i = 0 ; i < N1 ; i++) + {Packer<typename Timp::value_type,Mem>::pack(mem,obj.template get<T::value>()[i][sub_id],sts);} + } +}; + //A functor for call_aggregatePack template<typename encap, typename Mem> struct call_pack_enc_functor_chunking @@ -132,9 +154,11 @@ struct call_pack_enc_functor_chunking template<typename T> inline void operator()(T& t) { - typedef typename boost::mpl::at<typename encap::type,T>::type::value_type obj_type; +// typedef typename boost::mpl::at<typename encap::type,T>::type::value_type obj_type; + + call_pack_enc_functor_chunking_impl_arr<typename boost::mpl::at<typename encap::type,T>::type>::template call<T>(mem,obj,sub_id,sts); - Packer<obj_type,Mem>::pack(mem,obj.template get<T::value>()[sub_id],sts); +// Packer<obj_type,Mem>::pack(mem,obj.template get<T::value>()[sub_id],sts); } }; @@ -193,6 +217,33 @@ struct call_unpack_encap_functor } }; +template<typename T> +struct call_unpack_encap_functor_chunking_array_selector +{ + template<typename T_value, typename Mem, typename encap> + static inline void unpack(ExtPreAlloc<Mem> & mem, const encap & obj, unsigned int sub_id, Unpack_stat & ps) + { + typedef typename boost::mpl::at<typename encap::type,T_value>::type::value_type obj_type; + + Unpacker<obj_type,Mem>::unpack(mem,obj.template get<T_value::value>()[sub_id],ps); + } +}; + +template<typename T, unsigned int N1> +struct call_unpack_encap_functor_chunking_array_selector<T[N1]> +{ + template<typename T_value, typename Mem, typename encap> + static inline void unpack(ExtPreAlloc<Mem> & mem, const encap & obj, unsigned int sub_id, Unpack_stat & ps) + { + for (int i = 0 ; i < N1 ; i++) + { + typedef typename std::remove_all_extents< typename boost::mpl::at<typename encap::type,T_value>::type >::type::value_type obj_type; + + Unpacker<obj_type,Mem>::unpack(mem,obj.template get<T_value::value>()[i][sub_id],ps); + } + } +}; + //A functor for 
call_aggregateUnpack template<typename encap, typename Mem, int ... prp> struct call_unpack_encap_functor_chunking @@ -212,9 +263,11 @@ struct call_unpack_encap_functor_chunking template<typename T> inline void operator()(T& t) { - typedef typename boost::mpl::at<typename encap::type,T>::type::value_type obj_type; + typedef typename boost::mpl::at<typename encap::type,T>::type obj_type; + +// Unpacker<obj_type,Mem>::unpack(mem,obj.template get<T::value>()[sub_id],ps); - Unpacker<obj_type,Mem>::unpack(mem,obj.template get<T::value>()[sub_id],ps); + call_unpack_encap_functor_chunking_array_selector<obj_type>::template unpack<T>(mem,obj,sub_id,ps); } }; diff --git a/src/Point_test.hpp b/src/Point_test.hpp index c41e0429ee1a321ab6f80acfa76324fbcc76906f..ede644fa7c19308f6103366d3c3cdcb796b7e1ec 100755 --- a/src/Point_test.hpp +++ b/src/Point_test.hpp @@ -335,7 +335,7 @@ public: * \return this * */ - inline Point_test<T> operator= (const Point_test<T> & p) + __device__ __host__ inline Point_test<T> & operator= (const Point_test<T> & p) { boost::fusion::at_c<0>(data) = boost::fusion::at_c<0>(p.data); boost::fusion::at_c<1>(data) = boost::fusion::at_c<1>(p.data); diff --git a/src/Space/Ghost.hpp b/src/Space/Ghost.hpp index 15c0af2e70ef26696b2ef25f7cd529a9bb8267c3..c5df897c0c8d33e2b72df4e3155b1cb72725539b 100644 --- a/src/Space/Ghost.hpp +++ b/src/Space/Ghost.hpp @@ -45,14 +45,14 @@ public: * \param g ghost * */ - template <typename S> inline Ghost(const Ghost<dim,S> & g) +/* template <typename S> inline Ghost(const Ghost<dim,S> & g) { for (size_t i = 0 ; i < dim ; i++) { this->setLow(i,g.getLow(i)); this->setHigh(i,g.getHigh(i)); } - } + }*/ /*! 
\brief Constructor from initializer list * diff --git a/src/Space/Shape/Point.hpp b/src/Space/Shape/Point.hpp index c6a229a92ed3af91c004c17f8df2bb9ceec71562..6a6cc1df9be84e8debb71474ee6e9c061e5b0ad8 100644 --- a/src/Space/Shape/Point.hpp +++ b/src/Space/Shape/Point.hpp @@ -11,6 +11,11 @@ #include "memory_ly/Encap.hpp" #include "Point_operators.hpp" +template<typename T> +struct native_encapsulated_type +{ + typedef T type; +}; /*! \brief This class implement the point shape in an N-dimensional space * @@ -41,6 +46,7 @@ template<unsigned int dim ,typename T> class Point //! Property id of the point static const unsigned int x = 0; + typedef typename native_encapsulated_type<T[dim]>::type type_native; /*! \brief Evaluate the expression and save the result on the point * @@ -199,7 +205,7 @@ template<unsigned int dim ,typename T> class Point * */ - __device__ __host__ inline T& operator[](size_t i) + __device__ __host__ inline T& operator[](unsigned int i) { return get(i); } @@ -212,7 +218,7 @@ template<unsigned int dim ,typename T> class Point * */ - __device__ __host__ inline const T& operator[](size_t i) const + __device__ __host__ inline const T& operator[](unsigned int i) const { return get(i); } @@ -487,7 +493,8 @@ template<unsigned int dim ,typename T> class Point * \return itself * */ - __device__ __host__ Point<dim,T> & operator=(const point_expression<T[dim]> & p_exp) + template<typename any> + __device__ __host__ Point<dim,T> & operator=(const point_expression<any> & p_exp) { p_exp.init(); @@ -702,4 +709,21 @@ template<typename T> struct is_Point<T, typename Void< typename T::yes_is_point>::type> : std::true_type {}; +/*! 
\brief like std::rank but it also work for openfpm structures like Point where it return 1 + * + * \tparam T structure to check + * + */ +template<typename T, bool is_point = is_Point<T>::value> +struct rank_gen +{ + typedef boost::mpl::int_<std::rank<T>::value> type; +}; + +template<typename T> +struct rank_gen<T,true> +{ + typedef boost::mpl::int_<1> type; +}; + #endif diff --git a/src/Space/Shape/Point_operators.hpp b/src/Space/Shape/Point_operators.hpp index 402a8c954f95849f0c4546ae329cf187f4aebf66..2ac668869492cb90772cee5ea19f9ca415b3dd21 100644 --- a/src/Space/Shape/Point_operators.hpp +++ b/src/Space/Shape/Point_operators.hpp @@ -55,64 +55,99 @@ template<unsigned int dim ,typename T> class Point; #define POINT_NEARBYINT 39 #define POINT_RINT 40 #define POINT_SUB_UNI 41 +#define POINT_NORM_INF 42 + /////////////////// Best cast rules //////////////////////// +template<bool cond, typename exp1, typename exp2> +struct first_or_second_pt +{ + typedef typename exp2::coord_type coord_type; +}; + +template<typename exp1, typename exp2> +struct first_or_second_pt<true,exp1,exp2> +{ + typedef typename exp1::coord_type coord_type; +}; + +template<typename T, typename Sfinae = void> +struct has_coordtype: std::false_type {}; + +/*! 
\brief has_data check if a type has defined a member data + * + * ### Example + * + * \snippet util_test.hpp Check has_data + * + * return true if T::type is a valid type + * + */ +template<typename T> +struct has_coordtype<T, typename Void<typename T::coord_type>::type> : std::true_type +{}; + template<typename source1, typename source2> -struct best_conv +struct best_conv_impl { typedef source1 type; }; - template<typename source2> -struct best_conv<int,source2> +struct best_conv_impl<int,source2> { typedef source2 type; }; template<typename source2> -struct best_conv<long int,source2> +struct best_conv_impl<long int,source2> { typedef source2 type; }; template<typename source2> -struct best_conv<unsigned int,source2> +struct best_conv_impl<unsigned int,source2> { typedef source2 type; }; template<typename source2> -struct best_conv<unsigned long int,source2> +struct best_conv_impl<unsigned long int,source2> { typedef source2 type; }; template<typename source1> -struct best_conv<source1,int> +struct best_conv_impl<source1,int> { typedef source1 type; }; template<typename source1> -struct best_conv<source1,long int> +struct best_conv_impl<source1,long int> { typedef source1 type; }; template<typename source1> -struct best_conv<source1,unsigned int> +struct best_conv_impl<source1,unsigned int> { typedef source1 type; }; template<typename source1> -struct best_conv<source1,unsigned long int> +struct best_conv_impl<source1,unsigned long int> { typedef source1 type; }; +template<typename source1, typename source2> +struct best_conv +{ + typedef typename best_conv_impl<typename std::remove_const<source1>::type,typename std::remove_const<source2>::type>::type type; +}; + /////////////////////////////////////////////////////////////// constexpr unsigned int max_expr(unsigned int dim1, unsigned int dim2) @@ -241,6 +276,8 @@ public: //! this operation produce a vector as result of size dims static const unsigned int nvals = 1; + typedef T coord_type; + /*! 
\brief constructor from a value * * \param d value @@ -266,7 +303,19 @@ public: * \return It just return the velue set in the constructor * */ - __device__ __host__ inline T value(const size_t k) const + __device__ __host__ inline const T & value(const int k) const + { + return d; + } + + /*! \brief Evaluate the expression + * + * \param k coordinate to evaluate + * + * \return It just return the velue set in the constructor + * + */ + __device__ __host__ inline T & value(const int k) { return d; } @@ -305,6 +354,9 @@ class point_expression_op<orig,exp1,exp2,POINT_SUM> public: + //! The type of the internal vector + typedef typename first_or_second_pt<has_coordtype<exp1>::value,exp1,exp2>::coord_type coord_type; + //! original type of the point expression typedef orig orig_type; @@ -365,6 +417,20 @@ public: init(); return o1.value(0) + o2.value(0); } + + /*! \brief Get the component i + * + * \param i component + * + * \return the i-component + * + */ + template<typename r_type=typename best_conv<typename std::remove_reference<decltype(o1.value(0))>::type, + typename std::remove_reference<decltype(o2.value(0))>::type>::type > + __device__ __host__ inline r_type operator[](size_t i) + { + return o1.value(i) + o2.value(i); + } }; /*! \brief Subtraction operation @@ -383,6 +449,9 @@ class point_expression_op<orig, exp1,exp2,POINT_SUB> public: + //! The type of the internal vector + typedef typename first_or_second_pt<has_coordtype<exp1>::value,exp1,exp2>::coord_type coord_type; + //! Original type typedef orig orig_type; @@ -443,6 +512,8 @@ public: init(); return o1.value(0) - o2.value(0); } + + }; /*! \brief expression that subtract two points @@ -478,6 +549,9 @@ public: //! result dimensionality of this expression static const unsigned int nvals = exp1::nvals; + //! The type of the internal vector + typedef typename first_or_second_pt<has_coordtype<exp1>::value,exp1,exp2>::coord_type coord_type; + /*! 
constructor from expression * * \param o1 expression1 @@ -537,6 +611,9 @@ class point_expression_op<orig,exp1,exp2,POINT_MUL_POINT> public: + //! The type of the internal vector + typedef typename first_or_second_pt<has_coordtype<exp1>::value,exp1,exp2>::coord_type coord_type; + //! base type of the expression typedef orig orig_type; @@ -617,6 +694,9 @@ class point_expression_op<orig,exp1,exp2,POINT_MUL> public: + //! The type of the internal vector + typedef typename first_or_second_pt<has_coordtype<exp1>::value,exp1,exp2>::coord_type coord_type; + //! origin type typedef orig orig_type; @@ -694,6 +774,10 @@ class point_expression_op<orig,exp1,exp2,POINT_DIV> public: + + //! The type of the internal vector + typedef typename first_or_second_pt<has_coordtype<exp1>::value,exp1,exp2>::coord_type coord_type; + //! original type typedef orig orig_type; @@ -1769,6 +1853,9 @@ public: //! this operation produce a vector as result of size dims static const unsigned int nvals = dim; + //! The type of the internal vector + typedef T coord_type; + /*! \brief constructor from an array * * \param d array of dimension dim @@ -1820,6 +1907,22 @@ public: return *this; } + /*! \brief Array operator + * + */ + __device__ __host__ T & operator[](int n) + { + return d[n]; + } + + /*! \brief Array operator + * + */ + __device__ __host__ const T & operator[](int n) const + { + return d[n]; + } + /*! \brief This function must be called before value * * it calculate the scalar product before return the values @@ -1868,6 +1971,9 @@ public: //! this operation produce a vector as result of size dims static const unsigned int nvals = dim; + //! The type of the internal vector + typedef T coord_type; + /*! \brief construct from an array of dimension dim * * \param d array @@ -1887,6 +1993,14 @@ public: { } + /*! \brief Array operator + * + */ + T operator[](int n) const + { + return d[n]; + } + /*! 
\brief Evaluate the expression at coordinate k * * It just return the value set in the constructor @@ -1902,7 +2016,7 @@ public: } }; -/*! \brief Specialization for a const array of dimension dim +/*! \brief Specialization for views * * \tparam T type of the array * \tparam dim dimensionality of the array @@ -1912,7 +2026,7 @@ template<typename T, typename vmpl> class point_expression<openfpm::detail::multi_array::sub_array_openfpm<T,1,vmpl>> { //! array view of dimension dim - const openfpm::detail::multi_array::const_sub_array_openfpm<T,1,vmpl,const T *> d; + mutable openfpm::detail::multi_array::sub_array_openfpm<T,1,vmpl> d; public: @@ -1925,13 +2039,40 @@ public: //! this operation produce a vector as result of size dims static const unsigned int nvals = subar_dim<vmpl>::type::value; + //! The type of the internal vector + typedef T coord_type; + + /*! \brief Operator= for point expression + * + * \tparam orig origin type + * \tparam exp1 expression 1 + * \tparam exp2 expression 2 + * \tparam op operation + * + * \param point expression + * + * \return a point expression + * + */ + template<typename orig, typename exp1, typename exp2, unsigned int op> + __device__ __host__ point_expression<openfpm::detail::multi_array::sub_array_openfpm<T,1,vmpl>> & + operator=(const point_expression_op<orig,exp1,exp2,op> & p_exp) + { + p_exp.init(); + + for (size_t i = 0; i < nvals ; i++) + {d[i] = p_exp.value(i);} + + return *this; + } + /*! \brief construct from an array of dimension dim * * \param d array * */ __device__ __host__ inline point_expression(const openfpm::detail::multi_array::sub_array_openfpm<T,1,vmpl> & d) - :d(d.origin(),d.strides()) + :d(d.origin_mutable(),d.strides()) { } @@ -1944,6 +2085,26 @@ public: { } + /*! \brief Same as value() + * + * \param k coordinate + * + */ + __device__ __host__ inline T & operator[](int k) + { + return value(k); + } + + /*! 
\brief Same as value() + * + * \param k coordinate + * + */ + __device__ __host__ inline T & operator[](int k) const + { + return value(k); + } + /*! \brief Evaluate the expression at coordinate k * * It just return the value set in the constructor @@ -1953,7 +2114,21 @@ public: * \return the value * */ - __device__ __host__ inline T value(const size_t k) const + __device__ __host__ inline T & value(const int k) + { + return d[k]; + } + + /*! \brief Evaluate the expression at coordinate k + * + * It just return the value set in the constructor + * + * \param k coordinate + * + * \return the value + * + */ + __device__ __host__ inline T & value(const int k) const { return d[k]; } diff --git a/src/Space/Shape/Point_operators_functions.hpp b/src/Space/Shape/Point_operators_functions.hpp index 8b95323abbff38558194b7ad16a3c9e4716d27ae..5b84af14c1c92d97a73b57960b9a27e20abb3537 100644 --- a/src/Space/Shape/Point_operators_functions.hpp +++ b/src/Space/Shape/Point_operators_functions.hpp @@ -28,6 +28,9 @@ public:\ typedef typename orig::coord_type return_type;\ \ static const unsigned int nvals = exp1::nvals;\ + \ + typedef typename first_or_second_pt<has_coordtype<exp1>::value,exp1,exp2>::coord_type coord_type;\ + \ \ __device__ __host__ inline explicit point_expression_op(const exp1 & o1)\ :o1(o1),scal(0)\ @@ -77,6 +80,84 @@ fun_name(const Point<dim,T> & va)\ return exp_sum;\ } +/*! \brief Point norm Infinity operation + * + * \tparam orig original type + * \tparam exp1 expression1 + * \tparam exp2 expression2 + * \tparam op operation + * + */ +template <typename orig, typename exp1, typename exp2> +class point_expression_op<orig,exp1,exp2,POINT_NORM_INF> +{ + const exp1 o1; + + //! Scalar value + mutable typename std::remove_const<typename orig::coord_type>::type scal; + +public: + + //! Origin type + typedef orig orig_type; + + //! indicate that this class encapsulate an expression + typedef int is_expression; + + //! 
indicate that init must be called before value + typedef int has_init; + + //! return type of the expression + typedef typename orig::coord_type return_type; + + //! this operation produce a scalar as result + static const unsigned int nvals = 1; + + typedef typename exp1::coord_type coord_type; + + //! Constructor from expression + __device__ __host__ inline explicit point_expression_op(const exp1 & o1) + :o1(o1),scal(0.0) + {} + + /*! \brief This function must be called before value + * + * it calculate the scalar product before return the values + * + */ + __device__ __host__ inline void init() const + { + scal = 0.0; + + for (size_t i = 0 ; i < orig::dims ; i++) { + if (fabs(o1.value(i)) > scal) { + scal=fabs(o1.value(i)); + } + } + //scal += o1.value(i) * o1.value(i); + + //scal = sqrt(scal); + + o1.init(); + } + + /*! \brief Evaluate the expression + * + * \param key where to evaluate the expression + * + */ + template<typename r_type=typename std::remove_reference<decltype(o1.value(0))>::type > __device__ __host__ inline r_type value(size_t k) const + { + return scal; + } + + template <typename T>__device__ __host__ operator T() const + { + init(); + return scal; + } +}; + /*! \brief Point norm operation * @@ -111,6 +192,8 @@ public: //! this operation produce a scalar as result static const unsigned int nvals = 1; + typedef typename exp1::coord_type coord_type; + //! Constructor from expression __device__ __host__ inline explicit point_expression_op(const exp1 & o1) :o1(o1),scal(0.0) @@ -183,6 +266,8 @@ public: //! this operation produce a vector as result of size dims static const unsigned int nvals = 1; + typedef typename exp1::coord_type coord_type; + //! 
constructor from an expression __device__ __host__ inline explicit point_expression_op(const exp1 & o1) :o1(o1),scal(0.0) @@ -421,6 +506,84 @@ template <typename T, typename P> __device__ __host__ auto distance(T exp1, P e } + +//////////////////////////////////////// normINF operation ////////////////// + +/* \brief Calculate the norm of the point + * + * \param va point expression one + * + * \return an object that encapsulate the expression + * + */ +template<typename orig,typename exp1, typename exp2, unsigned int op1> +__device__ __host__ inline point_expression_op<orig,point_expression_op<orig,exp1,exp2,op1>,void,POINT_NORM_INF> +norm_inf(const point_expression_op<orig,exp1,exp2,op1> & va) +{ + point_expression_op<orig,point_expression_op<orig,exp1,exp2,op1>,void,POINT_NORM_INF> exp_sum(va); + + return exp_sum; +} + + +/* \brief norm of a scalar + * + * \param d scalar double or float + * + * \return d + * + */ +template <typename T>__device__ __host__ T norm_inf(T d) +{ + return abs(d); +} + + +/* \brief Calculate the norm of the point + * + * \param va point expression one + * + * \return an object that encapsulate the expression + * + */ +template<unsigned int dim, typename T> +__device__ __host__ inline point_expression_op<Point<dim,T>,point_expression<T[dim]>,void,POINT_NORM_INF> +norm_inf(const point_expression<T[dim]> & d) +{ + point_expression_op<Point<dim,T>,point_expression<T[dim]>,void,POINT_NORM_INF> exp_sum( (point_expression<T[dim]>(d)) ); + + return exp_sum; +} + + +/* \brief Divide two points expression + * + * \param va point expression one + * \param vb point expression two + * + * \return an object that encapsulate the expression + * + */ +template<unsigned int dim, typename T> +__device__ __host__ inline point_expression_op<Point<dim,T>,Point<dim,T>,void,POINT_NORM_INF> +norm_inf(const Point<dim,T> & va) +{ + point_expression_op<Point<dim,T>,Point<dim,T>,void,POINT_NORM_INF> exp_sum(va); + + return exp_sum; +} + + +/*! 
\brief General distance formula + * + * + */ +template <typename T, typename P> __device__ __host__ auto distance_inf(T exp1, P exp2) -> decltype(norm_inf(exp1 - exp2)) +{ + return norm_inf(exp1 - exp2); +} + + ///// Here we define point operator functions CREATE_ARG_FUNC(std::abs,abs,POINT_ABS) diff --git a/src/Space/Shape/Point_unit_test.hpp b/src/Space/Shape/Point_unit_test.hpp index d903a5354e0cd53963ac2a189f3e1ea56a17c38f..5e61e279a614c22d721eae81405a2b90c9533696 100644 --- a/src/Space/Shape/Point_unit_test.hpp +++ b/src/Space/Shape/Point_unit_test.hpp @@ -554,7 +554,7 @@ BOOST_AUTO_TEST_CASE( Point_expression_usage_with_conversion ) double tmp = 5 + (p1 * p2); double check = 5 + p1.get(0)*p2.get(0) + p1.get(1)*p2.get(1) + p1.get(2)*p2.get(2); BOOST_REQUIRE_EQUAL(tmp,check); - p3 = 5 + (p1 * p2); + p3 = 5.0 + (p1 * p2); for (size_t i = 0 ; i < 3 ; i++) {BOOST_REQUIRE_EQUAL(p3.get(i),check) ;} tmp = 5 - (p1 * p2); check = 5 - p1.get(0)*p2.get(0) - p1.get(1)*p2.get(1) - p1.get(2)*p2.get(2); diff --git a/src/Space/SpaceBox.hpp b/src/Space/SpaceBox.hpp index 0b4fdb8842c2f18009bb6f18697d13320d96edca..b595b3e26bda5f1c78d8ddd1a40b8700dbefebf1 100644 --- a/src/Space/SpaceBox.hpp +++ b/src/Space/SpaceBox.hpp @@ -211,7 +211,7 @@ class SpaceBox : public Box<dim,T> Point<dim,T> p; for (size_t i = 0 ; i < dim ; i++) - p.get(i) = ((T)rand())/RAND_MAX * (this->getHigh(i) - this->getLow(i)) + this->getLow(i); + p.get(i) = ((T)rand())/(T)RAND_MAX * (this->getHigh(i) - this->getLow(i)) + this->getLow(i); return p; } diff --git a/src/SparseGrid/SparseGrid.hpp b/src/SparseGrid/SparseGrid.hpp index 6bbcb4028d38a4f2b8a4d4c29fb21160c331c35b..f2d0beaf62816e406c73b9ef88e936f2ae0f2cf0 100644 --- a/src/SparseGrid/SparseGrid.hpp +++ b/src/SparseGrid/SparseGrid.hpp @@ -28,83 +28,8 @@ #include "VTKWriter/VTKWriter.hpp" #endif -template<typename chunk_def> -struct sparse_grid_bck_value -{ - chunk_def bck; - - sparse_grid_bck_value(chunk_def bck) - :bck(bck) - {} - - template<unsigned int p> 
- auto get() -> decltype(bck.template get<p>()[0]) - { - return bck.template get<p>()[0]; - } - - template<unsigned int p> - auto get() const -> decltype(bck.template get<p>()[0]) - { - return bck.template get<p>()[0]; - } -}; - -template<typename ArrTypeView> -struct std_array_vector_view -{ - int pos; - ArrTypeView arr; - - std_array_vector_view(int pos,ArrTypeView arr) - :pos(pos),arr(arr) - {} - - decltype(arr[0][0]) operator[](int comp) - { - return arr[comp][pos]; - } - - decltype(std::declval<const ArrTypeView>()[0][0]) operator[](int comp) const - { - return arr[comp][pos]; - } -}; - -template<typename T> -struct get_selector -{ - template<unsigned int p, typename chunks_vector_type> - static T & get(chunks_vector_type & chunks, size_t active_cnk, int ele_id) - { - return chunks.template get<p>(active_cnk)[ele_id]; - } - - template<unsigned int p, typename chunks_vector_type> - static const T & get_const(chunks_vector_type & chunks, size_t active_cnk, int ele_id) - { - return chunks.template get<p>(active_cnk)[ele_id]; - } -}; - -template<typename T, unsigned int N1> -struct get_selector<T[N1]> -{ - template<unsigned int p, typename chunks_vector_type> - static std_array_vector_view<decltype(std::declval<chunks_vector_type>().template get<p>(0))> get(chunks_vector_type & chunks, size_t active_cnk, int ele_id) - { - return std_array_vector_view<decltype(chunks.template get<p>(active_cnk))>(ele_id,chunks.template get<p>(active_cnk)); - } - - template<unsigned int p, typename chunks_vector_type> - static const std_array_vector_view<decltype(std::declval<chunks_vector_type>().template get<p>(0))> get_const(chunks_vector_type & chunks, size_t active_cnk, int ele_id) - { - return std_array_vector_view<decltype(chunks.template get<p>(active_cnk))>(ele_id,chunks.template get<p>(active_cnk)); - } -}; - template<typename Tsrc,typename Tdst> class copy_bck @@ -974,7 +899,11 @@ public: void setBackgroundValue(const typename boost::mpl::at<typename 
T::type,boost::mpl::int_<p>>::type & val) { for (int i = 0 ; i < chunking::size::value ; i++) - {meta_copy<typename boost::mpl::at<typename T::type,boost::mpl::int_<p>>::type>::meta_copy_(val,chunks.get(0).template get<p>()[i]);} + { + meta_copy<typename boost::mpl::at<typename T::type,boost::mpl::int_<p>>::type>:: + meta_copy_(val, + get_selector<typename boost::mpl::at<typename T::type,boost::mpl::int_<p>>::type>::template get<p>(chunks,0,i)); + } } /*! \brief Get the background value @@ -2018,9 +1947,13 @@ public: size_t pos_src_id = key_src_s.getPos(); size_t pos_dst_id; - auto block_src = grid_src.getBlock(key_src_s); + + ///////// WARNING insert_o MUST be done before getBlock, otherwise if grid_src == *this, and insert_o produce a reallocation + ///////// block_src will be invalidated ////// auto block_dst = this->insert_o(key_dst,pos_dst_id); + auto block_src = grid_src.getBlock(key_src_s); + copy_sparse_to_sparse_bb<dim,decltype(block_src),decltype(block_dst),T> caps(block_src,block_dst,pos_src_id,pos_dst_id); boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(caps); diff --git a/src/SparseGrid/SparseGridUtil.hpp b/src/SparseGrid/SparseGridUtil.hpp index 98841c1e2872485ed2b65e6a62af9b6b565c1a8c..f5da68ef3b7033d227f26b2947becee0076c36fc 100644 --- a/src/SparseGrid/SparseGridUtil.hpp +++ b/src/SparseGrid/SparseGridUtil.hpp @@ -608,4 +608,59 @@ struct sublin<8,chunk> }; +template<typename chunk_def> +struct sparse_grid_bck_value +{ + chunk_def bck; + + sparse_grid_bck_value(chunk_def bck) + :bck(bck) + {} + + template<unsigned int p> + auto get() -> decltype(bck.template get<p>()[0]) + { + return bck.template get<p>()[0]; + } + + template<unsigned int p> + auto get() const -> decltype(bck.template get<p>()[0]) + { + return bck.template get<p>()[0]; + } +}; + + +template<typename T> +struct get_selector +{ + template<unsigned int p, typename chunks_vector_type> + static T & get(chunks_vector_type & chunks, size_t active_cnk, int ele_id) + { + 
return chunks.template get<p>(active_cnk)[ele_id]; + } + + template<unsigned int p, typename chunks_vector_type> + static const T & get_const(chunks_vector_type & chunks, size_t active_cnk, int ele_id) + { + return chunks.template get<p>(active_cnk)[ele_id]; + } +}; + +template<typename T, unsigned int N1> +struct get_selector<T[N1]> +{ + template<unsigned int p, typename chunks_vector_type> + static std_array_vector_view<decltype(std::declval<chunks_vector_type>().template get<p>(0))> get(chunks_vector_type & chunks, size_t active_cnk, int ele_id) + { + return std_array_vector_view<decltype(chunks.template get<p>(active_cnk))>(ele_id,chunks.template get<p>(active_cnk)); + } + + template<unsigned int p, typename chunks_vector_type> + static const std_array_vector_view<decltype(std::declval<chunks_vector_type>().template get<p>(0))> get_const(chunks_vector_type & chunks, size_t active_cnk, int ele_id) + { + return std_array_vector_view<decltype(chunks.template get<p>(active_cnk))>(ele_id,chunks.template get<p>(active_cnk)); + } +}; + #endif /* OPENFPM_DATA_SRC_SPARSEGRID_SPARSEGRIDUTIL_HPP_ */ diff --git a/src/SparseGrid/SparseGrid_chunk_copy.hpp b/src/SparseGrid/SparseGrid_chunk_copy.hpp index 41b8ff729dc227a245ec9dd00d0b0682d06007eb..8fe4d603a0af628f87275bc144c9995b0a8a9ad3 100644 --- a/src/SparseGrid/SparseGrid_chunk_copy.hpp +++ b/src/SparseGrid/SparseGrid_chunk_copy.hpp @@ -8,9 +8,11 @@ #ifndef SPARSEGRID_CHUNK_COPY_HPP_ #define SPARSEGRID_CHUNK_COPY_HPP_ -#if !defined(__NVCC__) || defined(CUDA_ON_CPU) +#if !defined(__NVCC__) || defined(CUDA_ON_CPU) || defined(__HIP__) +// Nvcc does not like VC ... 
for some reason #include <Vc/Vc> #endif + #include "util/mathutil.hpp" diff --git a/src/SparseGrid/SparseGrid_conv_opt.hpp b/src/SparseGrid/SparseGrid_conv_opt.hpp index 7d50d97fb3d80c420ac00613a4bbcee064ebd7e8..c8e702950612359161e0b9e83207d283c723c8d8 100644 --- a/src/SparseGrid/SparseGrid_conv_opt.hpp +++ b/src/SparseGrid/SparseGrid_conv_opt.hpp @@ -110,7 +110,7 @@ struct conv_impl } }; -#if !defined(__NVCC__) || defined(CUDA_ON_CPU) +#if !defined(__NVCC__) || defined(CUDA_ON_CPU) || defined(__HIP__) template<unsigned int dir,int p, unsigned int prop_src1,typename chunk_type, typename vect_type, typename ids_type> @@ -199,14 +199,16 @@ void load_crs_v(vect_type & cs1, chunk_type & chunk, ids_type & ids) } } + +template<typename prop_type> struct cross_stencil_v { - Vc::double_v xm; - Vc::double_v xp; - Vc::double_v ym; - Vc::double_v yp; - Vc::double_v zm; - Vc::double_v zp; + Vc::Vector<prop_type> xm; + Vc::Vector<prop_type> xp; + Vc::Vector<prop_type> ym; + Vc::Vector<prop_type> yp; + Vc::Vector<prop_type> zm; + Vc::Vector<prop_type> zp; }; template<> @@ -421,7 +423,7 @@ struct conv_impl<3> for (int k = 0 ; k < sx::value ; k += Vc::Vector<prop_type>::Size) { // we do only id exist the point - if (*(int *)&mask.mask[s2] == 0) {s2 += Vc::Vector<prop_type>::Size; continue;} + if (*(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2] == 0) {s2 += Vc::Vector<prop_type>::Size; continue;} data_il<Vc::Vector<prop_type>::Size> mxm; data_il<Vc::Vector<prop_type>::Size> mxp; @@ -430,7 +432,7 @@ struct conv_impl<3> data_il<Vc::Vector<prop_type>::Size> mzm; data_il<Vc::Vector<prop_type>::Size> mzp; - cross_stencil_v cs; + cross_stencil_v<prop_type> cs; Vc::Vector<prop_type> cmd(&chunk.template get<prop_src>()[s2]); @@ -451,37 +453,21 @@ struct conv_impl<3> long int sumzp = (v == sz::value-1)?offset_jump[5] - (sz::value - 1)*sx::value*sy::value:sx::value*sy::value; sumzp += s2; - if (Vc::Vector<prop_type>::Size == 2) + if (Vc::Vector<prop_type>::Size == 2 || 
Vc::Vector<prop_type>::Size == 4 || Vc::Vector<prop_type>::Size == 8) { - mxm.i = *(short int *)&mask.mask[s2]; + mxm.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2]; mxm.i = mxm.i << 8; - mxm.i |= (short int)mask.mask[sumxm]; + mxm.i |= (typename data_il<Vc::Vector<prop_type>::Size>::type)mask.mask[sumxm]; - mxp.i = *(short int *)&mask.mask[s2]; + mxp.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2]; mxp.i = mxp.i >> 8; - mxp.i |= ((short int)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8; + mxp.i |= ((typename data_il<Vc::Vector<prop_type>::Size>::type)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8; - mym.i = *(short int *)&mask.mask[sumym]; - myp.i = *(short int *)&mask.mask[sumyp]; + mym.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumym]; + myp.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumyp]; - mzm.i = *(short int *)&mask.mask[sumzm]; - mzp.i = *(short int *)&mask.mask[sumzp]; - } - else if (Vc::Vector<prop_type>::Size == 4) - { - mxm.i = *(int *)&mask.mask[s2]; - mxm.i = mxm.i << 8; - mxm.i |= (int)mask.mask[sumxm]; - - mxp.i = *(int *)&mask.mask[s2]; - mxp.i = mxp.i >> 8; - mxp.i |= ((int)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8; - - mym.i = *(int *)&mask.mask[sumym]; - myp.i = *(int *)&mask.mask[sumyp]; - - mzm.i = *(int *)&mask.mask[sumzm]; - mzp.i = *(int *)&mask.mask[sumzp]; + mzm.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumzm]; + mzp.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumzp]; } else { @@ -737,7 +723,7 @@ struct conv_impl<3> for (int k = 0 ; k < sx::value ; k += Vc::Vector<prop_type>::Size) { // we do only id exist the point - if (*(int *)&mask.mask[s2] == 0) {s2 += Vc::Vector<prop_type>::Size; continue;} + if (*(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2] == 0) {s2 += Vc::Vector<prop_type>::Size; continue;} data_il<4> 
mxm; data_il<4> mxp; @@ -746,8 +732,8 @@ struct conv_impl<3> data_il<4> mzm; data_il<4> mzp; - cross_stencil_v cs1; - cross_stencil_v cs2; + cross_stencil_v<prop_type> cs1; + cross_stencil_v<prop_type> cs2; Vc::Vector<prop_type> cmd1(&chunk.template get<prop_src1>()[s2]); Vc::Vector<prop_type> cmd2(&chunk.template get<prop_src2>()[s2]); @@ -769,41 +755,21 @@ struct conv_impl<3> long int sumzp = (v == sz::value-1)?offset_jump[5] - (sz::value - 1)*sx::value*sy::value:sx::value*sy::value; sumzp += s2; - if (Vc::Vector<prop_type>::Size == 2) + if (Vc::Vector<prop_type>::Size == 1 || Vc::Vector<prop_type>::Size == 2 || Vc::Vector<prop_type>::Size == 4 || Vc::Vector<prop_type>::Size == 8) { - mxm.i = *(short int *)&mask.mask[s2]; + mxm.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2]; mxm.i = mxm.i << 8; - mxm.i |= (short int)mask.mask[sumxm]; + mxm.i |= (typename data_il<Vc::Vector<prop_type>::Size>::type)mask.mask[sumxm]; - mxp.i = *(short int *)&mask.mask[s2]; + mxp.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2]; mxp.i = mxp.i >> 8; - mxp.i |= ((short int)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8; + mxp.i |= ((typename data_il<Vc::Vector<prop_type>::Size>::type)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8; - mym.i = *(short int *)&mask.mask[sumym]; - myp.i = *(short int *)&mask.mask[sumyp]; + mym.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumym]; + myp.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumyp]; - mzm.i = *(short int *)&mask.mask[sumzm]; - mzp.i = *(short int *)&mask.mask[sumzp]; - } - else if (Vc::Vector<prop_type>::Size == 4) - { - mxm.i = *(int *)&mask.mask[s2]; - mxm.i = mxm.i << 8; - mxm.i |= (int)mask.mask[sumxm]; - - mxp.i = *(int *)&mask.mask[s2]; - mxp.i = mxp.i >> 8; - mxp.i |= ((int)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8; - - mym.i = *(int *)&mask.mask[sumym]; - myp.i = *(int *)&mask.mask[sumyp]; - 
- mzm.i = *(int *)&mask.mask[sumzm]; - mzp.i = *(int *)&mask.mask[sumzp]; - } - else - { - std::cout << __FILE__ << ":" << __LINE__ << " UNSUPPORTED" << std::endl; + mzm.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumzm]; + mzp.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumzp]; } cs1.xm = cmd1; diff --git a/src/SparseGrid/SparseGrid_iterator_block.hpp b/src/SparseGrid/SparseGrid_iterator_block.hpp index 639215a7f644670d7b2397be9a2c136e00ae42b2..9a37ee99ce68645e76102346f7d167a8541a7a93 100644 --- a/src/SparseGrid/SparseGrid_iterator_block.hpp +++ b/src/SparseGrid/SparseGrid_iterator_block.hpp @@ -8,6 +8,19 @@ #ifndef SPARSEGRID_ITERATOR_BLOCK_HPP_ #define SPARSEGRID_ITERATOR_BLOCK_HPP_ +#if !defined(__NVCC__) || defined(CUDA_ON_CPU) || defined(__HIP__) +// Nvcc does not like VC ... for some reason +#include <Vc/Vc> +#define DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR (Vc::float_v::Size *sizeof(float) >= sizeof(T)) + +#else + +#define DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR 1 + +#endif + + + #include "Grid/iterators/grid_skin_iterator.hpp" #include "SparseGrid_chunk_copy.hpp" @@ -270,7 +283,7 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext> auto & chunk = data.template get<prop>(chunk_id); - copy_xyz<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,false>::template copy<N1>(arr,mask,h,chunk); + copy_xyz<is_layout_inte<typename SparseGridType::memory_traits >::type::value && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,false>::template copy<N1>(arr,mask,h,chunk); } @@ -285,7 +298,7 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext> auto & chunk = data.template get<prop>(chunk_id); - copy_xyz<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename 
vector_blocks_exts::type,false>::template store<N1>(arr,chunk); + copy_xyz<is_layout_inte<typename SparseGridType::memory_traits >::type::value && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,false>::template store<N1>(arr,chunk); } @@ -340,11 +353,11 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext> if (exist == true) { auto & h = header_mask.get(r); - copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<0,stencil_size+sz2::value,N1>(arr,mask,h,data.get(r)); + copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<0,stencil_size+sz2::value,N1>(arr,mask,h,data.get(r)); } else { - copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<stencil_size+sz2::value,N1>(mask); + copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<stencil_size+sz2::value,N1>(mask); } if (findNN == false) { @@ -360,11 +373,11 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext> if (exist == true) { auto & h = header_mask.get(r); - copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<sz2::value - stencil_size,0,N1>(arr,mask,h,data.get(r)); + copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename 
vector_blocks_exts::type,NNType::is_cross>::template copy<sz2::value - stencil_size,0,N1>(arr,mask,h,data.get(r)); } else { - copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<0,N1>(mask); + copy_xy_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<0,N1>(mask); } if (findNN == false) @@ -381,11 +394,11 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext> if (exist == true) { auto & h = header_mask.get(r); - copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<0,stencil_size+sz1::value,N1>(arr,mask,h,data.get(r)); + copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<0,stencil_size+sz1::value,N1>(arr,mask,h,data.get(r)); } else { - copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<stencil_size+sz1::value,N1>(mask); + copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<stencil_size+sz1::value,N1>(mask); } if (findNN == false) { @@ -401,11 +414,11 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext> if (exist == true) { auto & h = header_mask.get(r); - copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename 
vector_blocks_exts::type,NNType::is_cross>::template copy<sz1::value-stencil_size,0,N1>(arr,mask,h,data.get(r)); + copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<sz1::value-stencil_size,0,N1>(arr,mask,h,data.get(r)); } else { - copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<0,N1>(mask); + copy_xz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<0,N1>(mask); } if (findNN == false) @@ -422,11 +435,11 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext> if (exist == true) { auto & h = header_mask.get(r); - copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<0,sz0::value+stencil_size,N1>(arr,mask,h,data.get(r)); + copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<0,sz0::value+stencil_size,N1>(arr,mask,h,data.get(r)); } else { - copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<sz0::value+stencil_size,N1>(mask); + copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<sz0::value+stencil_size,N1>(mask); } if 
(findNN == false) { @@ -442,11 +455,11 @@ struct loadBlock_impl<prop,stencil_size,3,vector_blocks_exts,vector_ext> if (exist == true) { auto & h = header_mask.get(r); - copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<sz0::value-stencil_size,0,N1>(arr,mask,h,data.get(r)); + copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template copy<sz0::value-stencil_size,0,N1>(arr,mask,h,data.get(r)); } else { - copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<0,N1>(mask); + copy_yz_3<is_layout_inte<typename SparseGridType::memory_traits >::type::value && DISABLE_VECTORIZATION_OPTIMIZATION_WHEN_VCDEVEL_IS_SCALAR ,prop,stencil_size,typename vector_blocks_exts::type,NNType::is_cross>::template mask_null<0,N1>(mask); } } }; diff --git a/src/SparseGrid/SparseGrid_unit_tests.cpp b/src/SparseGrid/SparseGrid_unit_tests.cpp index ba7026a91e7fc53cf7c3802cd3cb655a8c8efccd..428de8c109e2aee8cc0aec585348d2cccff3d925 100644 --- a/src/SparseGrid/SparseGrid_unit_tests.cpp +++ b/src/SparseGrid/SparseGrid_unit_tests.cpp @@ -1378,7 +1378,7 @@ BOOST_AUTO_TEST_CASE( sparse_grid_fast_stencil_vectorized_cross_simplified) for (int i = 0 ; i < 1 ; i++) { - grid.conv_cross<0,1,1>(start,stop,[]( Vc::double_v & cmd, cross_stencil_v & s, + grid.conv_cross<0,1,1>(start,stop,[]( Vc::double_v & cmd, cross_stencil_v<double> & s, unsigned char * mask_sum){ Vc::double_v Lap = s.xm + s.xp + @@ -1456,6 +1456,108 @@ BOOST_AUTO_TEST_CASE( sparse_grid_fast_stencil_vectorized_cross_simplified) // print_grid("debug_out",grid); } +BOOST_AUTO_TEST_CASE( sparse_grid_fast_stencil_vectorized_cross_simplified_float) +{ + size_t sz[3] 
= {501,501,501}; + size_t sz_cell[3] = {500,500,500}; + + sgrid_soa<3,aggregate<float,float,int>,HeapMemory> grid(sz); + + grid.getBackgroundValue().template get<0>() = 0.0; + + CellDecomposer_sm<3, float, shift<3,float>> cdsm; + + Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0}); + + cdsm.setDimensions(domain, sz_cell, 0); + + fill_sphere_quad(grid,cdsm); + + //grid.reorder(); + + grid_key_dx<3> start({1,1,1}); + grid_key_dx<3> stop({499,499,499}); + + for (int i = 0 ; i < 1 ; i++) + { + grid.conv_cross<0,1,1>(start,stop,[]( Vc::float_v & cmd, cross_stencil_v<float> & s, + unsigned char * mask_sum){ + + Vc::float_v Lap = s.xm + s.xp + + s.ym + s.yp + + s.zm + s.zp - 6.0f*cmd; + + Vc::Mask<float> surround; + + for (int i = 0 ; i < Vc::float_v::Size ; i++) + {surround[i] = (mask_sum[i] == 6);} + + Lap = Vc::iif(surround,Lap,Vc::float_v(1.0f)); + + return Lap; + }); + } + + int tot_six = 0; + int tot_one = 0; + + bool check = true; + auto it2 = grid.getIterator(start,stop); + while (it2.isNext()) + { + auto p = it2.get(); + + check &= (grid.template get<1>(p) == 6 || grid.template get<1>(p) == 1); + + // Check the six should be a six + auto xp = p.move(0,1); + auto xm = p.move(0,-1); + + auto yp = p.move(1,1); + auto ym = p.move(1,-1); + + auto zp = p.move(2,1); + auto zm = p.move(2,-1); + + bool is_six; + if (grid.existPoint(xp) && grid.existPoint(xm) && + grid.existPoint(yp) && grid.existPoint(ym) && + grid.existPoint(zp) && grid.existPoint(zm)) + { + is_six = true; + } + else + { + is_six = false; + } + + if (is_six == true && grid.template get<1>(p) != 6.0) + { + check = false; + break; + } + + if (grid.template get<1>(p) == 1) + { + tot_one++; + } + + if (grid.template get<1>(p) == 6) + { + tot_six++; + } + + ++it2; + } + + BOOST_REQUIRE_EQUAL(check,true); + BOOST_REQUIRE_EQUAL(tot_six,15857813); + BOOST_REQUIRE_EQUAL(tot_one,2977262); + // Check correct-ness + +// print_grid("debug_out",grid); +} + constexpr int x = 0; constexpr int y = 1; constexpr int z = 2; 
diff --git a/src/SparseGridGpu/BlockMapGpu.hpp b/src/SparseGridGpu/BlockMapGpu.hpp index cc336f58d191177428256fe331d5535c10c053dd..92a7f651c7cba96c9631fadc954b56d14e0c7aac 100644 --- a/src/SparseGridGpu/BlockMapGpu.hpp +++ b/src/SparseGridGpu/BlockMapGpu.hpp @@ -7,17 +7,6 @@ #include "DataBlock.cuh" #include <set> -template<typename BlockT, typename T> -struct AggregateAppend -{ -}; - -template<typename BlockT, typename ... list> -struct AggregateAppend<BlockT, aggregate<list ...>> -{ - typedef aggregate<list..., BlockT> type; -}; - template<typename AggregateT, unsigned int p> using BlockTypeOf = typename std::remove_reference<typename boost::fusion::result_of::at_c<typename AggregateT::type, p>::type>::type; @@ -29,6 +18,8 @@ class BlockMapGpu { private: typedef BlockTypeOf<AggregateBlockT, 0> BlockT0; + + bool is_new; #ifdef SE_CLASS1 @@ -119,13 +110,19 @@ public: * */ template<unsigned int p> - auto insertBlockFlush(size_t blockId) -> decltype(blockMap.insertFlush(blockId).template get<p>()) + auto insertBlockFlush(size_t blockId) -> decltype(blockMap.insertFlush(blockId,is_new).template get<p>()) { typedef BlockTypeOf<AggregateBlockT, p> BlockT; - auto aggregate = blockMap.insertFlush(blockId); - auto &block = aggregate.template get<p>(); - + auto aggregate = blockMap.insertFlush(blockId,is_new); + auto &block = aggregate.template get<p>(); + + if (is_new == true) + { + for (int i = 0 ; i < BlockT::size ; i++) + {aggregate.template get<pMask>()[i] = 0;} + } + return block; } @@ -136,9 +133,18 @@ public: * \return a reference to the block data * */ - auto insertBlockFlush(size_t blockId) -> decltype(blockMap.insertFlush(blockId)) + auto insertBlockFlush(size_t blockId) -> decltype(blockMap.insertFlush(blockId,is_new)) { - return blockMap.insertFlush(blockId); + typedef BlockTypeOf<AggregateBlockT, 0> BlockT; + auto b = blockMap.insertFlush(blockId,is_new); + + if (is_new == true) + { + for (int i = 0 ; i < BlockT::size ; i++) + {b.template get<pMask>()[i] = 0;} 
+ } + + return b; } BlockMapGpu_ker<AggregateInternalT, indexT, layout_base> toKernel() diff --git a/src/SparseGridGpu/BlockMapGpu_ker.cuh b/src/SparseGridGpu/BlockMapGpu_ker.cuh index 1ef24069d83e0c97d27aa7c304cf36539552213c..b99de92fedc4c4386019d4db105624feb05e4ce2 100644 --- a/src/SparseGridGpu/BlockMapGpu_ker.cuh +++ b/src/SparseGridGpu/BlockMapGpu_ker.cuh @@ -103,18 +103,70 @@ public: : blockMap(blockMap) {}; template<unsigned int p> - inline __device__ auto get(unsigned int linId) const -> ScalarTypeOf<AggregateBlockT, p>; + inline __device__ auto get(unsigned int linId) const -> ScalarTypeOf<AggregateBlockT, p> + { + #ifdef __NVCC__ + typedef BlockTypeOf<AggregateBlockT, p> BlockT; + unsigned int blockId = linId / BlockT::size; + unsigned int offset = linId % BlockT::size; + return get<p>(blockId, offset); + #else // __NVCC__ + std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl; + #endif // __NVCC__ + } + template<unsigned int p> - inline __device__ auto get(unsigned int blockId, unsigned int offset) const -> ScalarTypeOf<AggregateBlockT, p>; + inline __device__ auto get(unsigned int blockId, unsigned int offset) const -> ScalarTypeOf<AggregateBlockT, p> + { + #ifdef __NVCC__ + + const auto sid = blockMap.get_sparse(blockId); + const auto & block = blockMap.template get_ele<p>(sid.id)[offset]; + const auto mask = blockMap.template get_ele<pMask>(sid.id)[offset]; + // Now check if the element actually exists + return exist(mask) + ? 
block + : blockMap.template getBackground<p>()[offset]; + + #else // __NVCC__ + std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl; + #endif // __NVCC__ + } - inline __device__ auto getBlock(unsigned int blockId) -> decltype(blockMap.get(0)); + inline __device__ auto getBlock(unsigned int blockId) -> decltype(blockMap.get(0)) + { + #ifdef __NVCC__ + return blockMap.get(blockId); + #else // __NVCC__ + std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl; + #endif // __NVCC__ + } template<unsigned int p> - inline __device__ ScalarTypeOf<AggregateBlockT, p> & getReference(unsigned int linId); + inline __device__ ScalarTypeOf<AggregateBlockT, p> & getReference(unsigned int linId) + { + // Only call this if you are TOTALLY SURE the element exists! Otherwise KABOOOOOM! :D + #ifdef __NVCC__ + typedef BlockTypeOf<AggregateBlockT, p> BlockT; + unsigned int blockId = linId / BlockT::size; + unsigned int offset = linId % BlockT::size; + return getReference<p>(blockId, offset); + #else // __NVCC__ + std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl; + #endif // __NVCC__ + } template<unsigned int p> - inline __device__ ScalarTypeOf<AggregateBlockT, p> & getReference(unsigned int blockId, unsigned int offset); + inline __device__ ScalarTypeOf<AggregateBlockT, p> & getReference(unsigned int blockId, unsigned int offset) + { + // Only call this if you are TOTALLY SURE the element exists! Otherwise KABOOOOOM! 
:D + #ifdef __NVCC__ + return blockMap.template get<p>(blockId)[offset]; + #else // __NVCC__ + std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl; + #endif // __NVCC__ + } template<unsigned int p> inline __device__ auto insert(unsigned int linId) -> ScalarTypeOf<AggregateBlockT, p>& @@ -130,7 +182,19 @@ public: } template<unsigned int p> - inline __device__ auto insert(unsigned int blockId, unsigned int offset) -> ScalarTypeOf<AggregateBlockT, p>&; + inline __device__ auto insert(unsigned int blockId, unsigned int offset) -> ScalarTypeOf<AggregateBlockT, p>& + { + #ifdef __NVCC__ + auto aggregate = blockMap.insert(blockId); + auto &block = aggregate.template get<p>(); + auto &mask = aggregate.template get<pMask>(); + setExist(mask[offset]); + + return block[offset]; + #else // __NVCC__ + std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl; + #endif // __NVCC__ + } template<unsigned int nChunksPerBlocks = 1> inline __device__ auto insertBlock(indexT blockId, unsigned int stride = 8192) -> decltype(blockMap.insert(0)) @@ -313,108 +377,7 @@ public: }; -template<typename AggregateBlockT, typename indexT, template<typename> class layout_base> -template<unsigned int p> -inline __device__ auto BlockMapGpu_ker<AggregateBlockT, indexT, layout_base> -::get(unsigned int linId) const -> ScalarTypeOf<AggregateBlockT, p> -{ -#ifdef __NVCC__ - typedef BlockTypeOf<AggregateBlockT, p> BlockT; - unsigned int blockId = linId / BlockT::size; - unsigned int offset = linId % BlockT::size; - return get<p>(blockId, offset); -#else // __NVCC__ - std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl; -#endif // __NVCC__ -} - -template<typename AggregateBlockT, typename indexT, template<typename> class 
layout_base> -template<unsigned int p> -inline __device__ auto BlockMapGpu_ker<AggregateBlockT, indexT, layout_base> -::get(unsigned int blockId, unsigned int offset) const -> ScalarTypeOf<AggregateBlockT, p> -{ -#ifdef __NVCC__ -// const auto & aggregate = blockMap.get(blockId); -// const auto & block = aggregate.template get<p>(); -// const auto & mask = aggregate.template get<pMask>(); -// // Now check if the element actually exists -// return exist(mask[offset]) -// ? block[offset] -// : blockMap.template getBackground<p>()[offset]; -//// return blockMap.template get<p>(blockId)[offset]; - - const auto sid = blockMap.get_sparse(blockId); - const auto & block = blockMap.template get_ele<p>(sid.id)[offset]; - const auto mask = blockMap.template get_ele<pMask>(sid.id)[offset]; - // Now check if the element actually exists - return exist(mask) - ? block - : blockMap.template getBackground<p>()[offset]; - -#else // __NVCC__ - std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl; -#endif // __NVCC__ -} - - -template<typename AggregateBlockT, typename indexT, template<typename> class layout_base> -inline __device__ auto BlockMapGpu_ker<AggregateBlockT, indexT, layout_base> -::getBlock(unsigned int blockId) -> decltype(blockMap.get(0)) -{ -#ifdef __NVCC__ - return blockMap.get(blockId); -#else // __NVCC__ - std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl; -#endif // __NVCC__ -} - -template<typename AggregateBlockT, typename indexT, template<typename> class layout_base> -template<unsigned int p> -inline __device__ ScalarTypeOf<AggregateBlockT, p> & BlockMapGpu_ker<AggregateBlockT, indexT, layout_base> -::getReference(const unsigned int linId) -{ - // Only call this if you are TOTALLY SURE the element exists! Otherwise KABOOOOOM! 
:D -#ifdef __NVCC__ - typedef BlockTypeOf<AggregateBlockT, p> BlockT; - unsigned int blockId = linId / BlockT::size; - unsigned int offset = linId % BlockT::size; - return getReference<p>(blockId, offset); -#else // __NVCC__ - std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl; -#endif // __NVCC__ -} - -template<typename AggregateBlockT, typename indexT, template<typename> class layout_base> -template<unsigned int p> -inline __device__ ScalarTypeOf<AggregateBlockT, p> & BlockMapGpu_ker<AggregateBlockT, indexT, layout_base> -::getReference(const unsigned int blockId, const unsigned int offset) -{ - // Only call this if you are TOTALLY SURE the element exists! Otherwise KABOOOOOM! :D -#ifdef __NVCC__ - return blockMap.template get<p>(blockId)[offset]; -#else // __NVCC__ - std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl; -#endif // __NVCC__ -} - - -template<typename AggregateBlockT, typename indexT, template<typename> class layout_base> -template<unsigned int p> -inline __device__ auto BlockMapGpu_ker<AggregateBlockT, indexT, layout_base> -::insert(unsigned int blockId, unsigned int offset) -> ScalarTypeOf<AggregateBlockT, p>& -{ -#ifdef __NVCC__ - auto aggregate = blockMap.insert(blockId); - auto &block = aggregate.template get<p>(); - auto &mask = aggregate.template get<pMask>(); - setExist(mask[offset]); - - return block[offset]; -#else // __NVCC__ - std::cout << __FILE__ << ":" << __LINE__ << " error: you are supposed to compile this file with nvcc, if you want to use it with gpu" << std::endl; -#endif // __NVCC__ -} #endif /* BLOCK_MAP_GPU_KER_CUH_ */ diff --git a/src/SparseGridGpu/SparseGridGpu.hpp b/src/SparseGridGpu/SparseGridGpu.hpp index c7b69fd7cddd8ee0f084fd389271c03e0b1dfd5e..7d3199361e2efa94d60bbc657acc47ae81631055 100644 --- a/src/SparseGridGpu/SparseGridGpu.hpp 
+++ b/src/SparseGridGpu/SparseGridGpu.hpp @@ -2416,6 +2416,46 @@ public: applyStencils< SparseGridGpuKernels::stencil_cross_func_conv<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...); } + /*! \brief Apply a free type convolution using blocks + * + * + */ + template<unsigned int prop_src, unsigned int prop_dst, unsigned int stencil_size, typename lambda_f, typename ... ArgsT > + void conv_cross_b(grid_key_dx<3> start, grid_key_dx<3> stop , lambda_f func, ArgsT ... args) + { + Box<dim,int> box; + + for (int i = 0 ; i < dim ; i++) + { + box.setLow(i,start.get(i)); + box.setHigh(i,stop.get(i)); + } + + constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value; + + applyStencils< SparseGridGpuKernels::stencil_cross_func_conv_block_read<dim,nLoop,prop_src,prop_dst,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...); + } + + /*! \brief Apply a free type convolution using blocks + * + * + */ + template<unsigned int prop_src1, unsigned int prop_src2, unsigned int prop_dst1 , unsigned int prop_dst2, unsigned int stencil_size, typename lambda_f, typename ... ArgsT > + void conv2_b(grid_key_dx<dim> start, grid_key_dx<dim> stop , lambda_f func, ArgsT ... args) + { + Box<dim,int> box; + + for (int i = 0 ; i < dim ; i++) + { + box.setLow(i,start.get(i)); + box.setHigh(i,stop.get(i)); + } + + constexpr unsigned int nLoop = UIntDivCeil<(IntPow<blockEdgeSize + 2, dim>::value), (blockSize)>::value; + + applyStencils< SparseGridGpuKernels::stencil_func_conv2_b<dim,nLoop,prop_src1,prop_src2,prop_dst1,prop_dst2,stencil_size> >(box,STENCIL_MODE_INPLACE,func, args ...); + } + /*! \brief Apply a free type convolution using blocks * * @@ -2520,7 +2560,33 @@ public: { return gridGeometry.BlockLinId(blockCoord); } + + /*! 
\brief Insert the point on host side and flush directly + * + * First you have to move everything on host with deviceToHost, insertFlush and than move to GPU again + * + * \param grid point where to insert + * + * \return a reference to the data to fill + * + * + */ + template<unsigned int p> + auto insertFlush(const sparse_grid_gpu_index<self> &coord) -> ScalarTypeOf<AggregateBlockT, p> & + { + auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer(); + + indexT block_id = indexBuffer.template get<0>(coord.get_cnk_pos_id()); + indexT local_id = coord.get_data_id(); + + typedef BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base> BMG; + auto block_data = this->insertBlockFlush(block_id); + block_data.template get<BMG::pMask>()[local_id] = 1; + + return block_data.template get<p>()[local_id]; + } + /*! \brief Insert the point on host side and flush directly * * First you have to move everything on host with deviceToHost, insertFlush and than move to GPU again diff --git a/src/SparseGridGpu/SparseGridGpu_kernels.cuh b/src/SparseGridGpu/SparseGridGpu_kernels.cuh index 33edebbf6987a2a555eda1053903086e3ced7739..0a44dc4d813f58ebe8046048d1baceb2b763176e 100644 --- a/src/SparseGridGpu/SparseGridGpu_kernels.cuh +++ b/src/SparseGridGpu/SparseGridGpu_kernels.cuh @@ -117,6 +117,17 @@ namespace SparseGridGpuKernels template<> struct stencil_conv_func_impl<3> { + template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT> + __device__ static inline void stencil_block(ScalarT & res, coordType & coord , + CpBlockType & cpb, + DataBlockWrapperT & DataBlockLoad, + int offset, + lambda_func f, + ArgsT ... args) + { + res = f(cpb,DataBlockLoad,offset,coord[0],coord[1],coord[2]); + } + template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... 
ArgsT> __device__ static inline void stencil(ScalarT & res, coordType & coord , CpBlockType & cpb, @@ -135,11 +146,34 @@ namespace SparseGridGpuKernels { f(res1,res2,cpb1,cpb2,coord[0],coord[1],coord[2]); } + + template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT> + __device__ static inline void stencil2_block(ScalarT & res1, ScalarT & res2, coordType & coord , + CpBlockType & cpb1, + CpBlockType & cpb2, + DataBlockWrapperT & DataBlockLoad, + int offset, + lambda_func f, + ArgsT ... args) + { + f(res1,res2,cpb1,cpb2,DataBlockLoad,offset,coord[0],coord[1],coord[2]); + } }; template<> struct stencil_conv_func_impl<2> { + template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT> + __device__ static inline void stencil_block(ScalarT & res, coordType & coord, + CpBlockType & cpb, + DataBlockWrapperT & DataBlockLoad, + int offset, + lambda_func f, + ArgsT ... args) + { + res = f(cpb,DataBlockLoad,offset,coord[0],coord[1]); + } + template<typename ScalarT, typename coordType, typename CpBlockType, typename lambda_func, typename ... ArgsT> __device__ static inline void stencil(ScalarT & res, coordType & coord , CpBlockType & cpb, @@ -158,6 +192,18 @@ namespace SparseGridGpuKernels { f(res1,res2,cpb1,cpb2,coord[0],coord[1]); } + + template<typename ScalarT, typename coordType, typename CpBlockType, typename DataBlockWrapperT, typename lambda_func, typename ... ArgsT> + __device__ static inline void stencil2_block(ScalarT & res1, ScalarT & res2, coordType & coord , + CpBlockType & cpb1, + CpBlockType & cpb2, + DataBlockWrapperT & DataBlockLoad, + int offset, + lambda_func f, + ArgsT ... 
args) + { + f(res1,res2,cpb1,cpb2,DataBlockLoad,offset,coord[0],coord[1]); + } }; template<unsigned int dim, unsigned int n_loop, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size> @@ -233,6 +279,151 @@ namespace SparseGridGpuKernels } }; + + template<unsigned int dim, unsigned int n_loop, unsigned int p_src, unsigned int p_dst, unsigned int stencil_size> + struct stencil_cross_func_conv_block_read + { + typedef NNStar<dim> stencil_type; + + static constexpr unsigned int supportRadius = stencil_size; + + template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT> + static inline __device__ void stencil( + SparseGridT & sparseGrid, + const unsigned int dataBlockId, + openfpm::sparse_index<unsigned int> dataBlockIdPos, + unsigned int offset, + grid_key_dx<dim, int> & pointCoord, + DataBlockWrapperT & dataBlockLoad, + DataBlockWrapperT & dataBlockStore, + unsigned char curMask, + lambda_func f, + ArgT ... args) + { + typedef typename SparseGridT::AggregateBlockType AggregateT; + typedef ScalarTypeOf<AggregateT, p_src> ScalarT; + + constexpr unsigned int enlargedBlockSize = IntPow< + SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value; + + __shared__ ScalarT enlargedBlock[enlargedBlockSize]; + + for (int i = 0; i < n_loop ; i++) + { + if (i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x < enlargedBlockSize) + { + enlargedBlock[i*IntPow<SparseGridT::getBlockEdgeSize(), dim>::value + threadIdx.x] = sparseGrid.getblockMap().template getBackground<p_src>()[0]; + } + } + + __syncthreads(); + + typedef typename vmpl_create_constant<dim,SparseGridT::blockEdgeSize_>::type block_sizes; + typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes; + + cp_block<ScalarT,stencil_size,vmpl_sizes,dim> cpb(enlargedBlock); + + sparseGrid.template loadGhostBlock<p_src>(dataBlockLoad, dataBlockIdPos, enlargedBlock); + + __syncthreads(); + + ScalarT res = 0; + + if ((curMask & 
mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING)) + { + int coord[dim]; + + unsigned int linIdTmp = offset; + for (unsigned int d = 0; d < dim; ++d) + { + coord[d] = linIdTmp % SparseGridT::blockEdgeSize_; + linIdTmp /= SparseGridT::blockEdgeSize_; + } + + stencil_conv_func_impl<dim>::stencil_block(res,coord,cpb,dataBlockLoad,offset,f,args...); + + dataBlockStore.template get<p_dst>()[offset] = res; + } + } + + template <typename SparseGridT, typename CtxT> + static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx) + { + // No flush + } + }; + + template<unsigned int dim, unsigned int n_loop, unsigned int p_src1, unsigned int p_src2, unsigned int p_dst1, unsigned int p_dst2, unsigned int stencil_size> + struct stencil_func_conv2_b + { + typedef NNStar<dim> stencil_type; + + static constexpr unsigned int supportRadius = stencil_size; + + template<typename SparseGridT, typename DataBlockWrapperT, typename lambda_func, typename ... ArgT> + static inline __device__ void stencil( + SparseGridT & sparseGrid, + const unsigned int dataBlockId, + openfpm::sparse_index<unsigned int> dataBlockIdPos, + unsigned int offset, + grid_key_dx<dim, int> & pointCoord, + DataBlockWrapperT & dataBlockLoad, + DataBlockWrapperT & dataBlockStore, + unsigned char curMask, + lambda_func f, + ArgT ... 
args) + { + typedef typename SparseGridT::AggregateBlockType AggregateT; + typedef ScalarTypeOf<AggregateT, p_src1> ScalarT1; + typedef ScalarTypeOf<AggregateT, p_src1> ScalarT2; + + constexpr unsigned int enlargedBlockSize = IntPow< + SparseGridT::getBlockEdgeSize() + 2 * supportRadius, dim>::value; + + __shared__ ScalarT1 enlargedBlock1[enlargedBlockSize]; + __shared__ ScalarT2 enlargedBlock2[enlargedBlockSize]; + + // fill with background + + typedef typename vmpl_create_constant<dim,SparseGridT::blockEdgeSize_>::type block_sizes; + typedef typename vmpl_sum_constant<2*stencil_size,block_sizes>::type vmpl_sizes; + + cp_block<ScalarT1,stencil_size,vmpl_sizes,dim> cpb1(enlargedBlock1); + cp_block<ScalarT2,stencil_size,vmpl_sizes,dim> cpb2(enlargedBlock2); + + sparseGrid.template loadGhostBlock<p_src1>(dataBlockLoad, dataBlockIdPos, enlargedBlock1); + sparseGrid.template loadGhostBlock<p_src2>(dataBlockLoad, dataBlockIdPos, enlargedBlock2); + + __syncthreads(); + + ScalarT1 res1 = 0; + ScalarT2 res2 = 0; + + if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING)) + { + int coord[dim]; + + unsigned int linIdTmp = offset; + for (unsigned int d = 0; d < dim; ++d) + { + coord[d] = linIdTmp % SparseGridT::blockEdgeSize_; + linIdTmp /= SparseGridT::blockEdgeSize_; + } + + stencil_conv_func_impl<dim>::stencil2_block(res1,res2,coord,cpb1,cpb2,dataBlockLoad,offset,f,args...); + + dataBlockStore.template get<p_dst1>()[offset] = res1; + dataBlockStore.template get<p_dst2>()[offset] = res2; + } + } + + template <typename SparseGridT, typename CtxT> + static inline void __host__ flush(SparseGridT & sparseGrid, CtxT & ctx) + { + // No flush + } + }; + template<unsigned int dim, unsigned int n_loop, unsigned int p_src1, unsigned int p_src2, unsigned int p_dst1, unsigned int p_dst2, unsigned int stencil_size> struct stencil_func_conv2 { @@ -1076,7 +1267,7 @@ namespace SparseGridGpuKernels #ifdef SE_CLASS1 if (numCnt > blockDim.x) - {printf("Error 
calc_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,blockDim.x);} + {printf("Error calc_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,(int)blockDim.x);} #endif @@ -1164,7 +1355,7 @@ namespace SparseGridGpuKernels #ifdef SE_CLASS1 if (numCnt > blockDim.x) - {printf("Error get_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,blockDim.x);} + {printf("Error get_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,(int)blockDim.x);} #endif diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu index 14cc99869eda16d3e0a8cfe51acd931d50d214cb..abec6ef5b6b96a058e61d47b8d2e285065d9381b 100644 --- a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu +++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu @@ -82,7 +82,6 @@ void testStencilHeatSparse_perf(unsigned int i, std::string base, float fillMult sparseGrid.template applyStencils<BoundaryStencilSetXRescaled<dim,0,0>>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, centerPoint, centerPoint + 2*blockEdgeSize*gridEdgeSize, 0.0, 10.0); - cudaDeviceSynchronize(); iterations /= 2; for (unsigned int iter=0; iter<iterations; ++iter) diff --git a/src/SparseGridGpu/performance/performancePlots.cpp b/src/SparseGridGpu/performance/performancePlots.cpp index 6fd461b4c50edba13c92f02b801f439c1a43b78f..01a7092441de8e3d784704bc12319c10266594d3 100644 --- a/src/SparseGridGpu/performance/performancePlots.cpp +++ b/src/SparseGridGpu/performance/performancePlots.cpp @@ -59,7 +59,7 @@ void write_test_report(report_sparse_grid_tests &report_sparsegrid_funcs, std::s StandardXMLPerformanceGraph(file_xml_results, file_xml_ref, cg, 1); - addUpdtateTime(cg,1); + addUpdateTime(cg,1,"data","SparseGridGpu_performance"); cg.write("SparseGridGpu_performance.html"); } 
diff --git a/src/Vector/cuda/map_vector_cuda_funcs_tests.cu b/src/Vector/cuda/map_vector_cuda_funcs_tests.cu index 907d29c60c382f669f12492eeef9398d57d9cb5b..01aec66f7b07f25262493ab9614b500315106e9a 100644 --- a/src/Vector/cuda/map_vector_cuda_funcs_tests.cu +++ b/src/Vector/cuda/map_vector_cuda_funcs_tests.cu @@ -8,7 +8,7 @@ #define BOOST_GPU_ENABLED __host__ __device__ -#include "util/cudify/cudify.hpp" +#include "util/cuda_launch.hpp" #include "config.h" #define BOOST_TEST_DYN_LINK diff --git a/src/Vector/cuda/map_vector_cuda_ker.cuh b/src/Vector/cuda/map_vector_cuda_ker.cuh index d6ebfd8d68ff2fe1ba444388954c91add026ce61..f1545afe093f6b805a76c2040be12524d54b87b5 100644 --- a/src/Vector/cuda/map_vector_cuda_ker.cuh +++ b/src/Vector/cuda/map_vector_cuda_ker.cuh @@ -257,7 +257,7 @@ namespace openfpm * */ - inline __device__ __host__ auto get_o(unsigned int id) const -> decltype(base.get_o(id)) + inline __device__ __host__ auto get_o(unsigned int id) const -> decltype(base.get_o(grid_key_dx<1>(id))) { #ifdef SE_CLASS1 if (check_bound(id) == false) @@ -281,7 +281,7 @@ namespace openfpm * */ - inline __device__ __host__ auto get_o(unsigned int id) -> decltype(base.get_o(id)) + inline __device__ __host__ auto get_o(unsigned int id) -> decltype(base.get_o(grid_key_dx<1>(id))) { #ifdef SE_CLASS1 if (check_bound(id) == false) @@ -298,7 +298,7 @@ namespace openfpm * \return the last element (encapsulated) * */ - inline auto last() const -> decltype(base.get_o(0)) + inline auto last() const -> decltype(base.get_o(grid_key_dx<1>(0))) { grid_key_dx<1> key(size()-1); @@ -333,7 +333,7 @@ namespace openfpm * \return the element (encapsulated) * */ - inline auto last() -> decltype(base.get_o(0)) + inline auto last() -> decltype(base.get_o(grid_key_dx<1>(0))) { grid_key_dx<1> key(size()-1); @@ -453,7 +453,7 @@ namespace openfpm * * */ - __host__ ite_gpu<1> getGPUIterator(size_t n_thr = 1024) const + __host__ ite_gpu<1> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const 
{ grid_key_dx<1> start(0); grid_key_dx<1> stop(size()-1); @@ -465,7 +465,7 @@ namespace openfpm * * */ - ite_gpu<1> getGPUIteratorTo(size_t stop, size_t n_thr = 1024) const + ite_gpu<1> getGPUIteratorTo(size_t stop, size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<1> start(0); grid_key_dx<1> stop_(stop); @@ -478,7 +478,7 @@ namespace openfpm * \param object to copy * */ - vector_gpu_ker<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v) + __host__ vector_gpu_ker<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v) { v_size = v.v_size; base = v.base; diff --git a/src/Vector/cuda/map_vector_sparse_cuda_ker.cuh b/src/Vector/cuda/map_vector_sparse_cuda_ker.cuh index b3b7fceae6b09f6ff46f4f8f84e3c18fded376b0..b33a093fbad5460860cdc6fc160dedacb339a50b 100644 --- a/src/Vector/cuda/map_vector_sparse_cuda_ker.cuh +++ b/src/Vector/cuda/map_vector_sparse_cuda_ker.cuh @@ -317,7 +317,9 @@ namespace openfpm vct_add_index.template get<0>(slot_base*nslot_add+pos) = ele; return vct_add_data.template get<p>(slot_base*nslot_add+pos); #else - std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl; + + printf("vector_sparse_gpu_ker.insert[1]: Error, this function in order to work is supposed to be compiled with nvcc\n"); + #endif } @@ -339,7 +341,7 @@ namespace openfpm vct_rem_index.template get<0>(slot_base*nslot_rem+pos) = ele; #else - std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl; + printf("vector_sparse_gpu_ker.remove: Error, this function in order to work is supposed to be compiled with nvcc\n"); #endif } @@ -362,7 +364,7 @@ namespace openfpm return vct_add_data.get(slot_base*nslot_add+pos); #else - std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl; + 
printf("vector_sparse_gpu_ker.insert[2]: Error, this function in order to work is supposed to be compiled with nvcc\n"); #endif } @@ -378,7 +380,7 @@ namespace openfpm vct_rem_index.template get<0>(slot_base*nslot_rem+pos) = ele; #else - std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl; + printf("vector_sparse_gpu_ker.remove_b: Error, this function in order to work is supposed to be compiled with nvcc\n"); #endif } @@ -395,7 +397,7 @@ namespace openfpm vct_add_index.template get<0>(slot_base*nslot_add+pos) = ele; return vct_add_data.template get<p>(slot_base*nslot_add+pos); #else - std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl; + printf("vector_sparse_gpu_ker.insert_b: Error, this function in order to work is supposed to be compiled with nvcc\n"); #endif } @@ -411,7 +413,7 @@ namespace openfpm vct_add_index.template get<0>(slot_base*nslot_add+pos) = ele; return vct_add_data.get(slot_base*nslot_add+pos); #else - std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl; + printf("vector_sparse_gpu_ker.insert_b: Error, this function in order to work is supposed to be compiled with nvcc\n"); #endif } @@ -432,7 +434,7 @@ namespace openfpm } #else - std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl; + printf("vector_sparse_gpu_ker.flush_block_insert: Error, this function in order to work is supposed to be compiled with nvcc\n"); #endif } @@ -453,7 +455,7 @@ namespace openfpm } #else - std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl; + printf("vector_sparse_gpu_ker.flush_block_remove: Error, this function in order to work is supposed to be compiled with 
nvcc\n"); #endif } @@ -477,7 +479,7 @@ namespace openfpm #else - std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl; + printf("vector_sparse_gpu_ker.flush_block_insert: Error, this function in order to work is supposed to be compiled with nvcc\n"); #endif } @@ -500,7 +502,7 @@ namespace openfpm {vct_nrem_index.template get<0>(b) = vct_atomic_rem;} #else - std::cout << __FILE__ << ":" << __LINE__ << " Error, this function in order to work is supposed to be compiled with nvcc" << std::endl; + printf("vector_sparse_gpu_ker.flush_block_remove: Error, this function in order to work is supposed to be compiled with nvcc\n"); #endif } diff --git a/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu b/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu index c3697d4208260b8eb283e7ce607b0272e09d7284..6a9a1537154b66558328d4973402883f45e8807d 100644 --- a/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu +++ b/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu @@ -299,6 +299,12 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_incremental_add ) match &= vs.template get<0>(9000 - 2*i) == 3*(2*i + 3000); match &= vs.template get<1>(9000 - 2*i) == 2*i + 13000; match &= vs.template get<2>(9000 - 2*i) == 2*i + 23000; + + if (match == false) + { + std::cout << i << " " << vs.template get<0>(9000 - 2*i) << "!=" << 3*(2*i + 3000) << " " << vs.template get<1>(9000 - 2*i) << "!=" << 2*i + 13000 << " " << vs.template get<2>(9000 - 2*i) << "!=" << 2*i + 23000 << std::endl; + break; + } } for (size_t i = 0 ; i < 500 ; i++) @@ -306,6 +312,12 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_incremental_add ) match &= vs.template get<0>(9000 - 2*i) == 3*(2*i + 3000) + 2*i + 1100; match &= vs.template get<1>(9000 - 2*i) == 2*i + 11100; match &= vs.template get<2>(9000 - 2*i) == 2*i + 23000; + + if (match == false) + { + std::cout << i << " " << vs.template get<0>(9000 - 2*i) << "!=" << 
3*(2*i + 3000) << " " << vs.template get<1>(9000 - 2*i) << "!=" << 2*i + 13000 << " " << vs.template get<2>(9000 - 2*i) << "!=" << 2*i + 23000 << std::endl; + break; + } } for (size_t i = 0 ; i < 500 ; i++) @@ -313,6 +325,12 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_incremental_add ) match &= vs.template get<0>(10000 - 2*i) == 2*i + 100; match &= vs.template get<1>(10000 - 2*i) == 2*i + 10100; match &= vs.template get<2>(10000 - 2*i) == 2*i + 20100; + + if (match == false) + { + std::cout << i << " " << vs.template get<0>(9000 - 2*i) << "!=" << 3*(2*i + 3000) << " " << vs.template get<1>(9000 - 2*i) << "!=" << 2*i + 13000 << " " << vs.template get<2>(9000 - 2*i) << "!=" << 2*i + 23000 << std::endl; + break; + } } BOOST_REQUIRE_EQUAL(match,true); diff --git a/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh b/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh index 28d3963ca7d116ecbb65c4a165335a1266f2cba0..33b0a68a390c974565afe1e3fbe4d5dcd2b07c1f 100644 --- a/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh +++ b/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh @@ -10,14 +10,15 @@ #ifdef __NVCC__ +#include "config.h" + #if CUDART_VERSION < 11000 #include "util/cuda/cub_old/util_type.cuh" #include "util/cuda/cub_old/block/block_scan.cuh" #include "util/cuda/moderngpu/operators.hxx" +#include "util/cuda_launch.hpp" #else - #if !defined(CUDA_ON_CPU) - #include "cub/util_type.cuh" - #include "cub/block/block_scan.cuh" + #if !defined(CUDA_ON_CPU) #include "util/cuda/moderngpu/operators.hxx" #endif #endif diff --git a/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu b/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu index 6370fbc752a0f49c62660d2290d218f4c2d2ffcb..0dd883bcf5f52f943b588a893afa9a75412f3cc6 100644 --- a/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu +++ b/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu @@ -3,9 +3,7 @@ #include <Vector/map_vector.hpp> #include 
<Vector/cuda/map_vector_sparse_cuda_kernels.cuh> #include "util/cuda/scan_ofp.cuh" -#ifndef CUDA_ON_CPU -#include "util/cuda/moderngpu/kernel_merge.hxx" -#endif +#include "util/cuda/merge_ofp.cuh" BOOST_AUTO_TEST_SUITE( vector_sparse_cuda_kernels ) @@ -176,7 +174,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_merge_use ) vct_index_old.template hostToDevice<0,1>(); vct_add_index.template hostToDevice<0,1>(); - mgpu::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(), + openfpm::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(), (int *)vct_add_index.template getDeviceBuffer<0>(),(int *)vct_add_index.template getDeviceBuffer<1>(),vct_add_index.size(), (int *)vct_index.template getDeviceBuffer<0>(),(int *)vct_index.template getDeviceBuffer<1>(),mgpu::less_t<int>(),ctx); @@ -272,7 +270,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_solve_conflicts_use ) vct_data_old.template hostToDevice<0,1,2>(); vct_add_data.template hostToDevice<0,1,2>(); - mgpu::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(), + openfpm::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(), (int *)vct_add_index.template getDeviceBuffer<0>(),(int *)vct_add_index.template getDeviceBuffer<1>(),vct_add_index.size(), (int *)vct_index.template getDeviceBuffer<0>(),(int *)merge_indexes.template getDeviceBuffer<0>(),mgpu::less_t<int>(),ctx); diff --git a/src/Vector/map_vector.hpp b/src/Vector/map_vector.hpp index 1118c76ebdd3cc9537ab5af87c4c4d72efb2dfea..cf0c6ef0ef00dad84aac7be7868d52a4992a4f8a 100644 --- a/src/Vector/map_vector.hpp +++ b/src/Vector/map_vector.hpp @@ -44,6 +44,50 @@ namespace openfpm { + template<bool active> + struct copy_two_vectors_activate_impl + { + 
template<typename vector_type1, typename vector_type2> + static void copy(vector_type1 & v1, vector_type2 & v2) + { + + } + + template<typename vector_type1, typename vector_type2> + static void copy2(vector_type1 & v1, vector_type2 & v2) + { + + } + }; + + template<> + struct copy_two_vectors_activate_impl<true> + { + template<typename vector_type1, typename vector_type2> + static void copy(vector_type1 & v1, vector_type2 & v2) + { +#ifdef __NVCC__ + if (v1.size() != 0) + { + auto it = v1.getGPUIterator(); + CUDA_LAUNCH(copy_two_vectors,it,v1.toKernel(),v2.toKernel()); + } +#endif + } + + template<typename vector_type1, typename vector_type2> + static void copy2(vector_type1 & v1, vector_type2 & v2) + { +#ifdef __NVCC__ + if (v2.size() != 0) + { + auto it = v1.getGPUIterator(); + CUDA_LAUNCH(copy_two_vectors,it,v1.toKernel(),v2.toKernel()); + } +#endif + } + }; + template<bool is_ok_cuda,typename T, typename Memory, template<typename> class layout_base, typename grow_p> @@ -288,6 +332,16 @@ namespace openfpm return v_size; } + /*! \brief Return the size of the vector + * + * \return the size + * + */ + size_t size_local() const + { + return v_size; + } + /*! \brief return the maximum capacity of the vector before reallocation * * \return the capacity of the vector @@ -1095,6 +1149,48 @@ namespace openfpm v_size -= keys.size() - start; } + /*! 
\brief Remove several entries from the vector + * + * \warning the keys in the vector MUST be sorted + * + * \param keys objects id to remove + * \param start key starting point + * + */ + void remove(openfpm::vector<aggregate<int>> & keys, size_t start = 0) + { + // Nothing to remove return + if (keys.size() <= start ) + return; + + size_t a_key = start; + size_t d_k = keys.template get<0>(a_key); + size_t s_k = keys.template get<0>(a_key) + 1; + + // keys + while (s_k < size()) + { + // s_k should always point to a key that is not going to be deleted + while (a_key+1 < keys.size() && s_k == keys.template get<0>(a_key+1)) + { + a_key++; + s_k = keys.template get<0>(a_key) + 1; + } + + // In case of overflow + if (s_k >= size()) + break; + + set(d_k,get(s_k)); + d_k++; + s_k++; + } + + // re-calculate the vector size + + v_size -= keys.size() - start; + } + /*! \brief Get an element of the vector * * Get an element of the vector @@ -1118,23 +1214,14 @@ namespace openfpm return base.template get<p>(key); } - /*! \brief Get an element of the vector + /*! \brief Indicate that this class is not a subset * - * Get an element of the vector - * - * \param id Element to get - * - * \return the element (encapsulated) + * \return false * */ - inline auto get(size_t id) -> decltype(base.get_o(grid_key_dx<1>(id))) + bool isSubset() const { -#if defined(SE_CLASS1) && !defined(__NVCC__) - check_overflow(id); -#endif - grid_key_dx<1> key(id); - - return base.get_o(key); + return false; } /*! \brief Get an element of the vector @@ -1146,7 +1233,7 @@ namespace openfpm * \return the element (encapsulated) * */ - inline auto get(size_t id) const -> const decltype(base.get_o(grid_key_dx<1>(id))) + inline auto get(size_t id) -> decltype(base.get_o(grid_key_dx<1>(id))) { #if defined(SE_CLASS1) && !defined(__NVCC__) check_overflow(id); @@ -1225,6 +1312,39 @@ namespace openfpm return base.get_o(key); } + /*! 
\brief Get an element of the vector + * + * Get an element of the vector + * + * \tparam p Property to get + * \param id Element to get + * + * \return the element value requested + * + */ + + template <unsigned int p,typename KeyType> + inline auto getProp(const KeyType & id) -> decltype(base.template get<p>(grid_key_dx<1>(0))) + { + return this->template get<p>(id.getKey()); + } + + /*! \brief Get an element of the vector + * + * Get an element of the vector + * + * \tparam p Property to get + * \param id Element to get + * + * \return the element value requested + * + */ + template <unsigned int p, typename keyType> + inline auto getProp(const keyType & id) const -> decltype(base.template get<p>(grid_key_dx<1>(0))) + { + return this->template get<p>(id.getKey()); + } + /*! \brief Get an element of the vector * * Get an element of the vector @@ -1247,6 +1367,28 @@ namespace openfpm return base.template get<p>(key); } + /*! \brief Get an element of the vector + * + * Get an element of the vector + * + * \param id Element to get + * + * \return the element (encapsulated) + * + */ + inline auto get(size_t id) const -> const decltype(base.get_o(grid_key_dx<1>(id))) + { +#ifdef SE_CLASS2 + check_valid(this,8); +#endif +#if defined(SE_CLASS1) && !defined(__NVCC__) + check_overflow(id); +#endif + grid_key_dx<1> key(id); + + return base.get_o(key); + } + /*! 
\brief Get the last element of the vector * * \return the element (encapsulated) @@ -1278,18 +1420,7 @@ namespace openfpm dup.v_size = v_size; dup.base.swap(base.duplicate()); - // copy the device part - // and device - if (Memory::isDeviceHostSame() == false) - { -#ifdef __NVCC__ - if (dup.size() != 0) - { - auto it = dup.getGPUIterator(); - CUDA_LAUNCH(copy_two_vectors,it,dup.toKernel(),toKernel()); - } -#endif - } + copy_two_vectors_activate_impl<Memory::isDeviceHostSame() == false>::copy(dup,*this); return dup; } @@ -1396,6 +1527,12 @@ namespace openfpm base.set(id,v.base,src); } + template<typename key_type> + key_type getOriginKey(key_type vec_key) + { + return vec_key; + } + /*! \brief Assignment operator * @@ -1436,18 +1573,7 @@ namespace openfpm base.set(key,mv.base,key); } - // and device - if (Memory::isDeviceHostSame() == false) - { -#ifdef __NVCC__ - if (mv.size() != 0) - { - auto it = mv.getGPUIterator(); - CUDA_LAUNCH(copy_two_vectors,it,toKernel(),mv.toKernel()); - } -#endif - } - + copy_two_vectors_activate_impl<Memory::isDeviceHostSame() == false>::copy2(*this,mv); return *this; } @@ -1491,17 +1617,7 @@ namespace openfpm base.set(key,mv.getInternal_base(),key); } - // and device - if (Memory::isDeviceHostSame() == false && Mem::isDeviceHostSame() == false) - { -#ifdef __NVCC__ - if (mv.size() != 0) - { - auto it = mv.getGPUIterator(); - CUDA_LAUNCH(copy_two_vectors,it,toKernel(),mv.toKernel()); - } -#endif - } + copy_two_vectors_activate_impl<Memory::isDeviceHostSame() == false && Mem::isDeviceHostSame() == false>::copy2(*this,mv); return *this; } @@ -1550,17 +1666,7 @@ namespace openfpm base.set_general(key,mv.getInternal_base(),key); } - // and device - if (Memory::isDeviceHostSame() == false && Mem::isDeviceHostSame() == false) - { -#ifdef __NVCC__ - if (mv.size() != 0) - { - auto it = mv.getGPUIterator(); - CUDA_LAUNCH(copy_two_vectors,it,toKernel(),mv.toKernel()); - } -#endif - } + copy_two_vectors_activate_impl<Memory::isDeviceHostSame() == 
false && Mem::isDeviceHostSame() == false>::copy2(*this,mv); return *this; } @@ -1680,7 +1786,7 @@ namespace openfpm * * */ - ite_gpu<1> getGPUIteratorTo(long int stop, size_t n_thr = 1024) const + ite_gpu<1> getGPUIteratorTo(long int stop, size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<1,long int> start(0); grid_key_dx<1,long int> stop_(stop); @@ -1690,6 +1796,21 @@ namespace openfpm #endif + /*! \brief Get the vector elements iterator + * + * + * \return an iterator to iterate through all the elements of the vector + * + */ + + vector_key_iterator getDomainIterator() const + { +#ifdef SE_CLASS2 + check_valid(this,8); +#endif + return getIterator(); + } + /*! \brief Get the vector elements iterator * * @@ -1702,13 +1823,28 @@ namespace openfpm return vector_key_iterator(v_size); } + /*! \brief Get the vector elements iterator + * + * + * \return an iterator to iterate through all the elements of the vector + * + */ + template<unsigned int p> + vector_key_iterator_ele<p,self_type> getIteratorElements() const + { +#ifdef SE_CLASS2 + check_valid(this,8); +#endif + return vector_key_iterator_ele<p,self_type>(*this,size()); + } + #ifdef CUDA_GPU /*! 
\brief Get an iterator for the GPU * * */ - ite_gpu<1> getGPUIterator(size_t n_thr = 1024) const + ite_gpu<1> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<1> start(0); grid_key_dx<1> stop(size()-1); @@ -1987,6 +2123,7 @@ namespace openfpm template <typename T> using vector_std = vector<T, HeapMemory, memory_traits_lin, openfpm::grow_policy_double, STD_VECTOR>; template<typename T> using vector_gpu = openfpm::vector<T,CudaMemory,memory_traits_inte>; + template<typename T> using vector_gpu_lin = openfpm::vector<T,CudaMemory,memory_traits_lin>; template<typename T> using vector_gpu_single = openfpm::vector<T,CudaMemory,memory_traits_inte,openfpm::grow_policy_identity>; template<typename T> using vector_custd = vector<T, CudaMemory, memory_traits_inte, openfpm::grow_policy_double, STD_VECTOR>; } diff --git a/src/Vector/map_vector_sparse.hpp b/src/Vector/map_vector_sparse.hpp index e34e8ea14b00ae7761f8e8d51fba4c7780badc28..95c667d93f179396b816a777d74bccf98a792b1d 100644 --- a/src/Vector/map_vector_sparse.hpp +++ b/src/Vector/map_vector_sparse.hpp @@ -17,7 +17,7 @@ #include <limits> #if defined(__NVCC__) - #if !defined(CUDA_ON_CPU) + #if !defined(CUDA_ON_CPU) && !defined(__HIP__) #include "util/cuda/moderngpu/kernel_segreduce.hxx" #include "util/cuda/moderngpu/kernel_merge.hxx" #endif @@ -26,6 +26,8 @@ #include "util/cuda/scan_ofp.cuh" #include "util/cuda/sort_ofp.cuh" +#include "util/cuda/segreduce_ofp.cuh" +#include "util/cuda/merge_ofp.cuh" enum flush_type { @@ -151,7 +153,7 @@ namespace openfpm assert((std::is_same<seg_type,int>::value == true)); - mgpu::segreduce( + openfpm::segreduce( (red_type *)vector_data.template getDeviceBuffer<reduction_type::prop::value>(), vector_data.size(), (int *)segment_offset.template getDeviceBuffer<1>(), segment_offset.size(), (red_type *)vector_data_red.template getDeviceBuffer<reduction_type::prop::value>(), @@ -1108,7 +1110,7 @@ namespace openfpm // we merge with vct_index with vct_add_index_unique 
in vct_merge_index, vct_merge_index contain the merged indexes // - mgpu::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(), + openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(), (Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_add_index_unique.size(), (Ti *)vct_merge_index.template getDeviceBuffer<0>(),(Ti *)vct_merge_index_map.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context); @@ -1325,7 +1327,7 @@ namespace openfpm // we merge with vct_index with vct_add_index_unique in vct_index_tmp, vct_intex_tmp2 contain the merging index // - mgpu::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(), + openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(), (Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),vct_add_index_unique.size(), (Ti *)vct_index_tmp.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp2.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context); @@ -1672,8 +1674,9 @@ namespace openfpm * */ template <unsigned int p> - auto insertFlush(Ti ele) -> decltype(vct_data.template get<p>(0)) + auto insertFlush(Ti ele, bool & is_new) -> decltype(vct_data.template get<p>(0)) { + is_new = false; size_t di; // first we have to search if the block exist @@ -1684,7 +1687,8 @@ namespace openfpm // block exist return vct_data.template get<p>(di); } - + is_new = true; + // It does not exist, we create it di contain the index where we have to create the new block vct_index.insert(di); vct_data.isert(di); @@ -1697,8 +1701,9 @@ namespace openfpm * \param ele element id * */ - auto insertFlush(Ti ele) -> decltype(vct_data.get(0)) + auto insertFlush(Ti ele, bool & is_new) -> 
decltype(vct_data.get(0)) { + is_new = false; Ti di; // first we have to search if the block exist @@ -1713,6 +1718,7 @@ namespace openfpm // It does not exist, we create it di contain the index where we have to create the new block vct_index.insert(di); vct_data.insert(di); + is_new = true; vct_index.template get<0>(di) = ele; diff --git a/src/Vector/performance/vector_performance_test.cu b/src/Vector/performance/vector_performance_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..26803ccac979981524be937b40c08ce60ef10e33 --- /dev/null +++ b/src/Vector/performance/vector_performance_test.cu @@ -0,0 +1,592 @@ +#define BOOST_TEST_DYN_LINK +#include <boost/test/unit_test.hpp> +#include "Plot/GoogleChart.hpp" +#include "timer.hpp" +#include <boost/property_tree/ptree.hpp> +#include <boost/property_tree/xml_parser.hpp> +#include "util/performance/performance_util.hpp" +#include "Point_test.hpp" +#include "util/stat/common_statistics.hpp" + +extern const char * test_dir; + +typedef Point_test<float> P; + +constexpr int N_STAT = 32; + +BOOST_AUTO_TEST_SUITE( performance ) + +#define NADD 128*128*128 +#define NADD_GPU 256*256*256 + +// Property tree +struct report_vector_func_tests +{ + boost::property_tree::ptree graphs; +}; + +report_vector_func_tests report_vector_funcs; + +BOOST_AUTO_TEST_SUITE( vector_performance ) + +BOOST_AUTO_TEST_CASE(vector_performance) +{ + report_vector_funcs.graphs.put("performance.vector(0).funcs.nele",NADD); + report_vector_funcs.graphs.put("performance.vector(0).funcs.name","add"); + + report_vector_funcs.graphs.put("performance.vector(1).funcs.nele",NADD); + report_vector_funcs.graphs.put("performance.vector(1).funcs.name","get"); + + std::vector<double> times(N_STAT + 1); + std::vector<double> times_g(N_STAT + 1); + + // get test + double tot_accu = 0.0; + + for (size_t i = 0 ; i < N_STAT+1 ; i++) + { + timer t; + t.start(); + + // create a vector + openfpm::vector<Point_test<float>> v1; + + // Point + 
Point_test<float> p; + p.setx(1.0); + p.sety(2.0); + p.setz(3.0); + p.sets(4.0); + + p.get<P::v>()[0] = 1.0; + p.get<P::v>()[1] = 2.0; + p.get<P::v>()[2] = 7.0; + + p.get<P::t>()[0][0] = 10.0; + p.get<P::t>()[0][1] = 13.0; + p.get<P::t>()[0][2] = 8.0; + p.get<P::t>()[1][0] = 19.0; + p.get<P::t>()[1][1] = 23.0; + p.get<P::t>()[1][2] = 5.0; + p.get<P::t>()[2][0] = 4.0; + p.get<P::t>()[2][1] = 3.0; + p.get<P::t>()[2][2] = 11.0; + + // Add test + + for (size_t j = 0 ; j < NADD ; j++) + { + v1.add(p); + } + + t.stop(); + times[i] = t.getwct(); + + timer tg; + tg.start(); + + for (size_t j = 0 ; j < NADD ; j++) + { + double accu1 = v1.template get<P::x>(j); + double accu2 = v1.template get<P::y>(j); + double accu3 = v1.template get<P::z>(j); + double accu4 = v1.template get<P::s>(j); + + double accu5 = v1.template get<P::v>(j)[0]; + double accu6 = v1.template get<P::v>(j)[1]; + double accu7 = v1.template get<P::v>(j)[2]; + + double accu8 = v1.template get<P::t>(j)[0][0]; + double accu9 = v1.template get<P::t>(j)[0][1]; + double accu10 = v1.template get<P::t>(j)[0][2]; + double accu11 = v1.template get<P::t>(j)[1][0]; + double accu12 = v1.template get<P::t>(j)[1][1]; + double accu13 = v1.template get<P::t>(j)[1][2]; + double accu14 = v1.template get<P::t>(j)[2][0]; + double accu15 = v1.template get<P::t>(j)[2][1]; + double accu16 = v1.template get<P::t>(j)[2][2]; + + tot_accu += accu1 + accu2 + accu3 + accu4 + accu5 + accu6 + accu7 + accu8 + accu9 + accu10 + accu11 + accu12 + + accu13 + accu14 + accu15 + accu16; + } + + tg.stop(); + + times_g[i] = tg.getwct(); + } + + double mean; + double dev; + standard_deviation(times,mean,dev); + + report_vector_funcs.graphs.put("performance.vector(0).y.data.mean",mean); + report_vector_funcs.graphs.put("performance.vector(0).y.data.dev",dev); + + standard_deviation(times_g,mean,dev); + + report_vector_funcs.graphs.put("performance.vector(1).y.data.mean",mean); + report_vector_funcs.graphs.put("performance.vector(1).y.data.dev",dev); 
+} + +template<typename vector_prop_type, typename vector_pos_type> +__device__ __host__ void read_write(vector_prop_type & vd_prop, vector_pos_type & vd_pos, unsigned int p) +{ + vd_prop.template get<0>(p) = vd_pos.template get<0>(p)[0] + vd_pos.template get<0>(p)[1]; + + vd_prop.template get<1>(p)[0] = vd_pos.template get<0>(p)[0]; + vd_prop.template get<1>(p)[1] = vd_pos.template get<0>(p)[1]; + + vd_prop.template get<2>(p)[0][0] = vd_pos.template get<0>(p)[0]; + vd_prop.template get<2>(p)[0][1] = vd_pos.template get<0>(p)[1]; + vd_prop.template get<2>(p)[1][0] = vd_pos.template get<0>(p)[0] + + vd_pos.template get<0>(p)[1]; + vd_prop.template get<2>(p)[1][1] = vd_pos.template get<0>(p)[1] - + vd_pos.template get<0>(p)[0]; + + vd_pos.template get<0>(p)[0] += 0.01f; + vd_pos.template get<0>(p)[1] += 0.01f; +} + +template<typename vector_type1, typename vector_type2> +__global__ void read_write_ker(vector_type1 v1, vector_type2 v2) +{ + unsigned int p = + blockIdx.x * blockDim.x + threadIdx.x; + + read_write(v1,v2,p); +} + + +struct ele +{ + double s; + double v[2]; + double t[2][2]; +}; + +__device__ __host__ void read_write_lin(double * pos, ele * prp, unsigned int p) +{ + prp[p].s = pos[2*p] + pos[2*p+1]; + + prp[p].v[0] = pos[2*p]; + prp[p].v[1] = pos[2*p+1]; + + prp[p].t[0][0] = pos[2*p]; + prp[p].t[0][1] = pos[2*p+1]; + prp[p].t[1][0] = pos[2*p] + pos[2*p+1]; + prp[p].t[1][1] = pos[2*p+1] - pos[2*p]; + + pos[2*p] += 0.01f; + pos[2*p+1] += 0.01f; +} + + +__global__ void read_write_lin_ker(double * pos, ele * prp) +{ + unsigned int p = blockIdx.x * blockDim.x + threadIdx.x; + + read_write_lin(pos,prp,p); +} + +__device__ __host__ void read_write_inte(double * pos, double * prp0, double * prp1, double * prp2, unsigned int p, unsigned int n_pos) +{ + prp0[0*n_pos + p] = pos[0*n_pos + p] + pos[1*n_pos+p]; + + prp1[0*n_pos + p] = pos[0*n_pos + p]; + prp1[1*n_pos + p] = pos[1*n_pos + p]; + + prp2[0*n_pos*2+0*n_pos + p] = pos[0*n_pos + p]; + prp2[0*n_pos*2+1*n_pos + 
p] = pos[1*n_pos + p]; + prp2[1*n_pos*2+0*n_pos + p] = pos[0*n_pos + p] + + pos[1*n_pos + p]; + prp2[1*n_pos*2+1*n_pos + p] = pos[1*n_pos + p] - + pos[0*n_pos + p]; + + pos[0*n_pos + p] += 0.01f; + pos[1*n_pos + p] += 0.01f; +} + +__global__ void read_write_inte_ker(double * pos, double * prp0, double * prp1, double * prp2, unsigned int n_pos) +{ + unsigned int p = blockIdx.x * blockDim.x + threadIdx.x; + + read_write_inte(pos,prp0,prp1,prp2,p,n_pos); +} + +BOOST_AUTO_TEST_CASE(vector_performance_layout_vs_plain_array) +{ + std::vector<double> times(N_STAT + 1); + std::vector<double> times_g(N_STAT + 1); + + std::vector<double> times2(N_STAT + 1); + std::vector<double> times2_g(N_STAT + 1); + + report_vector_funcs.graphs.put("performance.vector_layout(0).funcs.nele",NADD); + report_vector_funcs.graphs.put("performance.vector_layout(0).funcs.name","read_write_lin"); + + for (size_t i = 0 ; i < N_STAT+1 ; i++) + { + // create a vector + openfpm::vector<aggregate<double,double[2],double[2][2]>> v1; + openfpm::vector<aggregate<double[2]>> v2; + + // Point + aggregate<double[2]> p; + p.get<0>()[0] = 1.0; + p.get<0>()[1] = 2.0; + + aggregate<double,double[2],double[2][2]> pa; + pa.get<0>() = 1.0; + + pa.get<1>()[0] = 1.0; + pa.get<1>()[1] = 1.0; + + pa.get<2>()[0][0] = 1.0; + pa.get<2>()[0][1] = 1.0; + pa.get<2>()[1][0] = 1.0; + pa.get<2>()[1][1] = 1.0; + + // Add test + + for (size_t j = 0 ; j < NADD ; j++) + { + v1.add(pa); + v2.add(p); + } + + timer tg; + tg.start(); + + for (size_t j = 0 ; j < NADD ; j++) + { + read_write(v1,v2,j); + } + + tg.stop(); + + times_g[i] = tg.getwct(); + + timer tga; + tga.start(); + + double * prp = (double *)v1.getPointer<0>(); + double * pos = (double *)v2.getPointer<0>(); + + for (size_t j = 0 ; j < NADD ; j++) + { + read_write_lin(pos,(struct ele *)prp,j); + } + + tga.stop(); + + times[i] = tga.getwct(); + } + + double mean; + double dev; + standard_deviation(times_g,mean,dev); + + double mean_; + double dev_; + 
standard_deviation(times,mean_,dev_); + + report_vector_funcs.graphs.put("performance.vector_layout(0).y.data.mean",mean_/mean); + + // Deviation od x/y = x/y^2 dy + 1/y dx + + report_vector_funcs.graphs.put("performance.vector_layout(0).y.data.dev",mean_/(mean*mean)*dev + dev_ / mean ); + + report_vector_funcs.graphs.put("performance.vector_layout(1).funcs.nele",NADD); + report_vector_funcs.graphs.put("performance.vector_layout(1).funcs.name","read_write_inte"); + + for (size_t i = 0 ; i < N_STAT+1 ; i++) + { + // create a vector + openfpm::vector<aggregate<double,double[2],double[2][2]>,HeapMemory,memory_traits_inte> v1; + openfpm::vector<aggregate<double[2]>,HeapMemory,memory_traits_inte> v2; + + // Point + aggregate<double[2]> p; + p.get<0>()[0] = 1.0; + p.get<0>()[1] = 2.0; + + aggregate<double,double[2],double[2][2]> pa; + pa.get<0>() = 1.0; + + pa.get<1>()[0] = 1.0; + pa.get<1>()[1] = 1.0; + + pa.get<2>()[0][0] = 1.0; + pa.get<2>()[0][1] = 1.0; + pa.get<2>()[1][0] = 1.0; + pa.get<2>()[1][1] = 1.0; + + // Add test + + for (size_t j = 0 ; j < NADD ; j++) + { + v1.add(pa); + v2.add(p); + } + + timer tg; + tg.start(); + + for (size_t j = 0 ; j < NADD ; j++) + { + read_write(v1,v2,j); + } + + tg.stop(); + + times2_g[i] = tg.getwct(); + int sz = v1.size(); + + timer tga; + tga.start(); + + double * prp0 = (double *)v1.getPointer<0>(); + double * prp1 = (double *)v1.getPointer<1>(); + double * prp2 = (double *)v1.getPointer<2>(); + + double * pos = (double *)v2.getPointer<0>(); + + for (size_t j = 0 ; j < NADD ; j++) + { + read_write_inte(pos,prp0,prp1,prp2,j,sz); + } + + tga.stop(); + + times2[i] = tga.getwct(); + } + + double mean2; + double dev2; + standard_deviation(times2_g,mean2,dev2); + + double mean2_; + double dev2_; + standard_deviation(times2,mean2_,dev2_); + + report_vector_funcs.graphs.put("performance.vector_layout(1).y.data.mean",mean2_/mean2); + + // Deviation od x/y = x/y^2 dy + 1/y dx + + 
report_vector_funcs.graphs.put("performance.vector_layout(1).y.data.dev",mean2_/(mean2*mean2)*dev2 + dev2_ / mean2 ); +} + +BOOST_AUTO_TEST_CASE(vector_performance_gpu_layout_vs_plain_array) +{ + std::vector<double> times(N_STAT + 1); + std::vector<double> times_g(N_STAT + 1); + + std::vector<double> times2(N_STAT + 1); + std::vector<double> times2_g(N_STAT + 1); + + // get test + double tot_accu = 0.0; + + report_vector_funcs.graphs.put("performance.vector_layout_gpu(0).funcs.nele",NADD_GPU); + report_vector_funcs.graphs.put("performance.vector_layout_gpu(0).funcs.name","read_write_lin"); + + for (size_t i = 0 ; i < N_STAT+1 ; i++) + { + // create a vector + openfpm::vector<aggregate<double,double[2],double[2][2]>,CudaMemory> v1; + openfpm::vector<aggregate<double[2]>,CudaMemory> v2; + + // Point + aggregate<double[2]> p; + p.get<0>()[0] = 1.0; + p.get<0>()[1] = 2.0; + + aggregate<double,double[2],double[2][2]> pa; + pa.get<0>() = 1.0; + + pa.get<1>()[0] = 1.0; + pa.get<1>()[1] = 1.0; + + pa.get<2>()[0][0] = 1.0; + pa.get<2>()[0][1] = 1.0; + pa.get<2>()[1][0] = 1.0; + pa.get<2>()[1][1] = 1.0; + + // Add test + + for (size_t j = 0 ; j < NADD_GPU ; j++) + { + v1.add(pa); + v2.add(p); + } + + auto ite = v1.getGPUIterator(1536); + + { + + timer tga; + tga.startGPU(); + CUDA_LAUNCH(read_write_ker,ite,v1.toKernel(),v2.toKernel()); + + tga.stopGPU(); + times_g[i] = tga.getwctGPU(); + } + + std::cout << "OpenFPM: " << times_g[i] << std::endl; + + timer tga2; + tga2.startGPU(); + + double * prp = (double *)v1.toKernel().getPointer<0>(); + double * pos = (double *)v2.toKernel().getPointer<0>(); + + CUDA_LAUNCH(read_write_lin_ker,ite,pos,(struct ele *)prp); + + tga2.stopGPU(); + + times[i] = tga2.getwctGPU(); + std::cout << "Array: " << times[i] << std::endl; + } + + double mean; + double dev; + standard_deviation(times_g,mean,dev); + + double mean_; + double dev_; + standard_deviation(times,mean_,dev_); + + 
report_vector_funcs.graphs.put("performance.vector_layout_gpu(0).y.data.mean",mean_/mean); + + // Deviation od x/y = x/y^2 dy + 1/y dx + + report_vector_funcs.graphs.put("performance.vector_layout_gpu(0).y.data.dev",mean_/(mean*mean)*dev + dev_ / mean ); + + report_vector_funcs.graphs.put("performance.vector_layout_gpu(1).funcs.nele",NADD); + report_vector_funcs.graphs.put("performance.vector_layout_gpu(1).funcs.name","read_write_inte"); + + for (size_t i = 0 ; i < N_STAT+1 ; i++) + { + // create a vector + openfpm::vector<aggregate<double,double[2],double[2][2]>,CudaMemory,memory_traits_inte> v1; + openfpm::vector<aggregate<double[2]>,CudaMemory,memory_traits_inte> v2; + + // Point + aggregate<double[2]> p; + p.get<0>()[0] = 1.0; + p.get<0>()[1] = 2.0; + + aggregate<double,double[2],double[2][2]> pa; + pa.get<0>() = 1.0; + + pa.get<1>()[0] = 1.0; + pa.get<1>()[1] = 1.0; + + pa.get<2>()[0][0] = 1.0; + pa.get<2>()[0][1] = 1.0; + pa.get<2>()[1][0] = 1.0; + pa.get<2>()[1][1] = 1.0; + + // Add test + + for (size_t j = 0 ; j < NADD_GPU ; j++) + { + v1.add(pa); + v2.add(p); + } + + timer tg; + tg.startGPU(); + + auto ite = v1.getGPUIterator(1536); + + CUDA_LAUNCH(read_write_ker,ite,v1.toKernel(),v2.toKernel()); + + tg.stopGPU(); + + times2_g[i] = tg.getwctGPU(); + std::cout << "OpenFPM inte: " << times2_g[i] << std::endl; + + int sz = v1.size(); + + timer tga; + tga.startGPU(); + + double * prp0 = (double *)v1.toKernel().getPointer<0>(); + double * prp1 = (double *)v1.toKernel().getPointer<1>(); + double * prp2 = (double *)v1.toKernel().getPointer<2>(); + + double * pos = (double *)v2.toKernel().getPointer<0>(); + + CUDA_LAUNCH(read_write_inte_ker,ite,pos,prp0,prp1,prp2,sz); + + tga.stopGPU(); + + times2[i] = tga.getwctGPU(); + + std::cout << "Array inte: " << times2[i] << std::endl; + } + + double mean2; + double dev2; + standard_deviation(times2_g,mean2,dev2); + + double mean2_; + double dev2_; + standard_deviation(times2,mean2_,dev2_); + + 
report_vector_funcs.graphs.put("performance.vector_layout_gpu(1).y.data.mean",mean2_/mean2); + + // Deviation od x/y = x/y^2 dy + 1/y dx + + report_vector_funcs.graphs.put("performance.vector_layout_gpu(1).y.data.dev",mean2_/(mean2*mean2)*dev2 + dev2_ / mean2 ); +} + +BOOST_AUTO_TEST_CASE(vector_performance_write_report) +{ + // Create a graphs + + report_vector_funcs.graphs.put("graphs.graph(0).type","line"); + report_vector_funcs.graphs.add("graphs.graph(0).title","Vector add and get"); + report_vector_funcs.graphs.add("graphs.graph(0).x.title","Tests"); + report_vector_funcs.graphs.add("graphs.graph(0).y.title","Time seconds"); + report_vector_funcs.graphs.add("graphs.graph(0).y.data(0).source","performance.vector(#).y.data.mean"); + report_vector_funcs.graphs.add("graphs.graph(0).x.data(0).source","performance.vector(#).funcs.name"); + report_vector_funcs.graphs.add("graphs.graph(0).y.data(0).title","Actual"); + report_vector_funcs.graphs.add("graphs.graph(0).interpolation","lines"); + + report_vector_funcs.graphs.put("graphs.graph(1).type","line"); + report_vector_funcs.graphs.add("graphs.graph(1).title","Vector read write"); + report_vector_funcs.graphs.add("graphs.graph(1).x.title","Layout"); + report_vector_funcs.graphs.add("graphs.graph(1).y.title","Time seconds"); + report_vector_funcs.graphs.add("graphs.graph(1).y.data(0).source","performance.vector_layout(#).y.data.mean"); + report_vector_funcs.graphs.add("graphs.graph(1).x.data(0).source","performance.vector_layout(#).funcs.name"); + report_vector_funcs.graphs.add("graphs.graph(1).y.data(0).title","Actual"); + report_vector_funcs.graphs.add("graphs.graph(1).interpolation","lines"); + + report_vector_funcs.graphs.put("graphs.graph(2).type","line"); + report_vector_funcs.graphs.add("graphs.graph(2).title","Vector GPU read write"); + report_vector_funcs.graphs.add("graphs.graph(2).x.title","Layout"); + report_vector_funcs.graphs.add("graphs.graph(2).y.title","Time seconds"); + 
report_vector_funcs.graphs.add("graphs.graph(2).y.data(0).source","performance.vector_layout_gpu(#).y.data.mean"); + report_vector_funcs.graphs.add("graphs.graph(2).x.data(0).source","performance.vector_layout_gpu(#).funcs.name"); + report_vector_funcs.graphs.add("graphs.graph(2).y.data(0).title","Actual"); + report_vector_funcs.graphs.add("graphs.graph(2).interpolation","lines"); + + boost::property_tree::xml_writer_settings<std::string> settings(' ', 4); + boost::property_tree::write_xml("vector_performance_funcs.xml", report_vector_funcs.graphs,std::locale(),settings); + + GoogleChart cg; + + std::string file_xml_ref(test_dir); + file_xml_ref += std::string("/openfpm_data/vector_performance_funcs_ref.xml"); + + StandardXMLPerformanceGraph("vector_performance_funcs.xml",file_xml_ref,cg); + + addUpdateTime(cg,1,"data","vector_performance_funcs"); + + cg.write("vector_performance_funcs.html"); +} + +BOOST_AUTO_TEST_SUITE_END() + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/Vector/performance/vector_performance_test.hpp b/src/Vector/performance/vector_performance_test.hpp index 1cf7ad3520e65f14ddfb213c2fb84cdc466c6016..d1dfe49198d278cdbbebc9eff420119aec985db2 100644 --- a/src/Vector/performance/vector_performance_test.hpp +++ b/src/Vector/performance/vector_performance_test.hpp @@ -392,7 +392,7 @@ BOOST_AUTO_TEST_CASE(vector_performance_write_report) StandardXMLPerformanceGraph("vector_performance_funcs.xml",file_xml_ref,cg); - addUpdtateTime(cg,1); + addUpdtateTime(cg,1,"data","vector_performance_funcs"); cg.write("vector_performance_funcs.html"); } diff --git a/src/Vector/vector_map_iterator.hpp b/src/Vector/vector_map_iterator.hpp index 65b419d91825bda5cd295306c7fdcdf82730d436..78bb4958fb207051c657f5c2e28cc06ef5d2ccee 100644 --- a/src/Vector/vector_map_iterator.hpp +++ b/src/Vector/vector_map_iterator.hpp @@ -11,6 +11,97 @@ namespace openfpm { /*! 
\brief Vector iterator + * + */ + template<unsigned int p, typename vector_type> + class vector_key_iterator_ele + { + //! Linearized end element + const vector_type & vect; + + size_t end; + + protected: + + //! Actual key + size_t gk; + + public: + + /*! \brief Constructor require the size of the vector + * + * \param end point + * \param start starting point + * + */ + vector_key_iterator_ele(const vector_type & vect, size_t end, size_t start = 0) + : vect(vect),end(end),gk(start) + {} + + + /*! \brief Get the next element + * + * Get the next element + * + * \return the next grid_key + * + */ + vector_key_iterator_ele & operator++() + { + //! increment the first index + + gk++; + + return *this; + } + + /*! \brief Set the dimension + * + * \param d is the dimension (IGNORED is by default 0) + * \param sz set the counter to sz + * + */ + void set(int d, size_t sz) + { + // set the counter dim to sz + + gk = sz; + } + + /*! \brief Check if there is the next element + * + * Check if there is the next element + * + * \return true if there is the next, false otherwise + * + */ + bool isNext() const + { + if (gk < end) + { + //! we did not reach the end of the grid + + return true; + } + + //! we reach the end of the grid + return false; + } + + /*! \brief Get the actual key + * + * Get the actual key + * + * \return the actual key + * + */ + auto get() const -> decltype(vect.template get<p>(gk)) + { + return vect.template get<p>(gk); + } + }; + + /*! 
\brief Vector iterator * */ diff --git a/src/Vector/vector_test_util.hpp b/src/Vector/vector_test_util.hpp index 5a69757d1c2180ca377b7bc88d09f5cacbccf92e..6641954d64d9ad5c2324b39f71adb285d18b1308 100644 --- a/src/Vector/vector_test_util.hpp +++ b/src/Vector/vector_test_util.hpp @@ -27,7 +27,7 @@ typedef Point_test<float> P; #include "timer.hpp" -std::vector<Point_orig<float>> allocate_stl() +static std::vector<Point_orig<float>> allocate_stl() { std::vector<Point_orig<float>> v_stl_test; @@ -76,7 +76,7 @@ openfpm::vector<T> allocate_openfpm_primitive(size_t n, size_t fill) return v; } -openfpm::vector<Point_test<float>> allocate_openfpm_fill(size_t n, size_t fill) +static openfpm::vector<Point_test<float>> allocate_openfpm_fill(size_t n, size_t fill) { Point_test<float> pt; openfpm::vector<Point_test<float>> v_send; @@ -175,7 +175,7 @@ template<typename vector> vector allocate_openfpm(size_t n_ele) return v_ofp_test; } -openfpm::vector<Point_test_prp<float>> allocate_openfpm_prp(size_t n_ele) +static openfpm::vector<Point_test_prp<float>> allocate_openfpm_prp(size_t n_ele) { openfpm::vector<Point_test_prp<float>> v_ofp_test; @@ -242,7 +242,7 @@ openfpm::vector<Point_test_prp<float>> allocate_openfpm_prp(size_t n_ele) } -openfpm::vector< aggregate<float,float,float,float,float[3],float[3][3],openfpm::vector<int>> > allocate_openfpm_aggregate_with_complex(size_t n_ele) +static openfpm::vector< aggregate<float,float,float,float,float[3],float[3][3],openfpm::vector<int>> > allocate_openfpm_aggregate_with_complex(size_t n_ele) { //! 
[Create add and access] openfpm::vector< aggregate<float,float,float,float,float[3],float[3][3],openfpm::vector<int>> > v_ofp_test; diff --git a/src/Vector/vector_unit_tests.hpp b/src/Vector/vector_unit_tests.hpp index d13f6ac7c0cc08b905baf9d363eae1333fce5ed7..27ba66962c58d8dc5af30007b5a35f94cb7c0860 100644 --- a/src/Vector/vector_unit_tests.hpp +++ b/src/Vector/vector_unit_tests.hpp @@ -156,6 +156,80 @@ template <typename vector> void test_vector_remove() } } + +template <typename vector> void test_vector_remove_aggregate() +{ + typedef Point_test<float> p; + + //! [Create push and multiple remove] + + vector v1; + + for (size_t i = 0 ; i < V_REM_PUSH ; i++) + { + // Point + Point_test<float> p; + p.setx(i); + + v1.add(p); + } + + { + openfpm::vector<aggregate<int>> rem; + rem.add(); + rem.last().get<0>() = 0; + rem.add(); + rem.last().get<0>() = 1; + rem.add(); + rem.last().get<0>() = 2; + rem.add(); + rem.last().get<0>() = 3; + + v1.remove(rem); + } + + //! [Create push and multiple remove] + + BOOST_REQUIRE_EQUAL(v1.size(),1020ul); + BOOST_REQUIRE_EQUAL(v1.template get<p::x>(0),4); + + { + openfpm::vector<aggregate<int>> rem; + rem.add(); + rem.last().get<0>() = v1.size()-3; + rem.add(); + rem.last().get<0>() = v1.size()-2; + rem.add(); + rem.last().get<0>() = v1.size()-1; + rem.add(); + rem.last().get<0>() = v1.size(); + + v1.remove(rem); + } + + BOOST_REQUIRE_EQUAL(v1.size(),1016ul); + BOOST_REQUIRE_EQUAL(v1.template get<p::x>(v1.size()-1),1019); + + { + openfpm::vector<aggregate<int>> rem; + for (size_t i = 0 ; i < (V_REM_PUSH - 8) / 2 ; i++) + { + rem.add(); + rem.last().get<0>() = i * 2; + } + // remove all the even number + v1.remove(rem); + } + + BOOST_REQUIRE_EQUAL(v1.size(),508ul); + + // Check only odd + for (size_t i = 0 ; i < v1.size() ; i++) + { + BOOST_REQUIRE_EQUAL((size_t)v1.template get<p::x>(v1.size()-1) % 2, 1ul); + } +} + template <typename vector> void test_vector_insert() { typedef Point_test<float> p; @@ -446,6 +520,12 @@ 
BOOST_AUTO_TEST_CASE(vector_remove ) test_vector_remove< openfpm::vector<Point_test<float>,HeapMemory, memory_traits_inte> >(); } +BOOST_AUTO_TEST_CASE(vector_remove_aggregate ) +{ + test_vector_remove_aggregate<openfpm::vector<Point_test<float>>>(); + test_vector_remove_aggregate< openfpm::vector<Point_test<float>,HeapMemory, memory_traits_inte> >(); +} + BOOST_AUTO_TEST_CASE(vector_insert ) { test_vector_insert<openfpm::vector<Point_test<float>>>(); diff --git a/src/config/config_cmake.h.in b/src/config/config_cmake.h.in index 01c196978f3feb7cb93fd2bc3d0aec46446a3162..b15e33f40c557f801cc251b997c2876e21f71e08 100644 --- a/src/config/config_cmake.h.in +++ b/src/config/config_cmake.h.in @@ -4,6 +4,18 @@ ${DEFINE_COVERTY_SCAN} /* GPU support */ ${DEFINE_CUDA_GPU} +/* Define CUDIFY backend */ +${DEFINE_CUDIFY_BACKEND} + +/* OpenMP support */ +${DEFINE_HAVE_OPENMP} + +/* HIP GPU support */ +${DEFINE_HIP_GPU} + +/* HIP Cudify GPU support */ +${DEFINE_CUDIFY_USE_HIP} + /* Debug */ ${DEFINE_DEBUG} /**/ @@ -174,6 +186,9 @@ ${DEFINE_STDC_HEADERS} /* If an error occur stop the program */ ${DEFINE_STOP_ON_ERROR} +/* Garbage injector*/ +${DEFINE_GARBAGE_INJECTOR} + /* Test coverage mode */ ${DEFINE_TEST_COVERAGE_MODE} @@ -183,4 +198,3 @@ ${DEFINE_TEST_COVERAGE_MODE} /* Version number of package */ #define VERSION "1.0.0" -#define OPENFPM_PDATA diff --git a/src/data_type/aggregate.hpp b/src/data_type/aggregate.hpp index 3a80ee31b51a5760ff7217d989ab66aec0478560..e8b61fdb8708bf07db74625dea02028ab78379a2 100644 --- a/src/data_type/aggregate.hpp +++ b/src/data_type/aggregate.hpp @@ -304,4 +304,15 @@ namespace openfpm } } +template<typename BlockT, typename T> +struct AggregateAppend +{ +}; + +template<typename BlockT, typename ... 
list> +struct AggregateAppend<BlockT, aggregate<list ...>> +{ + typedef aggregate<list..., BlockT> type; +}; + #endif /* OPENFPM_DATA_SRC_UTIL_AGGREGATE_HPP_ */ diff --git a/src/main.cpp b/src/main.cpp index fd69236807133256957bcabaf2831de402ceb62f..893167fb884a3377a884127ce7e013bade27a617 100755 --- a/src/main.cpp +++ b/src/main.cpp @@ -2,27 +2,13 @@ #define FUSION_MAX_VECTOR_SIZE 20 #include "config.h" + #define BOOST_TEST_DYN_LINK #include <boost/test/unit_test.hpp> #include "util/math_util_complex.hpp" #define DISABLE_MPI_WRITTERS -// initialization function: -bool init_unit_test() -{ - return true; -} - -std::vector<int> sieve_spf; - -// entry point: -int main(int argc, char* argv[]) -{ - openfpm::math::init_getFactorization(); - return boost::unit_test::unit_test_main( &init_unit_test, argc, argv ); -} - #include <boost/fusion/include/mpl.hpp> #include <iostream> @@ -55,7 +41,32 @@ int main(int argc, char* argv[]) #include "NN/VerletList/VerletList_test.hpp" #include "Grid/iterators/grid_iterators_unit_tests.cpp" #include "util/test/compute_optimal_device_grid_unit_tests.hpp" + #ifdef PERFORMANCE_TEST #include "performance.hpp" #endif + +#ifndef NO_INIT_AND_MAIN + +// initialization function: +bool init_unit_test() +{ + return true; +} + +std::vector<int> sieve_spf; + +// entry point: +int main(int argc, char* argv[]) +{ + openfpm::math::init_getFactorization(); + return boost::unit_test::unit_test_main( &init_unit_test, argc, argv ); +} + #include "unit_test_init_cleanup.hpp" + +#endif + + + + diff --git a/src/timer.hpp b/src/timer.hpp index 787223a87a5d0b99988195179e317eab8c080a20..f87a0e1cff42bd94268d70791de7ea96d167d792 100644 --- a/src/timer.hpp +++ b/src/timer.hpp @@ -42,14 +42,22 @@ class timer clock_t cstop; #if defined(__NVCC__) && !defined(CUDA_ON_CPU) - cudaEvent_t start_g, stop_g; + #ifdef __HIP__ + hipEvent_t start_g, stop_g; + #else + cudaEvent_t start_g, stop_g; + #endif #endif // Fill the stop point void check() { -#if 
defined(SYNC_BEFORE_TAKE_TIME) && defined(__NVCC__) +#if defined(SYNC_BEFORE_TAKE_TIME) && defined(__NVCC__) + #ifdef __HIP__ + hipDeviceSynchronize(); + #else cudaDeviceSynchronize(); + #endif #endif #ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time @@ -153,21 +161,37 @@ public: void startGPU() { + #ifdef __HIP__ + hipEventCreate(&start_g); + hipEventRecord(start_g,0); + #else cudaEventCreate(&start_g); cudaEventRecord(start_g,0); + #endif } void stopGPU() { - cudaEventCreate(&stop_g); - cudaEventRecord(stop_g,0); - cudaEventSynchronize(stop_g); + #ifdef __HIP__ + hipEventCreate(&stop_g); + hipEventRecord(stop_g,0); + hipEventSynchronize(stop_g); + #else + cudaEventCreate(&stop_g); + cudaEventRecord(stop_g,0); + cudaEventSynchronize(stop_g); + #endif } double getwctGPU() { float elapsedTime; + + #ifdef __HIP__ + hipEventElapsedTime(&elapsedTime, start_g,stop_g); + #else cudaEventElapsedTime(&elapsedTime, start_g,stop_g); + #endif return elapsedTime; } diff --git a/src/unit_test_init_cleanup.hpp b/src/unit_test_init_cleanup.hpp index 5157a1345f0aa8afbd74514aedb10d3f97395989..64b0c580b8443817abd07e58ce99c44f7a50c232 100644 --- a/src/unit_test_init_cleanup.hpp +++ b/src/unit_test_init_cleanup.hpp @@ -8,6 +8,8 @@ #ifndef UNIT_TEST_INIT_CLEANUP_HPP_ #define UNIT_TEST_INIT_CLEANUP_HPP_ +#include "util/cudify/cudify.hpp" + //! boost unit test fixation (start procedure to call before testing) struct ut_start { diff --git a/src/util/common.hpp b/src/util/common.hpp index 6b8a12a5d9d8a34d23ae9fc8ce956a8e2f24a479..cbcff1361e4eb13d9e829ef9175bdefc9c7cd0b7 100644 --- a/src/util/common.hpp +++ b/src/util/common.hpp @@ -13,6 +13,7 @@ constexpr int RUN_ON_DEVICE = 1024; + __GNUC_MINOR__ * 100 \ + __GNUC_PATCHLEVEL__) + template<unsigned int N,typename T> struct static_array { @@ -330,17 +331,16 @@ struct is_openfpm_native<T,true> : std::true_type template<typename T, typename Sfinae = void> struct has_value_type_ofp: std::false_type {}; -/*! 
\brief has_value_type check if a type has defined a member value_type +/*! \brief has_value_type_ofp check if a type has defined a member value_type * * ### Example * - * \snippet util_test.hpp Check has_value_type + * \snippet util_test.hpp Check has_value_type_ofp * * return true if T::value_type is a valid type * */ template<typename T> -//struct has_value_type<T, typename Void<decltype( typename T::value_type )>::type> : std::true_type struct has_value_type_ofp<T, typename Void< typename T::value_type>::type> : std::true_type {}; diff --git a/src/util/copy_compare/copy_general.hpp b/src/util/copy_compare/copy_general.hpp index 5b1c75c938198e2b3b08e2c038b6991476fc82f1..0a26a43452ab8a592606014277991fd095a54eb6 100644 --- a/src/util/copy_compare/copy_general.hpp +++ b/src/util/copy_compare/copy_general.hpp @@ -80,7 +80,8 @@ struct add_ } }; -#ifdef __clang__ +#if defined(__clang__) && !defined(CUDA_ON_CPU) + template<typename Tsrc> __host__ Tsrc atomicAdd(Tsrc * ptr, Tsrc value) { diff --git a/src/util/copy_compare/meta_copy.hpp b/src/util/copy_compare/meta_copy.hpp index 394c209b146dde26dd54e2cdcc69b3b6c0a7e307..96e7a85a05c01a765f6179fb36893738959e908a 100644 --- a/src/util/copy_compare/meta_copy.hpp +++ b/src/util/copy_compare/meta_copy.hpp @@ -5,6 +5,28 @@ #include "util/cuda_util.hpp" #include "util/multi_array_openfpm/multi_array_ref_openfpm.hpp" + +template<typename ArrTypeView> +struct std_array_vector_view +{ + int pos; + ArrTypeView arr; + + std_array_vector_view(int pos,ArrTypeView arr) + :pos(pos),arr(arr) + {} + + decltype(arr[0][0]) operator[](int comp) + { + return arr[comp][pos]; + } + + decltype(std::declval<const ArrTypeView>()[0][0]) operator[](int comp) const + { + return arr[comp][pos]; + } +}; + /*! \brief This class copy general objects * * * primitives @@ -102,6 +124,21 @@ struct meta_copy<T[N1]> } } + /*! 
\brief copy and object from src to dst + * + * \param src source object to copy + * \param dst destination object + * + */ + template<typename T2> + __device__ __host__ static inline void meta_copy_(const T src[N1], std_array_vector_view<T2> dst) + { + for (size_t i1 = 0 ; i1 < N1 ; i1++) + { + copy_general<T>(src[i1],dst[i1]); + } + } + /*! \brief copy and object from src to dst * * \param src source object to copy diff --git a/src/util/cuda/merge_ofp.cuh b/src/util/cuda/merge_ofp.cuh new file mode 100644 index 0000000000000000000000000000000000000000..2c9250053ffe7d0b05b93cf788248703baebd36d --- /dev/null +++ b/src/util/cuda/merge_ofp.cuh @@ -0,0 +1,112 @@ +/* + * segreduce_ofp.hpp + * + * Created on: May 15, 2019 + * Author: i-bird + */ + + #ifndef MERGE_OFP_HPP_ + #define MERGE_OFP_HPP_ + + #ifdef __NVCC__ + + #include "Vector/map_vector.hpp" + #include "util/cuda_launch.hpp" + + #if CUDART_VERSION >= 11000 + #ifndef CUDA_ON_CPU + // Here we have for sure CUDA >= 11 + #ifdef __HIP__ + #undef __CUDACC__ + #undef __CUDA__ + #include <thrust/merge.h> + #include <thrust/execution_policy.h> + #define __CUDACC__ + #define __CUDA__ + #else + #include "util/cuda/moderngpu/kernel_merge.hxx" + #endif + #endif + #else + #include "util/cuda/moderngpu/kernel_merge.hxx" + #endif + #include "util/cuda/ofp_context.hxx" + + + namespace openfpm + { + template<typename a_keys_it, typename a_vals_it, + typename b_keys_it, typename b_vals_it, + typename c_keys_it, typename c_vals_it, + typename comp_t, typename context_t> + void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count, + b_keys_it b_keys, b_vals_it b_vals, int b_count, + c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context) + { + #ifdef CUDA_ON_CPU + + int a_it = 0; + int b_it = 0; + int c_it = 0; + + while (a_it < a_count || b_it < b_count) + { + if (a_it < a_count) + { + if (b_it < b_count) + { + if (comp(b_keys[b_it],a_keys[a_it])) + { + c_keys[c_it] = b_keys[b_it]; + c_vals[c_it] = b_vals[b_it]; + 
c_it++; + b_it++; + } + else + { + c_keys[c_it] = a_keys[a_it]; + c_vals[c_it] = a_vals[a_it]; + c_it++; + a_it++; + } + } + else + { + c_keys[c_it] = a_keys[a_it]; + c_vals[c_it] = a_vals[a_it]; + c_it++; + a_it++; + } + } + else + { + c_keys[c_it] = b_keys[b_it]; + c_vals[c_it] = b_vals[b_it]; + c_it++; + b_it++; + } + } + + #else + + #ifdef __HIP__ + + thrust::merge_by_key(thrust::device, a_keys,a_keys + a_count, + b_keys,b_keys + b_count, + a_vals,b_vals, + c_keys,c_vals,comp); + + #else + + mgpu::merge(a_keys,a_vals,a_count,b_keys,b_vals,b_count,c_keys,c_vals,comp,context); + + #endif + + #endif + } + } + + #endif /* __NVCC__ */ + + #endif /* SCAN_OFP_HPP_ */ + \ No newline at end of file diff --git a/src/util/cuda/modern_gpu_tests.cu b/src/util/cuda/modern_gpu_tests.cu index 167a7103c6afac34e675b9f40450375facb633df..383d46ab7bb6ea196a498d6bc1ab27421bfa599c 100644 --- a/src/util/cuda/modern_gpu_tests.cu +++ b/src/util/cuda/modern_gpu_tests.cu @@ -7,6 +7,7 @@ #ifndef CUDA_ON_CPU +#ifndef __HIP__ #include "util/cuda/moderngpu/kernel_load_balance.hxx" #include "util/cuda/moderngpu/kernel_mergesort.hxx" #include "util/cuda/moderngpu/kernel_reduce.hxx" @@ -15,7 +16,7 @@ BOOST_AUTO_TEST_SUITE( modern_gpu_tests ) -BOOST_AUTO_TEST_CASE( modern_gpu_transform_lbs ) +BOOST_AUTO_TEST_CASE( modern_gpu_loadbalance_lbs ) { std::cout << "Test modern gpu test tansform_lbs" << "\n"; @@ -217,3 +218,5 @@ BOOST_AUTO_TEST_SUITE_END() #endif +#endif + diff --git a/src/util/cuda/moderngpu/context.hxx b/src/util/cuda/moderngpu/context.hxx index 6df90491e88280d7a309ed72820c052bbd6bab1c..93af53d9ac79ac8ee66ef08144d2e1bfc551fa1a 100644 --- a/src/util/cuda/moderngpu/context.hxx +++ b/src/util/cuda/moderngpu/context.hxx @@ -86,7 +86,7 @@ protected: template<int dummy_arg = 0> void init() { cudaFuncAttributes attr; - cudaError_t result = cudaFuncGetAttributes(&attr, dummy_k<0>); + cudaError_t result = cudaFuncGetAttributes(&attr, (void *)dummy_k<0>); if(cudaSuccess != result) throw 
cuda_exception_t(result); _ptx_version = attr.ptxVersion; diff --git a/src/util/cuda/moderngpu/cta_load_balance.hxx b/src/util/cuda/moderngpu/cta_load_balance.hxx index 4629b0e4cf7398d944ed8bd0623b200e9cfe2a8a..c397b7893d1c8435427477f647419f7c72f2e37a 100644 --- a/src/util/cuda/moderngpu/cta_load_balance.hxx +++ b/src/util/cuda/moderngpu/cta_load_balance.hxx @@ -135,7 +135,7 @@ struct cta_load_balance_t { int* a_shared = storage.indices - range.a_begin; int* b_shared = storage.indices + range.a_count(); - lbs_placement_t placement = cta_load_balance_place<nt, vt>(tid, range, + lbs_placement_t placement = cta_load_balance_place<nt, vt>(tid, range, count, segments, num_segments, b_shared); // Adjust the b pointer by the loaded b_begin. This lets us dereference it diff --git a/src/util/cuda/moderngpu/cta_segscan.hxx b/src/util/cuda/moderngpu/cta_segscan.hxx index d2b781ee09ef0a4eb76239f4a12fe05661c9cc10..e8738c5c3a8c4972537266665635c01fd2af611e 100644 --- a/src/util/cuda/moderngpu/cta_segscan.hxx +++ b/src/util/cuda/moderngpu/cta_segscan.hxx @@ -30,18 +30,31 @@ struct cta_segscan_t { int warp_mask = 0xffffffff>> (31 - lane); // inclusive search. int cta_mask = 0x7fffffff>> (31 - lane); // exclusive search. + #ifdef __HIP__ + // Build a head flag bitfield and store it into shared memory. + long int warp_bits = ballot(has_head_flag); + storage.delta[warp] = (int)warp_bits; + #else // Build a head flag bitfield and store it into shared memory. int warp_bits = ballot(has_head_flag); storage.delta[warp] = warp_bits; + #endif + + __syncthreads(); if(tid < num_warps) { + #ifdef __HIP__ + int cta_bits = (int)ballot(0 != storage.delta[tid]); + #else unsigned mask = __activemask(); int cta_bits = ballot(0 != storage.delta[tid], mask); + #endif int warp_segment = 31 - clz(cta_mask & cta_bits); int start = (-1 != warp_segment) ? 
(31 - clz(storage.delta[warp_segment]) + 32 * warp_segment) : 0; storage.delta[num_warps + tid] = start; + } __syncthreads(); diff --git a/src/util/cuda/moderngpu/intrinsics.hxx b/src/util/cuda/moderngpu/intrinsics.hxx index efe881a1d25e0709a9fd973d0a4cb596e0518cf7..39af8b6e6cf7702d6b81f373d49c9025f34da27f 100644 --- a/src/util/cuda/moderngpu/intrinsics.hxx +++ b/src/util/cuda/moderngpu/intrinsics.hxx @@ -2,7 +2,7 @@ #include "operators.hxx" -#ifndef __CUDACC__ +#if !defined(__CUDACC__) && !defined(__HIP__) #error "You must compile this file with nvcc. You must." #endif diff --git a/src/util/cuda/moderngpu/meta.hxx b/src/util/cuda/moderngpu/meta.hxx index 6fd8ccec007356fc1dd02759878fba46cc2f7124..369c303e4f79438b06ab8ff88a453aa66af64ffa 100644 --- a/src/util/cuda/moderngpu/meta.hxx +++ b/src/util/cuda/moderngpu/meta.hxx @@ -14,7 +14,7 @@ #define MGPU_HOST_DEVICE __forceinline__ __device__ __host__ #endif -#ifndef MGPU_DEVICE +#ifndef MGPU_DEVICE #define MGPU_DEVICE __device__ #endif diff --git a/src/util/cuda/moderngpu/transform.hxx b/src/util/cuda/moderngpu/transform.hxx index 617394d03af26eb10d5a10ecbd4d3c5858aa1519..99295a818ef7ff9b811402505a07e13bbd094358 100644 --- a/src/util/cuda/moderngpu/transform.hxx +++ b/src/util/cuda/moderngpu/transform.hxx @@ -1,6 +1,7 @@ // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com #pragma once + #include <random> #include <algorithm> #include <cuda.h> @@ -19,8 +20,10 @@ void cta_launch(func_t f, int num_ctas, context_t& context, args_t... args) { grid_dim = dim3(256, div_up(num_ctas, 256)); if(num_ctas) + { launch_box_cta_k<launch_box, func_t> - <<<grid_dim, cta.nt, 0, context.stream()>>>(f, num_ctas, args...); + <<<grid_dim, cta.nt,0,context.stream()>>>(f, num_ctas, args...); + } } template<int nt, int vt = 1, typename func_t, typename... 
args_t> diff --git a/src/util/cuda/ofp_context.hxx b/src/util/cuda/ofp_context.hxx index 47d5327e11be0adc0067f49f04efb8a2482dff09..70c4ed9e48b09305680aaaf69ce88df3aa564d0b 100644 --- a/src/util/cuda/ofp_context.hxx +++ b/src/util/cuda/ofp_context.hxx @@ -113,9 +113,11 @@ namespace mgpu #ifdef CUDA_GPU - #ifdef __NVCC__ - + #ifdef __NVCC__ #include "util/cuda/moderngpu/context.hxx" + #else + #include "util/cuda/moderngpu/context_reduced.hxx" + #endif namespace mgpu { @@ -151,9 +153,14 @@ namespace mgpu void init(int dev_num, gpu_context_opt opt) { cudaFuncAttributes attr; - cudaError_t result = cudaFuncGetAttributes(&attr, dummy_k<0>); + #ifdef __NVCC__ + cudaError_t result = cudaFuncGetAttributes(&attr, (void *)dummy_k<0>); if(cudaSuccess != result) throw cuda_exception_t(result); _ptx_version = attr.ptxVersion; + #else + _ptx_version = 60; + //std::cout << __FILE__ << ":" << __LINE__ << " Warning initialization of GPU context has been done from a standard Cpp file, rather than a CUDA or HIP file" << std::endl; + #endif int num_dev; cudaGetDeviceCount(&num_dev); @@ -286,163 +293,6 @@ namespace mgpu } - #else - - #include "util/cuda/moderngpu/context_reduced.hxx" - - namespace mgpu - { - enum gpu_context_opt - { - no_print_props,//!< no_print_props - print_props, //!< print_props - dummy //!< dummy - }; - - - //////////////////////////////////////////////////////////////////////////////// - // standard_context_t is a trivial implementation of context_t. Users can - // derive this type to provide a custom allocator. - - class ofp_context_t : public context_t - { - protected: - cudaDeviceProp _props; - int _ptx_version; - cudaStream_t _stream; - - cudaEvent_t _timer[2]; - cudaEvent_t _event; - - openfpm::vector<aggregate<unsigned char>> tmem; - - // Making this a template argument means we won't generate an instance - // of dummy_k for each translation unit. 
- template<int dummy_arg = 0> - void init(int dev_num, gpu_context_opt opt) - { - cudaFuncAttributes attr; - - _ptx_version = 0; - - int num_dev; - cudaGetDeviceCount(&num_dev); - - if (num_dev == 0) {return;} - - if (opt != gpu_context_opt::dummy) - { - cudaSetDevice(dev_num % num_dev); - } - - int ord; - cudaGetDevice(&ord); - cudaGetDeviceProperties(&_props, ord); - - cudaEventCreate(&_timer[0]); - cudaEventCreate(&_timer[1]); - cudaEventCreate(&_event); - } - - public: - - /*! \brief gpu context constructor - * - * \param opt options for this gpu context - * - */ - ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props , int dev_num = 0, cudaStream_t stream_ = 0) - :context_t(), _stream(stream_) - { - init(dev_num,opt); - if(opt == gpu_context_opt::print_props) - { - printf("%s\n", device_prop_string(_props).c_str()); - } - } - - ~ofp_context_t() - { - cudaEventDestroy(_timer[0]); - cudaEventDestroy(_timer[1]); - cudaEventDestroy(_event); - } - - virtual const cudaDeviceProp& props() const - { - return _props; - } - - virtual int ptx_version() const - { - std::cout << __FILE__ << ":" << __LINE__ << " error to use this function you must compile the class ofp_context_t with NVCC" << std::endl; - return 0; - } - - virtual cudaStream_t stream() { return _stream; } - - // Alloc GPU memory. - virtual void* alloc(size_t size, memory_space_t space) - { - void* p = nullptr; - if(size) - { - cudaError_t result = (memory_space_device == space) ?cudaMalloc(&p, size) : cudaMallocHost(&p, size); - if(cudaSuccess != result) throw cuda_exception_t(result); - } - return p; - } - - virtual void free(void* p, memory_space_t space) - { - if(p) - { - cudaError_t result = (memory_space_device == space) ? cudaFree(p) : cudaFreeHost(p); - if(cudaSuccess != result) throw cuda_exception_t(result); - } - } - - virtual void synchronize() - { - cudaError_t result = _stream ? 
- cudaStreamSynchronize(_stream) : - cudaDeviceSynchronize(); - if(cudaSuccess != result) throw cuda_exception_t(result); - } - - virtual cudaEvent_t event() - { - return _event; - } - - virtual void timer_begin() - { - cudaEventRecord(_timer[0], _stream); - } - - virtual double timer_end() - { - cudaEventRecord(_timer[1], _stream); - cudaEventSynchronize(_timer[1]); - float ms; - cudaEventElapsedTime(&ms, _timer[0], _timer[1]); - return ms / 1.0e3; - } - - virtual int getDevice() - { - int dev = 0; - - cudaGetDevice(&dev); - - return dev; - } - }; - - } - - #endif - #else namespace mgpu diff --git a/src/util/cuda/reduce_ofp.cuh b/src/util/cuda/reduce_ofp.cuh index 7c4ecb0f7fb4375f3836c06274bf24af41302229..a9f2cf2ef5d57154019ffb9ed7f9c0b2d96b45bd 100644 --- a/src/util/cuda/reduce_ofp.cuh +++ b/src/util/cuda/reduce_ofp.cuh @@ -15,7 +15,11 @@ #if CUDART_VERSION >= 11000 #ifndef CUDA_ON_CPU // Here we have for sure CUDA >= 11 - #include "cub/cub.cuh" + #ifdef __HIP__ + #include "hipcub/hipcub.hpp" + #else + #include "cub/cub.cuh" + #endif #ifndef REDUCE_WITH_CUB #define REDUCE_WITH_CUB #endif @@ -44,9 +48,11 @@ namespace openfpm #else #ifdef REDUCE_WITH_CUB + #ifdef __HIP__ + void *d_temp_storage = NULL; size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,input, + hipcub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,input, output, count, op, @@ -56,11 +62,32 @@ namespace openfpm temporal.resize(temp_storage_bytes); // Run - cub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, + hipcub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, output, count, op, false); + #else + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,input, + output, + count, + op, + false); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + // Run + 
cub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, + output, + count, + op, + false); + + #endif #else mgpu::reduce(input,count,output,op,context); diff --git a/src/util/cuda/scan_cuda.cuh b/src/util/cuda/scan_cuda.cuh deleted file mode 100644 index f80ad87c0b3ebfd9656f8a1ea569d488672232cd..0000000000000000000000000000000000000000 --- a/src/util/cuda/scan_cuda.cuh +++ /dev/null @@ -1,532 +0,0 @@ -/* - * scan_cuda.cuh - * - * Created on: Jun 28, 2018 - * Author: i-bird - */ - -#ifndef SCAN_CUDA_CUH_ -#define SCAN_CUDA_CUH_ - -#include "Vector/map_vector.hpp" - -#ifdef __NVCC__ - -////////////////// Ratio reduction /////////////////////////////////////////////////////////////// - -/*template<typename cnt_type, typename ids_type, unsigned int rt = sizeof(cnt_type)/sizeof(ids_type)> -struct ratio_reduction -{ - static inline __device__ cnt_type reduction(cnt_type * rd) - { - return -1; - } -}; - -template<typename cnt_type, typename ids_type> -struct ratio_reduction<cnt_type,ids_type,1> -{ - static inline __device__ unsigned int reduction(cnt_type * val) - { - return val[0] + val[1] + val[2] + val[3]; - } -}; - -template<typename cnt_type, typename ids_type> -struct ratio_reduction<cnt_type,ids_type,2> -{ - static inline __device__ unsigned int reduction(cnt_type * val) - { - val[0] = ((val[0] & 0xFFFF0000) >> 16) + (val[0] & 0xFFFF); - val[1] = ((val[1] & 0xFFFF0000) >> 16) + (val[1] & 0xFFFF); - val[2] = ((val[2] & 0xFFFF0000) >> 16) + (val[2] & 0xFFFF); - val[3] = ((val[3] & 0xFFFF0000) >> 16) + (val[3] & 0xFFFF); - return val[0] + val[1] + val[2] + val[3]; - } -}; - -template<typename cnt_type, typename ids_type> -struct ratio_reduction<cnt_type,ids_type,4> -{ - static inline __device__ unsigned int reduction(cnt_type * val) - { - val[0] = ((val[0] & 0xFF000000) >> 24) + ((val[0] & 0xFF0000) >> 16) + ((val[0] & 0xFF00) >> 8) + (val[0] & 0xFF); - val[1] = ((val[1] & 0xFF000000) >> 24) + ((val[1] & 0xFF0000) >> 16) + ((val[1] 
& 0xFF00) >> 8) + (val[1] & 0xFF); - val[2] = ((val[2] & 0xFF000000) >> 24) + ((val[2] & 0xFF0000) >> 16) + ((val[2] & 0xFF00) >> 8) + (val[2] & 0xFF); - val[3] = ((val[3] & 0xFF000000) >> 24) + ((val[3] & 0xFF0000) >> 16) + ((val[3] & 0xFF00) >> 8) + (val[3] & 0xFF); - return val[0] + val[1] + val[2] + val[3]; - } -}; - - -///////////////////////// Ratio extends //////////////////////////////////////////////////////////////////// - - -template<typename T> -struct to_four_type -{ - typedef T type; -}; - -template<> -struct to_four_type<unsigned int> -{ - typedef uint4 type; -}; - -template<> -struct to_four_type<int> -{ - typedef int4 type; -}; - -template<typename T> -struct make4 -{ - __device__ inline static T make() - { - return 0; - } -}; - -template<> -struct make4<uint4> -{ - __device__ inline static uint4 make() - { - return make_uint4(0,0,0,0); - } -}; - -template<> -struct make4<int4> -{ - __device__ inline static int4 make() - { - return make_int4(0,0,0,0); - } -}; - -template<typename cnt_type, typename ids_type, unsigned int rt = sizeof(cnt_type)/sizeof(ids_type)> -struct ratio_extend -{ - static inline __device__ cnt_type reduction(ids_type * rd) - { - return -1; - } -}; - -template<typename cnt_type, typename ids_type> -struct ratio_extend<cnt_type,ids_type,1> -{ - typedef boost::mpl::int_<1> size; - - typedef cnt_type cnt_type_; - typedef typename to_four_type<cnt_type>::type cnt_type4; - - static inline __device__ void extend(cnt_type4 * val) - { - unsigned int tmp = val[0].w; - val[0].w = val[0].z; - val[0].z = val[0].y; - val[0].y = val[0].x; - val[0].x = tmp; - } - - static inline __device__ unsigned int reduce(cnt_type4 * tu4, cnt_type4 * val) - { - tu4->x = val[0].x + val[0].y + val[0].z + val[0].w; - - return tu4->x; - } - - static inline __device__ void scan(unsigned int woff_w, cnt_type4 tu4, cnt_type4 * val) - { - unsigned int lps; - lps = woff_w + tu4.w; - - val[0].x = lps; - - val[0].y += val[0].x; - - val[0].z += val[0].y; - - val[0].w 
+= val[0].z; - } - - static inline __device__ cnt_type4 zero() - { - return make4<cnt_type4>::make(); - } -}; - - - -template<typename cnt_type, typename ids_type> -struct ratio_extend<cnt_type,ids_type,2> -{ - typedef boost::mpl::int_<2> size; - - typedef cnt_type cnt_type_; - typedef typename to_four_type<cnt_type>::type cnt_type4; - - static inline __device__ void extend(cnt_type4 * val) - { - val[1].w = (val[0].w & 0x0000FFFF); - val[1].z = (val[0].z & 0xFFFF0000) >> 16; - - val[1].y = (val[0].z & 0x0000FFFF); - val[1].x = (val[0].w & 0xFFFF0000) >> 16; - - val[0].w = (val[0].y & 0x0000FFFF); - val[0].z = (val[0].x & 0xFFFF0000) >> 16; - - short int tmp = (val[0].y & 0xFFFF0000) >> 16; - val[0].y = (val[0].x & 0x0000FFFF); - val[0].x = tmp; - } - - static inline __device__ unsigned int reduce(cnt_type4 * tu4, cnt_type4 * val) - { - tu4->x = val[0].x + val[0].y + val[0].z + val[0].w; - tu4->y = val[1].x + val[1].y + val[1].z + val[1].w; - - return tu4->x + tu4->y; - } - - static inline __device__ void scan(unsigned int woff_w, cnt_type4 tu4, cnt_type4 * val) - { - uint2 lps; - lps.x = woff_w + tu4.w; - lps.y = lps.x + tu4.x; - - val[0].x = lps.x; - val[1].x = lps.y; - - val[0].y += val[0].x; - val[1].y += val[1].x; - - val[0].z += val[0].y; - val[1].z += val[1].y; - - val[0].w += val[0].z; - val[1].w += val[1].z; - } - - static inline __device__ cnt_type4 zero() - { - return make4<cnt_type4>::make(); - } -}; - -template<typename cnt_type, typename ids_type> -struct ratio_extend<cnt_type,ids_type,4> -{ - typedef boost::mpl::int_<4> size; - - typedef cnt_type cnt_type_; - typedef typename to_four_type<cnt_type>::type cnt_type4; - - static inline __device__ void extend(cnt_type4 * val) - { - val[3].y = (val[0].w & 0x000000FF); - val[3].z = (val[0].w & 0x0000FF00) >> 8; - val[3].w = (val[0].w & 0x00FF0000) >> 16; - val[3].x = (val[0].w & 0xFF000000) >> 24; - - val[2].y = (val[0].z & 0x000000FF); - val[2].z = (val[0].z & 0x0000FF00) >> 8; - val[2].w = (val[0].z & 
0x00FF0000) >> 16; - val[2].x = (val[0].z & 0xFF000000) >> 24; - - val[1].y = (val[0].y & 0x000000FF); - val[1].z = (val[0].y & 0x0000FF00) >> 8; - val[1].w = (val[0].y & 0x00FF0000) >> 16; - val[1].x = (val[0].y & 0xFF000000) >> 24; - - val[0].y = (val[0].x & 0x000000FF); - val[0].z = (val[0].x & 0x0000FF00) >> 8; - val[0].w = (val[0].x & 0x00FF0000) >> 16; - val[0].x = (val[0].x & 0xFF000000) >> 24; - } - - static inline __device__ unsigned int reduce(cnt_type4 * tu4, cnt_type4 * val) - { - tu4->x = val[0].x + val[0].y + val[0].z + val[0].w; - tu4->y = val[1].x + val[1].y + val[1].z + val[1].w; - tu4->z = val[2].x + val[2].y + val[2].z + val[2].w; - tu4->w = val[3].x + val[3].y + val[3].z + val[3].w; - - return tu4->x + tu4->y + tu4->z + tu4->w; - } - - static inline __device__ void scan(unsigned int woff_w, cnt_type4 tu4, cnt_type4 * val) - { - uint4 lps; - lps.x = woff_w + tu4.w; - lps.y = lps.x + tu4.x; - lps.z = lps.y + tu4.y; - lps.w = lps.z + tu4.z; - - val[0].x = lps.x; - val[1].x = lps.y; - val[2].x = lps.z; - val[3].x = lps.w; - - val[0].y += val[0].x; - val[1].y += val[1].x; - val[2].y += val[2].x; - val[3].y += val[3].x; - - val[0].z += val[0].y; - val[1].z += val[1].y; - val[2].z += val[2].y; - val[3].z += val[3].y; - - val[0].w += val[0].z; - val[1].w += val[1].z; - val[2].w += val[2].z; - val[3].w += val[3].z; - } - - static inline __device__ cnt_type4 zero() - { - return make4<cnt_type4>::make(); - } -};*/ - -////////////////////////////////////////////////////////////////////////////////////////////////////////// - -/*template<typename cnt_type, typename ids_type> -__global__ void compress4(const cnt_type n_cell, const cnt_type *const count, ids_type *const compressed) -{ - const int gid = 4*(threadIdx.x + blockDim.x * blockIdx.x); - - if (gid >= n_cell) - {return;} - - compressed[gid] = count[gid]; - compressed[gid+1] = count[gid+1]; - compressed[gid+2] = count[gid+2]; - compressed[gid+3] = count[gid+3]; -} - -template <int NWARP, typename 
cnt_type, typename ids_type, typename reduction> -__global__ void breduce(int n, const cnt_type *vin, cnt_type *vout) -{ - const int wid = threadIdx.x/32; - const int lid = threadIdx.x%32; - const int tid = 4*(blockDim.x*blockIdx.x + threadIdx.x); - cnt_type val[4] = {0,0,0,0}; - - __shared__ unsigned int shtmp[NWARP]; - - if (tid < n) - { - val[0] = vin[0+tid]; - val[1] = vin[1+tid]; - val[2] = vin[2+tid]; - val[3] = vin[3+tid]; - } - - val[0] = reduction::reduction(val); - -#pragma unroll - for(int i = 16; i > 0; i >>= 1) - {val[0] += __shfl_down_sync(0xFFFFFFFF,(int)val[0],i);} - - if (0 == lid) - {shtmp[wid] = val[0];} - - __syncthreads(); - if (0 == wid) - { - val[0] = (lid < NWARP) ? shtmp[lid] : 0; - -#pragma unroll - for(int i = 16; i > 0; i >>= 1) - {val[0] += __shfl_down_sync(0xFFFFFFFF,(int)val[0], i);} - } - - if (0 == threadIdx.x) - { - vout[blockIdx.x] = val[0]; - } -} - -template <int BDIM, typename cnt_type> -__global__ void bexscan(int n, cnt_type *v) -{ - extern __shared__ unsigned int shtmp[]; - - for(int i = threadIdx.x; i < n; i += BDIM) - {shtmp[i] = v[i];} - - int i = threadIdx.x; - __syncthreads(); - for(; n >= BDIM; i += BDIM, n -= BDIM) - { - __syncthreads(); - if (i > 0 && 0 == threadIdx.x) - {shtmp[i] += shtmp[i-1];} - - unsigned int a = 0; - -#pragma unroll - for(int j = 1; j < BDIM; j <<= 1) - { - a = 0; - - __syncthreads(); - if (threadIdx.x >= j) {a = shtmp[i] + shtmp[i-j];} - - __syncthreads(); - if (threadIdx.x >= j) {shtmp[i] = a;} - } - v[i] = shtmp[i]; - } - if (threadIdx.x < n) - { - __syncthreads(); - if (i > 0 && 0 == threadIdx.x) - {shtmp[i] += shtmp[i-1];} - - unsigned int a = 0; - for(int j = 1; j < BDIM; j <<= 1) - { - a = 0; - - __syncthreads(); - if (threadIdx.x >= j) a = shtmp[i] + shtmp[i-j]; - - __syncthreads(); - if (threadIdx.x >= j) shtmp[i] = a; - } - v[i] = shtmp[i]; - } -} - -template <int NWARP, typename extend> -__global__ void gexscan(int n, - const typename extend::cnt_type4 *vin, - typename 
extend::cnt_type_ *offs, - typename extend::cnt_type4 *vout) -{ - const int wid = threadIdx.x/32; - const int lid = threadIdx.x%32; - const int tid = blockDim.x*blockIdx.x + threadIdx.x; - typename extend::cnt_type4 val[extend::size::value]; - - __shared__ unsigned int woff[NWARP]; - - if (tid < n) val[0] = vin[tid]; - else val[0] = extend::zero(); - - extend::extend(val); - - typename extend::cnt_type4 tu4; - typename extend::cnt_type_ tmp = extend::reduce(&tu4,val); - - tu4.w = tmp; -#pragma unroll - for(int i = 1; i < 32; i <<= 1) - {tu4.w += (lid >= i)*__shfl_up_sync(0xFFFFFFFF,(int)tu4.w, i);} - - if (lid == 31) - { - if (wid < NWARP-1) woff[wid+1] = tu4.w; - else woff[0] = (blockIdx.x > 0) ? offs[blockIdx.x-1] : 0; - } - - tu4.w -= tmp; - - __syncthreads(); - - if (0 == wid) - { - tmp = (lid < NWARP) ? woff[lid] : 0; -#pragma unroll - for(int i = 1; i < NWARP; i <<= 1) - {tmp += (lid >= i)*__shfl_up_sync(0xFFFFFFFF,(int)tmp, i);} - - if (lid < NWARP) woff[lid] = tmp; - } - __syncthreads(); - - if (tid >= n) return; - - extend::scan(woff[wid],tu4,val); - - vout += tid*extend::size::value; - -#pragma unroll - for(int i = 0; i < extend::size::value; i++) - {vout[i] = val[i];} -}*/ - -/*! 
\brief Scan is a class because it use internally temporary buffers that are heavy to reconstruct - * - * - * - */ -/*template<typename cnt_type, typename ids_type> -class scan -{ - openfpm::vector<aggregate<cnt_type>,CudaMemory,typename memory_traits_inte<aggregate<cnt_type>>::type,memory_traits_inte> red; - openfpm::vector<aggregate<ids_type>,CudaMemory,typename memory_traits_inte<aggregate<ids_type>>::type,memory_traits_inte> compressed; - -public: - - void scan_(openfpm::vector<aggregate<cnt_type>,CudaMemory,typename memory_traits_inte<aggregate<cnt_type>>::type,memory_traits_inte> & cl_n, - openfpm::vector<aggregate<cnt_type>,CudaMemory,typename memory_traits_inte<aggregate<cnt_type>>::type,memory_traits_inte> & cl_n_scan) - { - constexpr int THREADS = 128; - constexpr int ratio = 4*sizeof(cnt_type)/sizeof(ids_type); - - int nblocks = (cl_n.size() + THREADS * ratio - 1 ) / (THREADS * ratio); - red.resize(nblocks); - - auto ite = cl_n.getGPUIterator(); - - compressed.resize(cl_n.size()); - compress4<cnt_type,ids_type><<<ite.wthr,ite.thr>>>((cnt_type)cl_n.size(), - static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()), - static_cast<ids_type *>(compressed.template getDeviceBuffer<0>())); - - breduce<THREADS/32,cnt_type,ids_type,ratio_reduction<cnt_type,ids_type>><<<nblocks, THREADS >>>(cl_n.size() / ratio * 4, - (cnt_type *)compressed.template getDeviceBuffer<0>(), - static_cast<cnt_type *>(red.template getDeviceBuffer<0>())); - - - bexscan<THREADS,cnt_type><<<1, THREADS, nblocks*sizeof(uint)>>>(nblocks, - static_cast<cnt_type *>(red.template getDeviceBuffer<0>())); - - size_t raw_size = cl_n.size(); - - // resize to be multiple of 16 - - size_t ml = ((raw_size + ratio - 1) / ratio) *ratio; - cl_n_scan.resize(ml); - - gexscan<THREADS/32,ratio_extend<cnt_type,ids_type>> <<< nblocks, THREADS >>>((cl_n.size() + ratio - 1 ) / ratio, - static_cast<typename ratio_extend<cnt_type,ids_type>::cnt_type4 *>(compressed.template getDeviceBuffer<0>()), - 
static_cast<cnt_type *>(red.template getDeviceBuffer<0>()), - static_cast<typename ratio_extend<cnt_type,ids_type>::cnt_type4 *>(cl_n_scan.template getDeviceBuffer<0>())); - - cl_n_scan.resize(raw_size); - } - -};*/ - -#else - -// In case we do not have NVCC we create a stub - -template<typename cnt_type, typename ids_type> -class scan -{ -}; - -#endif - -#endif /* SCAN_CUDA_CUH_ */ diff --git a/src/util/cuda/scan_ofp.cuh b/src/util/cuda/scan_ofp.cuh index 7d778b2a1d6e3415383622e3c5ce3e3149c02e7e..2ef79fdb024018ed2ea5809044fe65f0087590c0 100644 --- a/src/util/cuda/scan_ofp.cuh +++ b/src/util/cuda/scan_ofp.cuh @@ -15,7 +15,11 @@ #if CUDART_VERSION >= 11000 #ifndef CUDA_ON_CPU // Here we have for sure CUDA >= 11 - #include "cub/cub.cuh" + #ifdef __HIP__ + #include "hipcub/hipcub.hpp" + #else + #include "cub/cub.cuh" + #endif #ifndef SCAN_WITH_CUB #define SCAN_WITH_CUB #endif @@ -25,6 +29,7 @@ #include "cub_old/cub.cuh" #include "util/cuda/moderngpu/kernel_scan.hxx" #endif + #include "util/cuda/ofp_context.hxx" namespace openfpm @@ -36,32 +41,53 @@ namespace openfpm if (count == 0) {return;} - typename std::remove_reference<decltype(input[0])>::type scan_a = 0; - // No particular speed-up from omp amd simd -// #pragma omp parallel for simd reduction(inscan, +:scan_a) - for(int i = 0; i < count; i++) + auto prec = input[0]; + output[0] = 0; + for (int i = 1 ; i < count ; i++) { - output[i] = scan_a; -// #pragma omp scan exclusive(scan_a) - scan_a += input[i]; - } + auto next = prec + output[i-1]; + prec = input[i]; + output[i] = next; + } #else #ifdef SCAN_WITH_CUB - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input, - output, - count); + #ifdef __HIP__ + + if (count == 0) {return;} + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input, + output, + count); + + auto & temporal = 
context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + // Run + hipcub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, + output, + count); + + #else + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input, + output, + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); + // Run + cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, + output, + count); - // Run - cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, - output, - count); + #endif #else mgpu::scan(input,count,output,context); diff --git a/src/util/cuda/scan_sort_cuda_unit_tests.cu b/src/util/cuda/scan_sort_cuda_unit_tests.cu index 65c4c343a6afacb22a0de4b3efbdd3c641213e7c..7314e87485d9665532fe5ee1cb642aa0905fd922 100644 --- a/src/util/cuda/scan_sort_cuda_unit_tests.cu +++ b/src/util/cuda/scan_sort_cuda_unit_tests.cu @@ -7,12 +7,12 @@ #include "util/cuda_util.hpp" #include "Vector/map_vector.hpp" -#include "scan_cuda.cuh" #define SORT_WITH_CUB #include "sort_ofp.cuh" #include "scan_ofp.cuh" +#include "segreduce_ofp.cuh" BOOST_AUTO_TEST_SUITE( scan_tests ) @@ -109,4 +109,81 @@ BOOST_AUTO_TEST_CASE( test_sort_cub_wrapper ) // Test the cell list } +BOOST_AUTO_TEST_CASE( test_seg_reduce_wrapper ) +{ + std::cout << "Test gpu segmented reduce" << "\n"; + + mgpu::ofp_context_t context; + + int count = 130; + + openfpm::vector_gpu<aggregate<int>> vgpu; + openfpm::vector_gpu<aggregate<int>> segment_offset; + openfpm::vector_gpu<aggregate<int>> output; + int init = 0; + + vgpu.resize(count); + + for (size_t i = 0 ; i < count ; i++) + { + vgpu.template get<0>(i) = ((float)rand() / (float)RAND_MAX) * 17; + } + + segment_offset.add(); + segment_offset.template get<0>(0) 
= 0; + size_t base = 0; + while (1) + { + int c = ((float)rand() / (float)RAND_MAX) * 17; + + if (c + base >= count) + {break;} + + segment_offset.add(); + segment_offset.template get<0>(segment_offset.size() - 1) = c + segment_offset.template get<0>(segment_offset.size() - 2); + + base += c; + } + + vgpu.hostToDevice<0>(); + + segment_offset.hostToDevice<0>(); + output.resize(segment_offset.size()); + + openfpm::segreduce((int *)vgpu.template getDeviceBuffer<0>(), vgpu.size(), + (int *)segment_offset.template getDeviceBuffer<0>(), segment_offset.size(), + (int *)output.template getDeviceBuffer<0>(), + mgpu::plus_t<int>(), init, context); + + + output.template deviceToHost<0>(); + + bool match = true; + size_t i = 0; + for ( ; i < segment_offset.size()-1 ; i++) + { + size_t red = 0; + for (size_t j = 0 ; j < segment_offset.template get<0>(i+1) - segment_offset.template get<0>(i) ; j++) + { + red += vgpu.template get<0>(segment_offset.template get<0>(i) + j); + } + match &= red == output.template get<0>(i); + } + + BOOST_REQUIRE_EQUAL(match,true); + + size_t red2 = 0; + for (size_t j = 0 ; j < vgpu.size() - segment_offset.template get<0>(i) ; j++) + { + red2 += vgpu.template get<0>(segment_offset.template get<0>(i) + j); + } + match &= red2 == output.template get<0>(i); + + BOOST_REQUIRE_EQUAL(match,true); + + std::cout << "End test modern gpu test reduce" << "\n"; + + // Test the cell list +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/src/util/cuda/segreduce_block_cuda.cuh b/src/util/cuda/segreduce_block_cuda.cuh deleted file mode 100644 index 9a1b29c585d933367fd01ae3b4a87f72434a48cf..0000000000000000000000000000000000000000 --- a/src/util/cuda/segreduce_block_cuda.cuh +++ /dev/null @@ -1,10 +0,0 @@ -// -// Created by tommaso on 20/05/19. 
-// - -#ifndef OPENFPM_PDATA_SEGREDUCE_BLOCK_CUDA_CUH -#define OPENFPM_PDATA_SEGREDUCE_BLOCK_CUDA_CUH - - - -#endif //OPENFPM_PDATA_SEGREDUCE_BLOCK_CUDA_CUH diff --git a/src/util/cuda/segreduce_ofp.cuh b/src/util/cuda/segreduce_ofp.cuh new file mode 100644 index 0000000000000000000000000000000000000000..9a0c3764108e2e0e009f3d2e2a62fb470b7a20df --- /dev/null +++ b/src/util/cuda/segreduce_ofp.cuh @@ -0,0 +1,157 @@ +/* + * segreduce_ofp.hpp + * + * Created on: May 15, 2019 + * Author: i-bird + */ + + #ifndef SEGREDUCE_OFP_HPP_ + #define SEGREDUCE_OFP_HPP_ + + #ifdef __NVCC__ + + #include "Vector/map_vector.hpp" + #include "util/cuda_launch.hpp" + #include "util/cuda/segreduce_ofp.cuh" + + #if CUDART_VERSION >= 11000 + #ifndef CUDA_ON_CPU + // Here we have for sure CUDA >= 11 + #ifdef __HIP__ + #undef __CUDACC__ + #undef __CUDA__ + #include <thrust/reduce.h> + #define __CUDACC__ + #define __CUDA__ + #else + #include "util/cuda/moderngpu/kernel_segreduce.hxx" + #endif + #endif + #else + #include "util/cuda/moderngpu/kernel_segreduce.hxx" + #endif + #include "util/cuda/ofp_context.hxx" + +template<typename segments_it, typename keys_type, typename output_it, typename seg_type, typename type_t> +__global__ void seg_to_keys(segments_it segs, keys_type keys, seg_type seg_out ,output_it output, int n_count, int num_segments,type_t init) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= num_segments) {return;} + + int s = segs[tid]; + int s_p1 = (tid == num_segments -1)?n_count:segs[tid+1]; + + int n_ele = s_p1 - s; + + seg_out.template get<1>(tid) = (s != s_p1); + output[tid] = init; + + for (int j = 0 ; j < n_ele ; j++) + { + keys.template get<0>(s + j) = tid; + } +} + +template<typename output_it, typename out_tmp_type ,typename segs_type> +__global__ void realign_output(output_it out, out_tmp_type out_tmp, segs_type segs, int num_segments) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= num_segments) {return;} + + int t = 
segs.template get<2>(tid); + int to_copy = segs.template get<1>(tid); + + auto op = out_tmp.template get<0>(t); + + if (to_copy == 1) + {out[tid] = op;} +} + + namespace openfpm + { + template<typename input_it, + typename segments_it, typename output_it, typename op_t, typename type_t> + void segreduce(input_it input, int count, segments_it segments, + int num_segments, output_it output, op_t op, type_t init, + mgpu::ofp_context_t & context) + { + #ifdef CUDA_ON_CPU + + int i = 0; + for ( ; i < num_segments - 1; i++) + { + int j = segments[i]; + output[i] = init; + if (j == segments[i+1]) {continue;} + output[i] = input[j]; + ++j; + for ( ; j < segments[i+1] ; j++) + { + output[i] = op(output[i],input[j]); + } + } + + // Last segment + int j = segments[i]; + if (j != count) + { + output[i] = input[j]; + ++j; + for ( ; j < count ; j++) + { + output[i] = op(output[i],input[j]); + } + } + + #else + + #ifdef __HIP__ + + typedef typename std::remove_pointer<segments_it>::type index_type; + typedef typename std::remove_pointer<output_it>::type out_type; + + openfpm::vector_gpu<aggregate<index_type>> keys; + keys.resize(count); + + openfpm::vector_gpu<aggregate<index_type,index_type,index_type>> segs_out; + segs_out.resize(num_segments); + + openfpm::vector_gpu<aggregate<out_type>> out_tmp; + out_tmp.resize(num_segments); + + grid_sm<1,void> g(num_segments); + + auto it = g.getGPUIterator(); + + CUDA_LAUNCH(seg_to_keys,it,segments,keys.toKernel(),segs_out.toKernel(),output,count,num_segments,init); + + openfpm::scan((index_type *)segs_out.template getDeviceBuffer<1>(),num_segments,(index_type *)segs_out.template getDeviceBuffer<2>(),context); + + thrust::pair<index_type *,out_type *> new_end; + new_end = thrust::reduce_by_key(thrust::device, (segments_it)keys.template getDeviceBuffer<0>(),((segments_it)keys.template getDeviceBuffer<0>()) + count, + input, + (segments_it)segs_out.template getDeviceBuffer<0>(), + (output_it)out_tmp.template getDeviceBuffer<0>(), + 
thrust::equal_to<int>(), + op); + + // .. Not so easy to emulate a segmented reduce we have to track the zeros segments and realign the output + + CUDA_LAUNCH(realign_output,it,output,out_tmp.toKernel(),segs_out.toKernel(),num_segments); + + #else + + mgpu::segreduce(input,count,segments,num_segments,output,op,init,context); + + #endif + + #endif + } + } + + #endif /* __NVCC__ */ + + #endif /* SCAN_OFP_HPP_ */ + \ No newline at end of file diff --git a/src/util/cuda/sort_ofp.cuh b/src/util/cuda/sort_ofp.cuh index 8a83b80a2eaf0b962472402414f0254c6d7ffe8c..e7939c60cf6f9d2372b728edda2708599e6fbc98 100644 --- a/src/util/cuda/sort_ofp.cuh +++ b/src/util/cuda/sort_ofp.cuh @@ -16,7 +16,11 @@ #if CUDART_VERSION >= 11000 #ifndef CUDA_ON_CPU // Here we have for sure CUDA >= 11 - #include "cub/cub.cuh" + #ifdef __HIP__ + #include "hipcub/hipcub.hpp" + #else + #include "cub/cub.cuh" + #endif #ifndef SORT_WITH_CUB #define SORT_WITH_CUB #endif @@ -125,6 +129,14 @@ struct key_val_it key_t * key; val_t * val; + + key_val_it & operator+=(int delta) + { + key += delta; + val += delta; + return *this; + } + bool operator==(const key_val_it & tmp) { return (key == tmp.key && val == tmp.val); @@ -193,7 +205,17 @@ struct key_val_it return key < tmp.key; } - key_val_it<key_t,val_t> & operator=(const key_val_it<key_t,val_t> & tmp) + bool operator>(const key_val_it & tmp) const + { + return key > tmp.key; + } + + bool operator>=(const key_val_it & tmp) const + { + return key >= tmp.key; + } + + key_val_it<key_t,val_t> & operator=(key_val_it<key_t,val_t> & tmp) { key = tmp.key; val = tmp.val; @@ -255,66 +277,134 @@ namespace openfpm #ifdef SORT_WITH_CUB - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - - auto & temporal2 = context.getTemporalCUB2(); - temporal2.resize(sizeof(key_t)*count); - - auto & temporal3 = context.getTemporalCUB3(); - temporal3.resize(sizeof(val_t)*count); - - if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true) - { - 
cub::DeviceRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); - - d_temp_storage = temporal.template getDeviceBuffer<0>(); - - // Run - cub::DeviceRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - } - else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true) - { - cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); - - d_temp_storage = temporal.template getDeviceBuffer<0>(); - - // Run - cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - } - - cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); - cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); + #ifdef __HIP__ + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + auto & temporal2 = context.getTemporalCUB2(); + temporal2.resize(sizeof(key_t)*count); + + auto & temporal3 = context.getTemporalCUB3(); + temporal3.resize(sizeof(val_t)*count); + + if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true) + { + hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t 
*)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true) + { + hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + + cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); + cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); + + + #else + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + auto & temporal2 = context.getTemporalCUB2(); + temporal2.resize(sizeof(key_t)*count); + + auto & temporal3 = context.getTemporalCUB3(); + temporal3.resize(sizeof(val_t)*count); + + if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true) + { + cub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = 
temporal.template getDeviceBuffer<0>(); + + // Run + cub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true) + { + cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + + cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); + cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); + + #endif #else mgpu::mergesort(keys_input,vals_input,count,comp,context); diff --git a/src/util/mathutil.hpp b/src/util/mathutil.hpp index f1adaa8877e4025fe6e3bccb62deb90d4116fc75..1b05785a90aeac2155b2bae58745656c1a23daf4 100644 --- a/src/util/mathutil.hpp +++ b/src/util/mathutil.hpp @@ -108,6 +108,24 @@ namespace openfpm return (exponent == 0) ? 
1 : (base * pow(base, exponent-1)); } + template<class T> + double intpowlog(const T x, unsigned const e) + { + if (e == 0) return 1.0; + if (e % 2 == 0) + { + double h = intpowlog(x, e / 2); + return h * h; + } + else + { + double h = intpowlog(x, e / 2); + return h * h * x; + } + } + + + /* \brief Return the positive modulo of a number * * # Example diff --git a/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp b/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp index 744efeacd19e7df2518ec103884e3d1552e6656c..2f412bb04aab1ddc448f47a1bfa90bcf97eb36cc 100644 --- a/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp +++ b/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp @@ -119,7 +119,7 @@ public: inline __host__ __device__ size_type size() const { return boost::mpl::at<vector,boost::mpl::int_<0>>::type::value; } size_type max_size() const { return num_elements(); } bool empty() const { return size() == 0; } - size_type num_dimensions() const { return NumDims; } + inline __device__ __host__ size_type num_dimensions() const { return NumDims; } inline __host__ __device__ const index* strides() const { return strides_; } size_type num_elements() const @@ -214,6 +214,7 @@ public: return *this; } + __device__ __host__ T* origin_mutable() const { return const_cast<T*>(this->base_); } __device__ __host__ T* origin() { return this->base_; } __device__ __host__ const T* origin() const { return this->base_; } diff --git a/src/util/object_creator.hpp b/src/util/object_creator.hpp index aafc92b9f13cf37576be9421b0f4825cd52bdace..6ecad2b5270be83b341d31849937f6afe085c7f7 100644 --- a/src/util/object_creator.hpp +++ b/src/util/object_creator.hpp @@ -300,6 +300,26 @@ struct object_creator_chunking_impl<v,vc,p1,prp...> typedef typename boost::mpl::push_front<vc_step, ele >::type type; }; +template <typename T> +struct object_creator_chunking_encapsulator +{ + typedef T type; +}; + +template <typename type> +struct 
get_base_type_for_object_creator_chunking +{ + typedef object_creator_chunking_encapsulator<typename type::value_type> ele; +}; + + +template <unsigned int N1, std::size_t N2, typename T> +struct get_base_type_for_object_creator_chunking< std::array<T, N2>[N1] > +{ + typedef object_creator_chunking_encapsulator<T[N1]> ele; +}; + + /*! \brief Implementation of object creator * * \tparam v original boost::fusion::vector @@ -312,10 +332,10 @@ struct object_creator_chunking_impl<v,vc,prp> typedef typename boost::remove_reference< typename boost::mpl::at< v,boost::mpl::int_<prp> >::type>::type ele_array; // Element without array - typedef typename ele_array::value_type ele; + typedef typename get_base_type_for_object_creator_chunking<ele_array>::ele ele; // push on the vector the element p1 - typedef typename boost::mpl::push_front<vc, ele >::type type; + typedef typename boost::mpl::push_front<vc, typename ele::type >::type type; }; diff --git a/src/util/object_s_di.hpp b/src/util/object_s_di.hpp index 905449e7a94d11f82ced620e71fec75b4db599c6..b26d391b1e0b331c0f551465f337eb16cd79e763 100644 --- a/src/util/object_s_di.hpp +++ b/src/util/object_s_di.hpp @@ -15,7 +15,28 @@ #include <boost/mpl/range_c.hpp> #include <boost/fusion/include/size.hpp> -template <typename> struct Debug; +template<typename T> +struct object_s_di_e_cnk_meta_copy_selector +{ + template<unsigned int T_value, typename v_prp, typename copy_rtype, typename Tsrc, typename Tdst> + static inline void copy(const Tsrc & src, Tdst & dst, int sub_id) + { + meta_copy<typename copy_rtype::value_type>::meta_copy_(src.template get<T_value>(),dst.template get<boost::mpl::at<v_prp,boost::mpl::int_<T_value>>::type::value>()[sub_id]); + } +}; + +template<unsigned int N1, typename T> +struct object_s_di_e_cnk_meta_copy_selector<T[N1]> +{ + template<unsigned int T_value, typename v_prp, typename copy_rtype, typename Tsrc, typename Tdst> + static inline void copy(const Tsrc & src, Tdst & dst, int sub_id) + { + for (int 
i = 0 ; i < N1 ; i++) + { + meta_copy<typename T::value_type>::meta_copy_(src.template get<T_value>()[i],dst.template get<boost::mpl::at<v_prp,boost::mpl::int_<T_value>>::type::value>()[i][sub_id]); + } + } +}; /*! \brief this class is a functor for "for_each" algorithm * @@ -60,9 +81,11 @@ struct object_s_di_e_cnk void operator()(T& t) { // Remove the reference from the type to copy - typedef typename boost::remove_reference<decltype(dst.template get<boost::mpl::at<v_prp,boost::mpl::int_<T::value>>::type::value>())>::type::value_type copy_rtype; + typedef typename boost::remove_reference<decltype(dst.template get<boost::mpl::at<v_prp,boost::mpl::int_<T::value>>::type::value>())>::type copy_rtype; + + //meta_copy<copy_rtype>::meta_copy_(src.template get<T::value>(),dst.template get<boost::mpl::at<v_prp,boost::mpl::int_<T::value>>::type::value>()[sub_id]); - meta_copy<copy_rtype>::meta_copy_(src.template get<T::value>(),dst.template get<boost::mpl::at<v_prp,boost::mpl::int_<T::value>>::type::value>()[sub_id]); + object_s_di_e_cnk_meta_copy_selector<copy_rtype>::template copy<T::value,v_prp,copy_rtype>(src,dst,sub_id); } }; diff --git a/src/util/object_si_d.hpp b/src/util/object_si_d.hpp index 17a4034b903034f3d3c417d6edd223601b880d4a..78490647478a3c1832e4b9580a0ad7148e027213 100644 --- a/src/util/object_si_d.hpp +++ b/src/util/object_si_d.hpp @@ -59,6 +59,29 @@ struct object_si_d_e } }; +template<typename T> +struct object_si_d_e_cnk_meta_copy_selector +{ + template<unsigned int T_value, typename v_prp, typename ctype, typename Tsrc, typename Tdst> + static inline void copy(const Tsrc & src, Tdst & dst, int sub_id) + { + meta_copy<ctype>::meta_copy_(src.template get<boost::mpl::at<v_prp,boost::mpl::int_<T_value>>::type::value>()[sub_id],dst.template get<T_value>()); + } +}; + +template<unsigned int N1, typename T> +struct object_si_d_e_cnk_meta_copy_selector<T[N1]> +{ + template<unsigned int T_value, typename v_prp, typename ctype, typename Tsrc, typename Tdst> + 
static inline void copy(const Tsrc & src, Tdst & dst, int sub_id) + { + for (int i = 0 ; i < N1 ; i++) + { + meta_copy<T>::meta_copy_(src.template get<boost::mpl::at<v_prp,boost::mpl::int_<T_value>>::type::value>()[i][sub_id],dst.template get<T_value>()[i]); + } + } +}; + /*! \brief this class is a functor for "for_each" algorithm * * This class is a functor for "for_each" algorithm. For each @@ -102,7 +125,7 @@ struct object_si_d_e_cnk { typedef typename boost::mpl::at<typename v_dst::type,typename boost::mpl::int_<T::value>>::type ctype; - meta_copy<ctype>::meta_copy_(src.template get<boost::mpl::at<v_prp,boost::mpl::int_<T::value>>::type::value>()[sub_id],dst.template get<T::value>()); + object_si_d_e_cnk_meta_copy_selector<ctype>::template copy<T::value,v_prp,ctype>(src,dst,sub_id); } }; diff --git a/src/util/performance/performance_util.hpp b/src/util/performance/performance_util.hpp index 4468ee45e7ad4208c169bfd2f44d4dfb7858f693..f90ec5c3d8f20f9218b0f5d32f760c24a37959b9 100644 --- a/src/util/performance/performance_util.hpp +++ b/src/util/performance/performance_util.hpp @@ -12,8 +12,9 @@ #include <boost/filesystem.hpp> #include <boost/property_tree/ptree.hpp> #include <boost/property_tree/xml_parser.hpp> +#include <fstream> -static void addUpdtateTime(GoogleChart & cg, int np) +static void addUpdateTime(GoogleChart & cg, int np, const std::string & base, const std::string & filename) { time_t t = time(0); // get time now struct tm * now = localtime( & t ); @@ -21,12 +22,36 @@ static void addUpdtateTime(GoogleChart & cg, int np) std::stringstream str; std::string commit; - commit = exec("git rev-parse HEAD"); + + std::system("git rev-parse HEAD >test.txt"); // execute the UNIX command "ls -l >test.txt" + std::ifstream("test.txt") >> commit; + + std::string prev_commit; + + std::system(std::string("cat commit_f_" + base + " > test.txt").c_str()); // execute the UNIX command "ls -l >test.txt" + std::ifstream("test.txt") >> prev_commit; str << "<h3>Updated: " << 
now->tm_mday << "/" << now->tm_mon + 1 << "/" << now->tm_year+1900 << " " << now->tm_hour << ":" << now->tm_min << ":" - << now->tm_sec << " commit: " << commit << " run with: " << np << " processes" << std::endl; + << now->tm_sec << " commit: " << commit << " run with: " << np << " processes<br>previous: <a href=\"" << filename << "_" << prev_commit << ".html\">here</a>" << "</h3>" << std::endl; + + cg.addHTML(str.str()); +} + +static void createCommitFile(const std::string & tmp) +{ + time_t t = time(0); // get time now + struct tm * now = localtime( & t ); + + std::stringstream str; + + std::string commit; + commit = exec("git rev-parse HEAD"); + std::cout << tmp << " " << commit << std::endl; + + std::ofstream f("commit_f_" + tmp); + f << commit << std::endl; - cg.addHTML(str.str()); + f.close(); } static inline void warning_set(int & warning_level, double mean, double mean_ref, double sigma) diff --git a/src/util/test/util_test.hpp b/src/util/test/util_test.hpp index 5960f6c5d945d11e1f777789af264714c43f1a74..842d24f234af58fbece45168a4f70e0da160eea5 100644 --- a/src/util/test/util_test.hpp +++ b/src/util/test/util_test.hpp @@ -586,9 +586,9 @@ BOOST_AUTO_TEST_CASE( check_templates_util_function ) } { - //! [Check has_value_type] + //! [Check has_value_type_ofp] - struct test_has_value_type + struct test_has_value_type_ofp { typedef int value_type; }; @@ -598,12 +598,12 @@ BOOST_AUTO_TEST_CASE( check_templates_util_function ) }; - int val = has_value_type_ofp<test_has_value_type>::value; + int val = has_value_type_ofp<test_has_value_type_ofp>::value; BOOST_REQUIRE_EQUAL(val, true); val = has_value_type_ofp<test_has_no_value_type>::value; BOOST_REQUIRE_EQUAL(val, false); - //! [Check has_value_type] + //! [Check has_value_type_ofp] }