From 640054a3b219bd7c612a9f8e21eed72cd344e3ac Mon Sep 17 00:00:00 2001 From: Pietro Incardona <incardon@mpi-cbg.de> Date: Thu, 1 Apr 2021 20:27:44 +0200 Subject: [PATCH] HIP moving on --- CMakeLists.txt | 15 +- configure | 4 + src/CMakeLists.txt | 50 ++++- src/Grid/cuda/cuda_grid_gpu_tests.cu | 28 +++ src/Grid/cuda/map_grid_cuda_ker.cuh | 2 +- src/Grid/grid_base_implementation.hpp | 2 +- src/Grid/grid_sm.hpp | 8 +- src/Grid/map_grid.hpp | 14 +- src/NN/CellList/CellList_gpu_test.cu | 3 + src/NN/CellList/CellList_test.hpp | 2 +- src/NN/CellList/cuda/CellList_gpu.hpp | 3 - src/Point_test.hpp | 2 +- src/Space/SpaceBox.hpp | 2 +- src/SparseGrid/SparseGrid_chunk_copy.hpp | 4 +- src/SparseGrid/SparseGrid_conv_opt.hpp | 2 +- src/SparseGridGpu/SparseGridGpu_kernels.cuh | 4 +- .../cuda/map_vector_cuda_funcs_tests.cu | 2 +- src/Vector/cuda/map_vector_cuda_ker.cuh | 18 +- .../cuda/map_vector_sparse_cuda_kernels.cuh | 7 +- src/Vector/map_vector.hpp | 5 +- src/Vector/map_vector_std.hpp | 16 +- src/config/config_cmake.h.in | 6 + src/timer.hpp | 34 ++- src/util/cuda/modern_gpu_tests.cu | 2 +- src/util/cuda/moderngpu/transform.hxx | 2 + src/util/cuda/ofp_context.hxx | 2 +- src/util/cuda/scan_ofp.cuh | 48 +++-- src/util/cuda/sort_ofp.cuh | 194 ++++++++++++------ .../multi_array_ref_subarray_openfpm.hpp | 2 +- 29 files changed, 362 insertions(+), 121 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 57e0ebf8..8acda874 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ if (POLICY CMP0074) cmake_policy(SET CMP0074 OLD) endif () -list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/) +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/ /opt/rocm/hip/cmake) set(BOOST_INCLUDE ${Boost_INCLUDE_DIR} CACHE PATH "Include directory for BOOST") set(LIBHILBERT_ROOT CACHE PATH "LibHilbert root path") @@ -15,6 +15,8 @@ set(ENABLE_GPU CACHE BOOL "Disable the GPU code independently that a cuda compil set(TEST_PERFORMANCE CACHE BOOL "Enable test performance") set(ALPAKA_ROOT CACHE PATH "Alpaka root path") set(CUDA_ON_CPU CACHE BOOL "Make Cuda work on heap") +set(HIP_ENABLE CACHE BOOL "Enable HIP compiler") +set(AMD_ARCH_COMPILE "gfx900" CACHE STRING "AMD gpu architecture used to compile kernels") if (ENABLE_GPU) set(CUDA_ON_CPU OFF) @@ -31,7 +33,7 @@ message("Searching Vc in ${Vc_DIR}") find_package(Boost 1.72.0 REQUIRED COMPONENTS unit_test_framework iostreams program_options system filesystem OPTIONAL_COMPONENTS fiber context) find_package(LibHilbert REQUIRED) find_package(Vc REQUIRED) - +find_package(HIP) ###### CONFIG.h FILE ###### @@ -47,6 +49,11 @@ if(TEST_PERFORMANCE) set(DEFINE_PERFORMANCE_TEST "#define PERFORMANCE_TEST") endif() +if(HIP_FOUND) + set(DEFINE_HIP_GPU "#define HIP_GPU") + set(DEFINE_CUDIFY_USE_HIP "#define CUDIFY_USE_HIP") +endif() + if (Boost_FOUND) set(DEFINE_HAVE_BOOST "#define HAVE_BOOST") set(DEFINE_HAVE_BOOST_IOSTREAMS "#define HAVE_BOOST_IOSTREAMS") @@ -73,6 +80,10 @@ if(CUDA_ON_CPU) set(DEFINE_CUDA_GPU "#define CUDA_GPU") endif() +if(HIP_FOUND) + set(DEFINE_CUDA_GPU "#define CUDA_GPU") +endif() + if(LIBHILBERT_FOUND) set(DEFINE_HAVE_LIBHILBERT "#define HAVE_LIBHILBERT 1") else() diff --git a/configure b/configure index 878cf86f..e00aff0e 100755 --- a/configure +++ b/configure @@ -122,6 +122,7 @@ with_petsc with_eigen with_vcdevel enable_gpu +enable_hip enable_asan ' @@ -258,6 +259,9 @@ do fi conf_options="$conf_options -DENABLE_GPU=ON" ;; + hip) + conf_options="$conf_options -DHIP_ENABLE=ON" + ;; asan) conf_options="$conf_options 
-DENABLE_ASAN=ON" ;; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fc99362d..eca74816 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -21,7 +21,7 @@ if (TEST_PERFORMANCE) Vector/performance/vector_performance_test.cu) endif () -if (CUDA_FOUND OR CUDA_ON_CPU) +if (CUDA_FOUND OR CUDA_ON_CPU OR HIP_FOUND) set(CUDA_SOURCES ${CUDA_SOURCES} Vector/map_vector_sparse_unit_tests.cu Vector/vector_gpu_unit_tests.cu @@ -45,7 +45,8 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") endif() if (CUDA_ON_CPU) - add_definitions(-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000) + + add_definitions(-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 ) set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX) set_source_files_properties(isolation.cu PROPERTIES LANGUAGE CXX) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Intel") @@ -53,18 +54,47 @@ if (CUDA_ON_CPU) endif() endif() -add_executable(mem_map ${CUDA_SOURCES} - main.cpp - data_type/aggregate_unit_tests.cpp - util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp - memory_ly/memory_conf_unit_tests.cpp - Space/tests/SpaceBox_unit_tests.cpp - Space/Shape/Sphere_unit_test.cpp +if ( HIP_ENABLE AND HIP_FOUND ) + + list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) + + if (CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND HIP_HIPCC_FLAGS -O0) + endif() + + add_definitions(-D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0) + set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX) + + set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) + + hip_add_executable(mem_map ${CUDA_SOURCES} + main.cpp + data_type/aggregate_unit_tests.cpp + util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp + memory_ly/memory_conf_unit_tests.cpp + Space/tests/SpaceBox_unit_tests.cpp + Space/Shape/Sphere_unit_test.cpp + SparseGrid/SparseGrid_unit_tests.cpp + SparseGrid/SparseGrid_chunk_copy_unit_tests.cpp + Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp + Grid/Geometry/tests/grid_smb_tests.cpp) + +else() + + add_executable(mem_map ${CUDA_SOURCES} + main.cpp + data_type/aggregate_unit_tests.cpp + util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp + memory_ly/memory_conf_unit_tests.cpp + Space/tests/SpaceBox_unit_tests.cpp + Space/Shape/Sphere_unit_test.cpp SparseGrid/SparseGrid_unit_tests.cpp SparseGrid/SparseGrid_chunk_copy_unit_tests.cpp - Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp + Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp Grid/Geometry/tests/grid_smb_tests.cpp) +endif() + set_property(TARGET mem_map PROPERTY CUDA_ARCHITECTURES 60 75) if (CUDA_FOUND) diff --git a/src/Grid/cuda/cuda_grid_gpu_tests.cu b/src/Grid/cuda/cuda_grid_gpu_tests.cu index d26103e0..bb0bb4af 100644 --- a/src/Grid/cuda/cuda_grid_gpu_tests.cu +++ b/src/Grid/cuda/cuda_grid_gpu_tests.cu @@ -32,6 +32,18 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func) auto gcf = c3.getGPUIterator(k1,k2); +#ifdef __HIP__ + + BOOST_REQUIRE_EQUAL(gcf.thr.x,8ul); + BOOST_REQUIRE_EQUAL(gcf.thr.y,8ul); + BOOST_REQUIRE_EQUAL(gcf.thr.z,4ul); + + BOOST_REQUIRE_EQUAL(gcf.wthr.x,8ul); + BOOST_REQUIRE_EQUAL(gcf.wthr.y,8ul); + BOOST_REQUIRE_EQUAL(gcf.wthr.z,16ul); + +#else + BOOST_REQUIRE_EQUAL(gcf.thr.x,16ul); BOOST_REQUIRE_EQUAL(gcf.thr.y,8ul); BOOST_REQUIRE_EQUAL(gcf.thr.z,8ul); @@ -40,12 +52,26 @@ BOOST_AUTO_TEST_CASE 
(gpu_computation_func) BOOST_REQUIRE_EQUAL(gcf.wthr.y,8ul); BOOST_REQUIRE_EQUAL(gcf.wthr.z,8ul); +#endif + grid_key_dx<3> k3({50,50,50}); grid_key_dx<3> k4({62,62,62}); grid_key_dx<3> k5({60,61,62}); auto gcf2 = c3.getGPUIterator(k3,k4); +#ifdef __HIP__ + + BOOST_REQUIRE_EQUAL(gcf2.thr.x,8ul); + BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul); + BOOST_REQUIRE_EQUAL(gcf2.thr.z,4ul); + + BOOST_REQUIRE_EQUAL(gcf2.wthr.x,2ul); + BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul); + BOOST_REQUIRE_EQUAL(gcf2.wthr.z,4ul); + +#else + BOOST_REQUIRE_EQUAL(gcf2.thr.x,13ul); BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul); BOOST_REQUIRE_EQUAL(gcf2.thr.z,8ul); @@ -54,6 +80,8 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func) BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul); BOOST_REQUIRE_EQUAL(gcf2.wthr.z,2ul); +#endif + gcf2 = c3.getGPUIterator(k3,k4,511); BOOST_REQUIRE_EQUAL(gcf2.thr.x,8ul); diff --git a/src/Grid/cuda/map_grid_cuda_ker.cuh b/src/Grid/cuda/map_grid_cuda_ker.cuh index 6a2f9fbf..d27aeab0 100644 --- a/src/Grid/cuda/map_grid_cuda_ker.cuh +++ b/src/Grid/cuda/map_grid_cuda_ker.cuh @@ -421,7 +421,7 @@ public: * \param stop end point * */ - struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = 1024) const + struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = default_kernel_wg_threads_) const { return getGPUIterator_impl<dim>(g1,key1,key2,n_thr); } diff --git a/src/Grid/grid_base_implementation.hpp b/src/Grid/grid_base_implementation.hpp index 9ff639d6..cb615f90 100644 --- a/src/Grid/grid_base_implementation.hpp +++ b/src/Grid/grid_base_implementation.hpp @@ -730,7 +730,7 @@ public: * \param stop end point * */ - struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim,long int> & key1, grid_key_dx<dim,long int> & key2, size_t n_thr = 1024) const + struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim,long int> & key1, grid_key_dx<dim,long int> & key2, size_t n_thr = default_kernel_wg_threads_) const { return getGPUIterator_impl<dim>(g1,key1,key2,n_thr); } diff --git a/src/Grid/grid_sm.hpp b/src/Grid/grid_sm.hpp index ed1e1d9e..f49a056e 100755 --- a/src/Grid/grid_sm.hpp +++ b/src/Grid/grid_sm.hpp @@ -125,7 +125,7 @@ struct ite_gpu template<unsigned int N, typename T> class grid_sm; template<unsigned int dim, typename T2, typename T> -ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, size_t n_thr = 1024); +ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, size_t n_thr = default_kernel_wg_threads_); //! Declaration print_warning_on_adjustment template <unsigned int dim, typename linearizer> class print_warning_on_adjustment; @@ -733,7 +733,7 @@ public: return grid_key_dx_iterator_sub<N>(*this,start,stop); } -#ifdef CUDA_GPU +#if defined(CUDA_GPU) /*! 
\brief Get an iterator for the GPU * @@ -742,7 +742,7 @@ public: * */ template<typename T2> - struct ite_gpu<N> getGPUIterator(const grid_key_dx<N,T2> & key1, const grid_key_dx<N,T2> & key2, size_t n_thr = 1024) const + struct ite_gpu<N> getGPUIterator(const grid_key_dx<N,T2> & key1, const grid_key_dx<N,T2> & key2, size_t n_thr = default_kernel_wg_threads_) const { return getGPUIterator_impl<N>(*this,key1,key2,n_thr); } @@ -753,7 +753,7 @@ public: * \param stop end point * */ - struct ite_gpu<N> getGPUIterator(size_t n_thr = 1024) const + struct ite_gpu<N> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<N> k1; grid_key_dx<N> k2; diff --git a/src/Grid/map_grid.hpp b/src/Grid/map_grid.hpp index e6965bae..299bb85e 100755 --- a/src/Grid/map_grid.hpp +++ b/src/Grid/map_grid.hpp @@ -190,7 +190,19 @@ public: * \param g grid to copy * */ - grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g) + __device__ grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g) + { + printf("Error grid_base operator= is not defined in device code\n"); + + return *this; + } + + /*! \brief It copy a grid + * + * \param g grid to copy + * + */ + __host__ grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g) { (static_cast<grid_base_impl<dim,T,S, memory_traits_lin> *>(this))->swap(g.duplicate()); diff --git a/src/NN/CellList/CellList_gpu_test.cu b/src/NN/CellList/CellList_gpu_test.cu index 364d2e0b..9f323818 100644 --- a/src/NN/CellList/CellList_gpu_test.cu +++ b/src/NN/CellList/CellList_gpu_test.cu @@ -1363,6 +1363,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props); cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m); + auto & s_t_ns = cl2.getSortToNonSort(); pl.template hostToDevice<0>(); @@ -1371,6 +1372,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di // Domain particles + auto & gdsi = cl2.getDomainSortIds(); gdsi.template deviceToHost<0>(); s_t_ns.template deviceToHost<0>(); @@ -1497,6 +1499,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di } BOOST_REQUIRE_EQUAL(check,true); + } } diff --git a/src/NN/CellList/CellList_test.hpp b/src/NN/CellList/CellList_test.hpp index 1336823e..e93bb76f 100644 --- a/src/NN/CellList/CellList_test.hpp +++ b/src/NN/CellList/CellList_test.hpp @@ -395,7 +395,7 @@ template<unsigned int dim, typename T, typename CellS> void Test_NN_iterator_rad for (size_t i = 0 ; i < dim ; i++) { - vrp.template get<0>(j)[i] = ((float)rand() / RAND_MAX)*(box.getHigh(i) - box.getLow(i)) + box.getLow(i); + vrp.template get<0>(j)[i] = ((float)rand() / (float)RAND_MAX)*(box.getHigh(i) - box.getLow(i)) + box.getLow(i); } } diff --git a/src/NN/CellList/cuda/CellList_gpu.hpp b/src/NN/CellList/cuda/CellList_gpu.hpp index 5537ec0f..4913bd4a 100644 --- a/src/NN/CellList/cuda/CellList_gpu.hpp +++ b/src/NN/CellList/cuda/CellList_gpu.hpp @@ -356,15 +356,12 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> { #ifdef __NVCC__ - CUDA_SAFE() - // Than we construct the ids auto ite_gpu = pl.getGPUIteratorTo(stop-start-1); cl_n.resize(this->gr_cell.size()+1); cl_n.template fill<0>(0); -// CUDA_SAFE(cudaMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(cnt_type))); part_ids.resize(stop - start); diff --git a/src/Point_test.hpp b/src/Point_test.hpp index c41e0429..ede644fa 100755 --- a/src/Point_test.hpp +++ b/src/Point_test.hpp @@ -335,7 +335,7 @@ public: * \return this * 
*/ - inline Point_test<T> operator= (const Point_test<T> & p) + __device__ __host__ inline Point_test<T> & operator= (const Point_test<T> & p) { boost::fusion::at_c<0>(data) = boost::fusion::at_c<0>(p.data); boost::fusion::at_c<1>(data) = boost::fusion::at_c<1>(p.data); diff --git a/src/Space/SpaceBox.hpp b/src/Space/SpaceBox.hpp index 0b4fdb88..b595b3e2 100644 --- a/src/Space/SpaceBox.hpp +++ b/src/Space/SpaceBox.hpp @@ -211,7 +211,7 @@ class SpaceBox : public Box<dim,T> Point<dim,T> p; for (size_t i = 0 ; i < dim ; i++) - p.get(i) = ((T)rand())/RAND_MAX * (this->getHigh(i) - this->getLow(i)) + this->getLow(i); + p.get(i) = ((T)rand())/(T)RAND_MAX * (this->getHigh(i) - this->getLow(i)) + this->getLow(i); return p; } diff --git a/src/SparseGrid/SparseGrid_chunk_copy.hpp b/src/SparseGrid/SparseGrid_chunk_copy.hpp index 41b8ff72..8fe4d603 100644 --- a/src/SparseGrid/SparseGrid_chunk_copy.hpp +++ b/src/SparseGrid/SparseGrid_chunk_copy.hpp @@ -8,9 +8,11 @@ #ifndef SPARSEGRID_CHUNK_COPY_HPP_ #define SPARSEGRID_CHUNK_COPY_HPP_ -#if !defined(__NVCC__) || defined(CUDA_ON_CPU) +#if !defined(__NVCC__) || defined(CUDA_ON_CPU) || defined(__HIP__) +// Nvcc does not like VC ... for some reason #include <Vc/Vc> #endif + #include "util/mathutil.hpp" diff --git a/src/SparseGrid/SparseGrid_conv_opt.hpp b/src/SparseGrid/SparseGrid_conv_opt.hpp index 7d50d97f..334efffd 100644 --- a/src/SparseGrid/SparseGrid_conv_opt.hpp +++ b/src/SparseGrid/SparseGrid_conv_opt.hpp @@ -110,7 +110,7 @@ struct conv_impl } }; -#if !defined(__NVCC__) || defined(CUDA_ON_CPU) +#if !defined(__NVCC__) || defined(CUDA_ON_CPU) || defined(__HIP__) template<unsigned int dir,int p, unsigned int prop_src1,typename chunk_type, typename vect_type, typename ids_type> diff --git a/src/SparseGridGpu/SparseGridGpu_kernels.cuh b/src/SparseGridGpu/SparseGridGpu_kernels.cuh index 33edebbf..bd45286d 100644 --- a/src/SparseGridGpu/SparseGridGpu_kernels.cuh +++ b/src/SparseGridGpu/SparseGridGpu_kernels.cuh @@ -1076,7 +1076,7 @@ namespace SparseGridGpuKernels #ifdef SE_CLASS1 if (numCnt > blockDim.x) - {printf("Error calc_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,blockDim.x);} + {printf("Error calc_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,(int)blockDim.x);} #endif @@ -1164,7 +1164,7 @@ namespace SparseGridGpuKernels #ifdef SE_CLASS1 if (numCnt > blockDim.x) - {printf("Error get_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,blockDim.x);} + {printf("Error get_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,(int)blockDim.x);} #endif diff --git a/src/Vector/cuda/map_vector_cuda_funcs_tests.cu b/src/Vector/cuda/map_vector_cuda_funcs_tests.cu index 907d29c6..01aec66f 100644 --- a/src/Vector/cuda/map_vector_cuda_funcs_tests.cu +++ b/src/Vector/cuda/map_vector_cuda_funcs_tests.cu @@ -8,7 +8,7 @@ #define BOOST_GPU_ENABLED __host__ __device__ -#include "util/cudify/cudify.hpp" +#include "util/cuda_launch.hpp" #include "config.h" #define BOOST_TEST_DYN_LINK diff --git a/src/Vector/cuda/map_vector_cuda_ker.cuh b/src/Vector/cuda/map_vector_cuda_ker.cuh index d6ebfd8d..d5a0df7c 100644 --- a/src/Vector/cuda/map_vector_cuda_ker.cuh +++ b/src/Vector/cuda/map_vector_cuda_ker.cuh @@ -453,7 +453,7 @@ namespace openfpm * * */ - __host__ ite_gpu<1> getGPUIterator(size_t n_thr = 1024) const + __host__ ite_gpu<1> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<1> start(0); grid_key_dx<1> stop(size()-1); @@ 
-465,7 +465,7 @@ namespace openfpm * * */ - ite_gpu<1> getGPUIteratorTo(size_t stop, size_t n_thr = 1024) const + ite_gpu<1> getGPUIteratorTo(size_t stop, size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<1> start(0); grid_key_dx<1> stop_(stop); @@ -478,7 +478,19 @@ namespace openfpm * \param object to copy * */ - vector_gpu_ker<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v) + __device__ vector_gpu_ker<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v) + { + printf("Error vector_gpu_ker the operator= is not defined in device code\n"); + + return *this; + } + + /*! \brief operator= this operator absorb the pointers, consider that this object wrap device pointers + * + * \param object to copy + * + */ + __host__ vector_gpu_ker<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v) { v_size = v.v_size; base = v.base; diff --git a/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh b/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh index 28d3963c..290e2632 100644 --- a/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh +++ b/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh @@ -10,14 +10,15 @@ #ifdef __NVCC__ +#include "config.h" + #if CUDART_VERSION < 11000 #include "util/cuda/cub_old/util_type.cuh" #include "util/cuda/cub_old/block/block_scan.cuh" #include "util/cuda/moderngpu/operators.hxx" +#include "util/cuda/cuda_launch.hpp" #else - #if !defined(CUDA_ON_CPU) - #include "cub/util_type.cuh" - #include "cub/block/block_scan.cuh" + #if !defined(CUDA_ON_CPU) #include "util/cuda/moderngpu/operators.hxx" #endif #endif diff --git a/src/Vector/map_vector.hpp b/src/Vector/map_vector.hpp index 1118c76e..e0243c53 100644 --- a/src/Vector/map_vector.hpp +++ b/src/Vector/map_vector.hpp @@ -1448,7 +1448,6 @@ namespace openfpm #endif } - return *this; } @@ -1680,7 +1679,7 @@ namespace openfpm * * */ - ite_gpu<1> getGPUIteratorTo(long int stop, size_t n_thr = 1024) const + ite_gpu<1> getGPUIteratorTo(long int stop, size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<1,long int> start(0); grid_key_dx<1,long int> stop_(stop); @@ -1708,7 +1707,7 @@ namespace openfpm * * */ - ite_gpu<1> getGPUIterator(size_t n_thr = 1024) const + ite_gpu<1> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<1> start(0); grid_key_dx<1> stop(size()-1); diff --git a/src/Vector/map_vector_std.hpp b/src/Vector/map_vector_std.hpp index 2d4a875f..98431720 100644 --- a/src/Vector/map_vector_std.hpp +++ b/src/Vector/map_vector_std.hpp @@ -676,7 +676,21 @@ public: * \return itself * */ - vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & operator=(const vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & v) + __device__ vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & operator=(const vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & v) + { + printf("Error openfpm::vector operator= cannot be used on device"); + + return *this; + } + + /*! 
\brief Operator= copy the vector into another + * + * \param v vector to copy + * + * \return itself + * + */ + __host__ vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & operator=(const vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & v) { base = v.base; diff --git a/src/config/config_cmake.h.in b/src/config/config_cmake.h.in index 01c19697..2c956fc2 100644 --- a/src/config/config_cmake.h.in +++ b/src/config/config_cmake.h.in @@ -4,6 +4,12 @@ ${DEFINE_COVERTY_SCAN} /* GPU support */ ${DEFINE_CUDA_GPU} +/* HIP GPU support */ +${DEFINE_HIP_GPU} + +/* HIP Cudify GPU support */ +${DEFINE_CUDIFY_USE_HIP} + /* Debug */ ${DEFINE_DEBUG} /**/ diff --git a/src/timer.hpp b/src/timer.hpp index 787223a8..f87a0e1c 100644 --- a/src/timer.hpp +++ b/src/timer.hpp @@ -42,14 +42,22 @@ class timer clock_t cstop; #if defined(__NVCC__) && !defined(CUDA_ON_CPU) - cudaEvent_t start_g, stop_g; + #ifdef __HIP__ + hipEvent_t start_g, stop_g; + #else + cudaEvent_t start_g, stop_g; + #endif #endif // Fill the stop point void check() { -#if defined(SYNC_BEFORE_TAKE_TIME) && defined(__NVCC__) +#if defined(SYNC_BEFORE_TAKE_TIME) && defined(__NVCC__) + #ifdef __HIP__ + hipDeviceSynchronize(); + #else cudaDeviceSynchronize(); + #endif #endif #ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time @@ -153,21 +161,37 @@ public: void startGPU() { + #ifdef __HIP__ + hipEventCreate(&start_g); + hipEventRecord(start_g,0); + #else cudaEventCreate(&start_g); cudaEventRecord(start_g,0); + #endif } void stopGPU() { - cudaEventCreate(&stop_g); - cudaEventRecord(stop_g,0); - cudaEventSynchronize(stop_g); + #ifdef __HIP__ + hipEventCreate(&stop_g); + hipEventRecord(stop_g,0); + hipEventSynchronize(stop_g); + #else + cudaEventCreate(&stop_g); + cudaEventRecord(stop_g,0); + cudaEventSynchronize(stop_g); + #endif } double getwctGPU() { float elapsedTime; + + #ifdef __HIP__ + hipEventElapsedTime(&elapsedTime, start_g,stop_g); + #else cudaEventElapsedTime(&elapsedTime, start_g,stop_g); + #endif return elapsedTime; } diff --git a/src/util/cuda/modern_gpu_tests.cu b/src/util/cuda/modern_gpu_tests.cu index 167a7103..ddd8d8b4 100644 --- a/src/util/cuda/modern_gpu_tests.cu +++ b/src/util/cuda/modern_gpu_tests.cu @@ -15,7 +15,7 @@ BOOST_AUTO_TEST_SUITE( modern_gpu_tests ) -BOOST_AUTO_TEST_CASE( modern_gpu_transform_lbs ) +BOOST_AUTO_TEST_CASE( modern_gpu_loadbalance_lbs ) { std::cout << "Test modern gpu test tansform_lbs" << "\n"; diff --git a/src/util/cuda/moderngpu/transform.hxx b/src/util/cuda/moderngpu/transform.hxx index 617394d0..10905dcd 100644 --- a/src/util/cuda/moderngpu/transform.hxx +++ b/src/util/cuda/moderngpu/transform.hxx @@ -3,7 +3,9 @@ #include <random> #include <algorithm> +#ifndef __HIP__ #include <cuda.h> +#endif #include "launch_box.hxx" BEGIN_MGPU_NAMESPACE diff --git a/src/util/cuda/ofp_context.hxx b/src/util/cuda/ofp_context.hxx index 47d5327e..cd1607e8 100644 --- a/src/util/cuda/ofp_context.hxx +++ b/src/util/cuda/ofp_context.hxx @@ -151,7 +151,7 @@ namespace mgpu void init(int dev_num, gpu_context_opt opt) { cudaFuncAttributes attr; - cudaError_t result = cudaFuncGetAttributes(&attr, dummy_k<0>); + cudaError_t result = cudaFuncGetAttributes(&attr, (void *)dummy_k<0>); if(cudaSuccess != result) throw cuda_exception_t(result); _ptx_version = attr.ptxVersion; diff --git a/src/util/cuda/scan_ofp.cuh b/src/util/cuda/scan_ofp.cuh index 7e5c7eb6..96cdc884 100644 --- a/src/util/cuda/scan_ofp.cuh +++ b/src/util/cuda/scan_ofp.cuh @@ -15,7 +15,11 @@ #if CUDART_VERSION >= 
11000 #ifndef CUDA_ON_CPU // Here we have for sure CUDA >= 11 - #include "cub/cub.cuh" + #ifdef __HIP__ + #include "hipcub/hipcub.hpp" + #else + #include "cub/cub.cuh" + #endif #ifndef SCAN_WITH_CUB #define SCAN_WITH_CUB #endif @@ -48,19 +52,39 @@ namespace openfpm #else #ifdef SCAN_WITH_CUB - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input, - output, - count); + #ifdef __HIP__ + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input, + output, + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + // Run + hipcub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, + output, + count); + + #else + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input, + output, + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); + // Run + cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, + output, + count); - // Run - cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, - output, - count); + #endif #else mgpu::scan(input,count,output,context); diff --git a/src/util/cuda/sort_ofp.cuh b/src/util/cuda/sort_ofp.cuh index 32002952..ae2fba9f 100644 --- a/src/util/cuda/sort_ofp.cuh +++ b/src/util/cuda/sort_ofp.cuh @@ -16,7 +16,11 @@ #if CUDART_VERSION >= 11000 #ifndef CUDA_ON_CPU // Here we have for sure CUDA >= 11 - #include "cub/cub.cuh" + #ifdef __HIP__ + #include "hipcub/hipcub.hpp" + #else + #include "cub/cub.cuh" + #endif #ifndef SORT_WITH_CUB #define SORT_WITH_CUB #endif @@ -248,66 +252,134 @@ namespace openfpm #ifdef SORT_WITH_CUB - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - - auto & temporal2 = context.getTemporalCUB2(); - temporal2.resize(sizeof(key_t)*count); - - auto & temporal3 = context.getTemporalCUB3(); - temporal3.resize(sizeof(val_t)*count); - - if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true) - { - cub::DeviceRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); - - d_temp_storage = temporal.template getDeviceBuffer<0>(); - - // Run - cub::DeviceRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - } - else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true) - { - cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); - - d_temp_storage = temporal.template getDeviceBuffer<0>(); - - // Run - cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t 
*)temporal3.template getDeviceBuffer<0>(), - count); - } - - cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); - cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); + #ifdef __HIP__ + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + auto & temporal2 = context.getTemporalCUB2(); + temporal2.resize(sizeof(key_t)*count); + + auto & temporal3 = context.getTemporalCUB3(); + temporal3.resize(sizeof(val_t)*count); + + if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true) + { + hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true) + { + hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + + cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); + cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); + + + #else + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + auto & temporal2 = context.getTemporalCUB2(); + temporal2.resize(sizeof(key_t)*count); + + auto & temporal3 = context.getTemporalCUB3(); + temporal3.resize(sizeof(val_t)*count); + + if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true) + { + cub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + cub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true) + { + cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + 
cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + + cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); + cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); + + #endif #else mgpu::mergesort(keys_input,vals_input,count,comp,context); diff --git a/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp b/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp index 744efeac..442aa1f3 100644 --- a/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp +++ b/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp @@ -119,7 +119,7 @@ public: inline __host__ __device__ size_type size() const { return boost::mpl::at<vector,boost::mpl::int_<0>>::type::value; } size_type max_size() const { return num_elements(); } bool empty() const { return size() == 0; } - size_type num_dimensions() const { return NumDims; } + inline __device__ __host__ size_type num_dimensions() const { return NumDims; } inline __host__ __device__ const index* strides() const { return strides_; } size_type num_elements() const -- GitLab
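
Usage sketch (not part of the patch): the HIP path added above is driven by two cache options declared in the top-level CMakeLists.txt, HIP_ENABLE (off by default) and AMD_ARCH_COMPILE (the AMD GPU architecture used to compile kernels, defaulting to gfx900). A minimal out-of-tree build, assuming a ROCm installation under /opt/rocm so that find_package(HIP) succeeds; the build directory name and the gfx906 value are illustrative only:

    mkdir build && cd build
    # HIP_ENABLE selects the hip_add_executable branch in src/CMakeLists.txt;
    # AMD_ARCH_COMPILE is the architecture string the kernels are built for.
    cmake .. -DHIP_ENABLE=ON -DAMD_ARCH_COMPILE=gfx906
    make mem_map

The new "hip" option in the configure script forwards the same -DHIP_ENABLE=ON flag, so configuring through ./configure reaches the identical code path.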