diff --git a/CMakeLists.txt b/CMakeLists.txt index 57e0ebf829f0a191f09b04ebc5404117306573f3..8acda8746fa57ce5cf40ff3e2b12d904764e3a61 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ if (POLICY CMP0074) cmake_policy(SET CMP0074 OLD) endif () -list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/) +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/ /opt/rocm/hip/cmake) set(BOOST_INCLUDE ${Boost_INCLUDE_DIR} CACHE PATH "Include directory for BOOST") set(LIBHILBERT_ROOT CACHE PATH "LibHilbert root path") @@ -15,6 +15,8 @@ set(ENABLE_GPU CACHE BOOL "Disable the GPU code independently that a cuda compil set(TEST_PERFORMANCE CACHE BOOL "Enable test performance") set(ALPAKA_ROOT CACHE PATH "Alpaka root path") set(CUDA_ON_CPU CACHE BOOL "Make Cuda work on heap") +set(HIP_ENABLE CACHE BOOL "Enable HIP compiler") +set(AMD_ARCH_COMPILE "gfx900" CACHE STRING "AMD gpu architecture used to compile kernels") if (ENABLE_GPU) set(CUDA_ON_CPU OFF) @@ -31,7 +33,7 @@ message("Searching Vc in ${Vc_DIR}") find_package(Boost 1.72.0 REQUIRED COMPONENTS unit_test_framework iostreams program_options system filesystem OPTIONAL_COMPONENTS fiber context) find_package(LibHilbert REQUIRED) find_package(Vc REQUIRED) - +find_package(HIP) ###### CONFIG.h FILE ###### @@ -47,6 +49,11 @@ if(TEST_PERFORMANCE) set(DEFINE_PERFORMANCE_TEST "#define PERFORMANCE_TEST") endif() +if(HIP_FOUND) + set(DEFINE_HIP_GPU "#define HIP_GPU") + set(DEFINE_CUDIFY_USE_HIP "#define CUDIFY_USE_HIP") +endif() + if (Boost_FOUND) set(DEFINE_HAVE_BOOST "#define HAVE_BOOST") set(DEFINE_HAVE_BOOST_IOSTREAMS "#define HAVE_BOOST_IOSTREAMS") @@ -73,6 +80,10 @@ if(CUDA_ON_CPU) set(DEFINE_CUDA_GPU "#define CUDA_GPU") endif() +if(HIP_FOUND) + set(DEFINE_CUDA_GPU "#define CUDA_GPU") +endif() + if(LIBHILBERT_FOUND) set(DEFINE_HAVE_LIBHILBERT "#define HAVE_LIBHILBERT 1") else() diff --git a/configure b/configure index 878cf86f1a54d6824e530ffe53afc8a1882ac1c2..e00aff0eb74fe175878cbcbb17bce28a3f63a023 100755 --- a/configure +++ b/configure @@ -122,6 +122,7 @@ with_petsc with_eigen with_vcdevel enable_gpu +enable_hip enable_asan ' @@ -258,6 +259,9 @@ do fi conf_options="$conf_options -DENABLE_GPU=ON" ;; + hip) + conf_options="$conf_options -DHIP_ENABLE=ON" + ;; asan) conf_options="$conf_options -DENABLE_ASAN=ON" ;; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fc99362d3db209abfbcda0acb87a9e21d2d7838b..eca7481673ed1d295ab034a5db544c2fbb433334 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -21,7 +21,7 @@ if (TEST_PERFORMANCE) Vector/performance/vector_performance_test.cu) endif () -if (CUDA_FOUND OR CUDA_ON_CPU) +if (CUDA_FOUND OR CUDA_ON_CPU OR HIP_FOUND) set(CUDA_SOURCES ${CUDA_SOURCES} Vector/map_vector_sparse_unit_tests.cu Vector/vector_gpu_unit_tests.cu @@ -45,7 +45,8 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") endif() if (CUDA_ON_CPU) - add_definitions(-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000) + + add_definitions(-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 ) set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX) set_source_files_properties(isolation.cu PROPERTIES LANGUAGE CXX) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Intel") @@ -53,18 +54,47 @@ if (CUDA_ON_CPU) endif() endif() -add_executable(mem_map ${CUDA_SOURCES} - main.cpp - data_type/aggregate_unit_tests.cpp - util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp - 
memory_ly/memory_conf_unit_tests.cpp - Space/tests/SpaceBox_unit_tests.cpp - Space/Shape/Sphere_unit_test.cpp +if ( HIP_ENABLE AND HIP_FOUND ) + + list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) + + if (CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND HIP_HIPCC_FLAGS -O0) + endif() + + add_definitions(-D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0) + set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX) + + set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) + + hip_add_executable(mem_map ${CUDA_SOURCES} + main.cpp + data_type/aggregate_unit_tests.cpp + util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp + memory_ly/memory_conf_unit_tests.cpp + Space/tests/SpaceBox_unit_tests.cpp + Space/Shape/Sphere_unit_test.cpp + SparseGrid/SparseGrid_unit_tests.cpp + SparseGrid/SparseGrid_chunk_copy_unit_tests.cpp + Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp + Grid/Geometry/tests/grid_smb_tests.cpp) + +else() + + add_executable(mem_map ${CUDA_SOURCES} + main.cpp + data_type/aggregate_unit_tests.cpp + util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp + memory_ly/memory_conf_unit_tests.cpp + Space/tests/SpaceBox_unit_tests.cpp + Space/Shape/Sphere_unit_test.cpp SparseGrid/SparseGrid_unit_tests.cpp SparseGrid/SparseGrid_chunk_copy_unit_tests.cpp - Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp + Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp Grid/Geometry/tests/grid_smb_tests.cpp) +endif() + set_property(TARGET mem_map PROPERTY CUDA_ARCHITECTURES 60 75) if (CUDA_FOUND) diff --git a/src/Grid/cuda/cuda_grid_gpu_tests.cu b/src/Grid/cuda/cuda_grid_gpu_tests.cu index d26103e01083762fc1eb377aac9c886df228bda4..bb0bb4af8e2a0cad0e04c1287d44b38785f9df65 100644 --- a/src/Grid/cuda/cuda_grid_gpu_tests.cu +++ b/src/Grid/cuda/cuda_grid_gpu_tests.cu @@ -32,6 +32,18 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func) auto gcf = c3.getGPUIterator(k1,k2); +#ifdef __HIP__ + + BOOST_REQUIRE_EQUAL(gcf.thr.x,8ul); + BOOST_REQUIRE_EQUAL(gcf.thr.y,8ul); + BOOST_REQUIRE_EQUAL(gcf.thr.z,4ul); + + BOOST_REQUIRE_EQUAL(gcf.wthr.x,8ul); + BOOST_REQUIRE_EQUAL(gcf.wthr.y,8ul); + BOOST_REQUIRE_EQUAL(gcf.wthr.z,16ul); + +#else + BOOST_REQUIRE_EQUAL(gcf.thr.x,16ul); BOOST_REQUIRE_EQUAL(gcf.thr.y,8ul); BOOST_REQUIRE_EQUAL(gcf.thr.z,8ul); @@ -40,12 +52,26 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func) BOOST_REQUIRE_EQUAL(gcf.wthr.y,8ul); BOOST_REQUIRE_EQUAL(gcf.wthr.z,8ul); +#endif + grid_key_dx<3> k3({50,50,50}); grid_key_dx<3> k4({62,62,62}); grid_key_dx<3> k5({60,61,62}); auto gcf2 = c3.getGPUIterator(k3,k4); +#ifdef __HIP__ + + BOOST_REQUIRE_EQUAL(gcf2.thr.x,8ul); + BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul); + BOOST_REQUIRE_EQUAL(gcf2.thr.z,4ul); + + BOOST_REQUIRE_EQUAL(gcf2.wthr.x,2ul); + BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul); + BOOST_REQUIRE_EQUAL(gcf2.wthr.z,4ul); + +#else + BOOST_REQUIRE_EQUAL(gcf2.thr.x,13ul); BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul); BOOST_REQUIRE_EQUAL(gcf2.thr.z,8ul); @@ -54,6 +80,8 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func) BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul); BOOST_REQUIRE_EQUAL(gcf2.wthr.z,2ul); +#endif + gcf2 = c3.getGPUIterator(k3,k4,511); BOOST_REQUIRE_EQUAL(gcf2.thr.x,8ul); diff --git a/src/Grid/cuda/map_grid_cuda_ker.cuh b/src/Grid/cuda/map_grid_cuda_ker.cuh index 6a2f9fbf257598fb8423901dd0bb6aac72b4d86f..d27aeab0a3234af06405a23c4195a544eb4b0db7 100644 --- a/src/Grid/cuda/map_grid_cuda_ker.cuh +++ b/src/Grid/cuda/map_grid_cuda_ker.cuh 
@@ -421,7 +421,7 @@ public: * \param stop end point * */ - struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = 1024) const + struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = default_kernel_wg_threads_) const { return getGPUIterator_impl<dim>(g1,key1,key2,n_thr); } diff --git a/src/Grid/grid_base_implementation.hpp b/src/Grid/grid_base_implementation.hpp index 9ff639d69398fdc6012962817e54e42937294ee7..cb615f90cf8e9942432875fd72fb62a666061743 100644 --- a/src/Grid/grid_base_implementation.hpp +++ b/src/Grid/grid_base_implementation.hpp @@ -730,7 +730,7 @@ public: * \param stop end point * */ - struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim,long int> & key1, grid_key_dx<dim,long int> & key2, size_t n_thr = 1024) const + struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim,long int> & key1, grid_key_dx<dim,long int> & key2, size_t n_thr = default_kernel_wg_threads_) const { return getGPUIterator_impl<dim>(g1,key1,key2,n_thr); } diff --git a/src/Grid/grid_sm.hpp b/src/Grid/grid_sm.hpp index ed1e1d9ea776d2e2d4b3c95cf1fb606d0c043bc5..f49a056ec57c11260c7c7b3e8c76f8ed7e9e72e4 100755 --- a/src/Grid/grid_sm.hpp +++ b/src/Grid/grid_sm.hpp @@ -125,7 +125,7 @@ struct ite_gpu template<unsigned int N, typename T> class grid_sm; template<unsigned int dim, typename T2, typename T> -ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, size_t n_thr = 1024); +ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, size_t n_thr = default_kernel_wg_threads_); //! Declaration print_warning_on_adjustment template <unsigned int dim, typename linearizer> class print_warning_on_adjustment; @@ -733,7 +733,7 @@ public: return grid_key_dx_iterator_sub<N>(*this,start,stop); } -#ifdef CUDA_GPU +#if defined(CUDA_GPU) /*! \brief Get an iterator for the GPU * @@ -742,7 +742,7 @@ public: * */ template<typename T2> - struct ite_gpu<N> getGPUIterator(const grid_key_dx<N,T2> & key1, const grid_key_dx<N,T2> & key2, size_t n_thr = 1024) const + struct ite_gpu<N> getGPUIterator(const grid_key_dx<N,T2> & key1, const grid_key_dx<N,T2> & key2, size_t n_thr = default_kernel_wg_threads_) const { return getGPUIterator_impl<N>(*this,key1,key2,n_thr); } @@ -753,7 +753,7 @@ public: * \param stop end point * */ - struct ite_gpu<N> getGPUIterator(size_t n_thr = 1024) const + struct ite_gpu<N> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<N> k1; grid_key_dx<N> k2; diff --git a/src/Grid/map_grid.hpp b/src/Grid/map_grid.hpp index e6965bae03acbbba006cfbde63c92b5e731a2289..299bb85efe413d47add92b910755822c7e33a545 100755 --- a/src/Grid/map_grid.hpp +++ b/src/Grid/map_grid.hpp @@ -190,7 +190,19 @@ public: * \param g grid to copy * */ - grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g) + __device__ grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g) + { + printf("Error grid_base operator= is not defined in device code\n"); + + return *this; + } + + /*! 
\brief It copy a grid + * + * \param g grid to copy + * + */ + __host__ grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g) { (static_cast<grid_base_impl<dim,T,S, memory_traits_lin> *>(this))->swap(g.duplicate()); diff --git a/src/NN/CellList/CellList_gpu_test.cu b/src/NN/CellList/CellList_gpu_test.cu index 364d2e0b955b764352a8c1785d6c064a58c13f66..9f323818c7a3cb48c06ab9e9c72e5b2ee40a21cc 100644 --- a/src/NN/CellList/CellList_gpu_test.cu +++ b/src/NN/CellList/CellList_gpu_test.cu @@ -1363,6 +1363,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props); cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m); + auto & s_t_ns = cl2.getSortToNonSort(); pl.template hostToDevice<0>(); @@ -1371,6 +1372,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di // Domain particles + auto & gdsi = cl2.getDomainSortIds(); gdsi.template deviceToHost<0>(); s_t_ns.template deviceToHost<0>(); @@ -1497,6 +1499,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di } BOOST_REQUIRE_EQUAL(check,true); + } } diff --git a/src/NN/CellList/CellList_test.hpp b/src/NN/CellList/CellList_test.hpp index 1336823e542b51224b64ac7662070711aa3c754f..e93bb76f1cc342c05be9be9c5ceec312f320af0e 100644 --- a/src/NN/CellList/CellList_test.hpp +++ b/src/NN/CellList/CellList_test.hpp @@ -395,7 +395,7 @@ template<unsigned int dim, typename T, typename CellS> void Test_NN_iterator_rad for (size_t i = 0 ; i < dim ; i++) { - vrp.template get<0>(j)[i] = ((float)rand() / RAND_MAX)*(box.getHigh(i) - box.getLow(i)) + box.getLow(i); + vrp.template get<0>(j)[i] = ((float)rand() / (float)RAND_MAX)*(box.getHigh(i) - box.getLow(i)) + box.getLow(i); } } diff --git a/src/NN/CellList/cuda/CellList_gpu.hpp b/src/NN/CellList/cuda/CellList_gpu.hpp index 5537ec0fa9c4b02c7948200e9c082392d6ad05bb..4913bd4a4d9fdd2e2441ca876227f76e0f9046d9 100644 --- a/src/NN/CellList/cuda/CellList_gpu.hpp +++ b/src/NN/CellList/cuda/CellList_gpu.hpp @@ -356,15 +356,12 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> { #ifdef __NVCC__ - CUDA_SAFE() - // Than we construct the ids auto ite_gpu = pl.getGPUIteratorTo(stop-start-1); cl_n.resize(this->gr_cell.size()+1); cl_n.template fill<0>(0); -// CUDA_SAFE(cudaMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(cnt_type))); part_ids.resize(stop - start); diff --git a/src/Point_test.hpp b/src/Point_test.hpp index c41e0429ee1a321ab6f80acfa76324fbcc76906f..ede644fa7c19308f6103366d3c3cdcb796b7e1ec 100755 --- a/src/Point_test.hpp +++ b/src/Point_test.hpp @@ -335,7 +335,7 @@ public: * \return this * */ - inline Point_test<T> operator= (const Point_test<T> & p) + __device__ __host__ inline Point_test<T> & operator= (const Point_test<T> & p) { boost::fusion::at_c<0>(data) = boost::fusion::at_c<0>(p.data); boost::fusion::at_c<1>(data) = boost::fusion::at_c<1>(p.data); diff --git a/src/Space/SpaceBox.hpp b/src/Space/SpaceBox.hpp index 0b4fdb8842c2f18009bb6f18697d13320d96edca..b595b3e26bda5f1c78d8ddd1a40b8700dbefebf1 100644 --- a/src/Space/SpaceBox.hpp +++ b/src/Space/SpaceBox.hpp @@ -211,7 +211,7 @@ class SpaceBox : public Box<dim,T> Point<dim,T> p; for (size_t i = 0 ; i < dim ; i++) - p.get(i) = ((T)rand())/RAND_MAX * (this->getHigh(i) - this->getLow(i)) + this->getLow(i); + p.get(i) = ((T)rand())/(T)RAND_MAX * (this->getHigh(i) - this->getLow(i)) + this->getLow(i); return p; } diff --git a/src/SparseGrid/SparseGrid_chunk_copy.hpp 
b/src/SparseGrid/SparseGrid_chunk_copy.hpp index 41b8ff729dc227a245ec9dd00d0b0682d06007eb..8fe4d603a0af628f87275bc144c9995b0a8a9ad3 100644 --- a/src/SparseGrid/SparseGrid_chunk_copy.hpp +++ b/src/SparseGrid/SparseGrid_chunk_copy.hpp @@ -8,9 +8,11 @@ #ifndef SPARSEGRID_CHUNK_COPY_HPP_ #define SPARSEGRID_CHUNK_COPY_HPP_ -#if !defined(__NVCC__) || defined(CUDA_ON_CPU) +#if !defined(__NVCC__) || defined(CUDA_ON_CPU) || defined(__HIP__) +// Nvcc does not like VC ... for some reason #include <Vc/Vc> #endif + #include "util/mathutil.hpp" diff --git a/src/SparseGrid/SparseGrid_conv_opt.hpp b/src/SparseGrid/SparseGrid_conv_opt.hpp index 7d50d97fb3d80c420ac00613a4bbcee064ebd7e8..334efffdc85a35675d939a0e4900c2e46969cbef 100644 --- a/src/SparseGrid/SparseGrid_conv_opt.hpp +++ b/src/SparseGrid/SparseGrid_conv_opt.hpp @@ -110,7 +110,7 @@ struct conv_impl } }; -#if !defined(__NVCC__) || defined(CUDA_ON_CPU) +#if !defined(__NVCC__) || defined(CUDA_ON_CPU) || defined(__HIP__) template<unsigned int dir,int p, unsigned int prop_src1,typename chunk_type, typename vect_type, typename ids_type> diff --git a/src/SparseGridGpu/SparseGridGpu_kernels.cuh b/src/SparseGridGpu/SparseGridGpu_kernels.cuh index 33edebbf6987a2a555eda1053903086e3ced7739..bd45286de1410015b54e8fc2d9123d654cdc8d8b 100644 --- a/src/SparseGridGpu/SparseGridGpu_kernels.cuh +++ b/src/SparseGridGpu/SparseGridGpu_kernels.cuh @@ -1076,7 +1076,7 @@ namespace SparseGridGpuKernels #ifdef SE_CLASS1 if (numCnt > blockDim.x) - {printf("Error calc_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,blockDim.x);} + {printf("Error calc_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,(int)blockDim.x);} #endif @@ -1164,7 +1164,7 @@ namespace SparseGridGpuKernels #ifdef SE_CLASS1 if (numCnt > blockDim.x) - {printf("Error get_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,blockDim.x);} + {printf("Error get_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,(int)blockDim.x);} #endif diff --git a/src/Vector/cuda/map_vector_cuda_funcs_tests.cu b/src/Vector/cuda/map_vector_cuda_funcs_tests.cu index 907d29c60c382f669f12492eeef9398d57d9cb5b..01aec66f7b07f25262493ab9614b500315106e9a 100644 --- a/src/Vector/cuda/map_vector_cuda_funcs_tests.cu +++ b/src/Vector/cuda/map_vector_cuda_funcs_tests.cu @@ -8,7 +8,7 @@ #define BOOST_GPU_ENABLED __host__ __device__ -#include "util/cudify/cudify.hpp" +#include "util/cuda_launch.hpp" #include "config.h" #define BOOST_TEST_DYN_LINK diff --git a/src/Vector/cuda/map_vector_cuda_ker.cuh b/src/Vector/cuda/map_vector_cuda_ker.cuh index d6ebfd8d68ff2fe1ba444388954c91add026ce61..d5a0df7c9038766aafa641a315321321213d8cbc 100644 --- a/src/Vector/cuda/map_vector_cuda_ker.cuh +++ b/src/Vector/cuda/map_vector_cuda_ker.cuh @@ -453,7 +453,7 @@ namespace openfpm * * */ - __host__ ite_gpu<1> getGPUIterator(size_t n_thr = 1024) const + __host__ ite_gpu<1> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<1> start(0); grid_key_dx<1> stop(size()-1); @@ -465,7 +465,7 @@ namespace openfpm * * */ - ite_gpu<1> getGPUIteratorTo(size_t stop, size_t n_thr = 1024) const + ite_gpu<1> getGPUIteratorTo(size_t stop, size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<1> start(0); grid_key_dx<1> stop_(stop); @@ -478,7 +478,19 @@ namespace openfpm * \param object to copy * */ - vector_gpu_ker<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v) + __device__ vector_gpu_ker<T,layout_base> & 
operator=(const vector_gpu_ker<T,layout_base> & v) + { + printf("Error vector_gpu_ker the operator= is not defined in device code\n"); + + return *this; + } + + /*! \brief operator= this operator absorb the pointers, consider that this object wrap device pointers + * + * \param object to copy + * + */ + __host__ vector_gpu_ker<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v) { v_size = v.v_size; base = v.base; diff --git a/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh b/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh index 28d3963ca7d116ecbb65c4a165335a1266f2cba0..290e2632def4feab2905aa05dbf87729d5493178 100644 --- a/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh +++ b/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh @@ -10,14 +10,15 @@ #ifdef __NVCC__ +#include "config.h" + #if CUDART_VERSION < 11000 #include "util/cuda/cub_old/util_type.cuh" #include "util/cuda/cub_old/block/block_scan.cuh" #include "util/cuda/moderngpu/operators.hxx" +#include "util/cuda/cuda_launch.hpp" #else - #if !defined(CUDA_ON_CPU) - #include "cub/util_type.cuh" - #include "cub/block/block_scan.cuh" + #if !defined(CUDA_ON_CPU) #include "util/cuda/moderngpu/operators.hxx" #endif #endif diff --git a/src/Vector/map_vector.hpp b/src/Vector/map_vector.hpp index 1118c76ebdd3cc9537ab5af87c4c4d72efb2dfea..e0243c539e7de741e849a9092a7f217ad90ce080 100644 --- a/src/Vector/map_vector.hpp +++ b/src/Vector/map_vector.hpp @@ -1448,7 +1448,6 @@ namespace openfpm #endif } - return *this; } @@ -1680,7 +1679,7 @@ namespace openfpm * * */ - ite_gpu<1> getGPUIteratorTo(long int stop, size_t n_thr = 1024) const + ite_gpu<1> getGPUIteratorTo(long int stop, size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<1,long int> start(0); grid_key_dx<1,long int> stop_(stop); @@ -1708,7 +1707,7 @@ namespace openfpm * * */ - ite_gpu<1> getGPUIterator(size_t n_thr = 1024) const + ite_gpu<1> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<1> start(0); grid_key_dx<1> stop(size()-1); diff --git a/src/Vector/map_vector_std.hpp b/src/Vector/map_vector_std.hpp index 2d4a875fcec92d3d7870efc5a6cea211d0d9be7f..9843172032765f37aa1fb3195ae5842d3d73217a 100644 --- a/src/Vector/map_vector_std.hpp +++ b/src/Vector/map_vector_std.hpp @@ -676,7 +676,21 @@ public: * \return itself * */ - vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & operator=(const vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & v) + __device__ vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & operator=(const vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & v) + { + printf("Error openfpm::vector operator= cannot be used on device"); + + return *this; + } + + /*! 
\brief Operator= copy the vector into another + * + * \param v vector to copy + * + * \return itself + * + */ + __host__ vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & operator=(const vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & v) { base = v.base; diff --git a/src/config/config_cmake.h.in b/src/config/config_cmake.h.in index 01c196978f3feb7cb93fd2bc3d0aec46446a3162..2c956fc29ec06f81a30e381865d2c5fc795e31ee 100644 --- a/src/config/config_cmake.h.in +++ b/src/config/config_cmake.h.in @@ -4,6 +4,12 @@ ${DEFINE_COVERTY_SCAN} /* GPU support */ ${DEFINE_CUDA_GPU} +/* HIP GPU support */ +${DEFINE_HIP_GPU} + +/* HIP Cudify GPU support */ +${DEFINE_CUDIFY_USE_HIP} + /* Debug */ ${DEFINE_DEBUG} /**/ diff --git a/src/timer.hpp b/src/timer.hpp index 787223a87a5d0b99988195179e317eab8c080a20..f87a0e1cff42bd94268d70791de7ea96d167d792 100644 --- a/src/timer.hpp +++ b/src/timer.hpp @@ -42,14 +42,22 @@ class timer clock_t cstop; #if defined(__NVCC__) && !defined(CUDA_ON_CPU) - cudaEvent_t start_g, stop_g; + #ifdef __HIP__ + hipEvent_t start_g, stop_g; + #else + cudaEvent_t start_g, stop_g; + #endif #endif // Fill the stop point void check() { -#if defined(SYNC_BEFORE_TAKE_TIME) && defined(__NVCC__) +#if defined(SYNC_BEFORE_TAKE_TIME) && defined(__NVCC__) + #ifdef __HIP__ + hipDeviceSynchronize(); + #else cudaDeviceSynchronize(); + #endif #endif #ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time @@ -153,21 +161,37 @@ public: void startGPU() { + #ifdef __HIP__ + hipEventCreate(&start_g); + hipEventRecord(start_g,0); + #else cudaEventCreate(&start_g); cudaEventRecord(start_g,0); + #endif } void stopGPU() { - cudaEventCreate(&stop_g); - cudaEventRecord(stop_g,0); - cudaEventSynchronize(stop_g); + #ifdef __HIP__ + hipEventCreate(&stop_g); + hipEventRecord(stop_g,0); + hipEventSynchronize(stop_g); + #else + cudaEventCreate(&stop_g); + cudaEventRecord(stop_g,0); + cudaEventSynchronize(stop_g); + #endif } double getwctGPU() { float elapsedTime; + + #ifdef __HIP__ + hipEventElapsedTime(&elapsedTime, start_g,stop_g); + #else cudaEventElapsedTime(&elapsedTime, start_g,stop_g); + #endif return elapsedTime; } diff --git a/src/util/cuda/modern_gpu_tests.cu b/src/util/cuda/modern_gpu_tests.cu index 167a7103c6afac34e675b9f40450375facb633df..ddd8d8b42404cb04139c5ecee733c5117b47d03a 100644 --- a/src/util/cuda/modern_gpu_tests.cu +++ b/src/util/cuda/modern_gpu_tests.cu @@ -15,7 +15,7 @@ BOOST_AUTO_TEST_SUITE( modern_gpu_tests ) -BOOST_AUTO_TEST_CASE( modern_gpu_transform_lbs ) +BOOST_AUTO_TEST_CASE( modern_gpu_loadbalance_lbs ) { std::cout << "Test modern gpu test tansform_lbs" << "\n"; diff --git a/src/util/cuda/moderngpu/transform.hxx b/src/util/cuda/moderngpu/transform.hxx index 617394d03af26eb10d5a10ecbd4d3c5858aa1519..10905dcd6466399c83728e5c766c5fabccf57fd8 100644 --- a/src/util/cuda/moderngpu/transform.hxx +++ b/src/util/cuda/moderngpu/transform.hxx @@ -3,7 +3,9 @@ #include <random> #include <algorithm> +#ifndef __HIP__ #include <cuda.h> +#endif #include "launch_box.hxx" BEGIN_MGPU_NAMESPACE diff --git a/src/util/cuda/ofp_context.hxx b/src/util/cuda/ofp_context.hxx index 47d5327e11be0adc0067f49f04efb8a2482dff09..cd1607e88e0ebf26a2cf7853a2838777643d172c 100644 --- a/src/util/cuda/ofp_context.hxx +++ b/src/util/cuda/ofp_context.hxx @@ -151,7 +151,7 @@ namespace mgpu void init(int dev_num, gpu_context_opt opt) { cudaFuncAttributes attr; - cudaError_t result = cudaFuncGetAttributes(&attr, dummy_k<0>); + cudaError_t result = cudaFuncGetAttributes(&attr, 
(void *)dummy_k<0>); if(cudaSuccess != result) throw cuda_exception_t(result); _ptx_version = attr.ptxVersion; diff --git a/src/util/cuda/scan_ofp.cuh b/src/util/cuda/scan_ofp.cuh index 7e5c7eb60fea8b77d020f3d482cc3e48d1b36e87..96cdc884d0791086996896ba0220308a596cac15 100644 --- a/src/util/cuda/scan_ofp.cuh +++ b/src/util/cuda/scan_ofp.cuh @@ -15,7 +15,11 @@ #if CUDART_VERSION >= 11000 #ifndef CUDA_ON_CPU // Here we have for sure CUDA >= 11 - #include "cub/cub.cuh" + #ifdef __HIP__ + #include "hipcub/hipcub.hpp" + #else + #include "cub/cub.cuh" + #endif #ifndef SCAN_WITH_CUB #define SCAN_WITH_CUB #endif @@ -48,19 +52,39 @@ namespace openfpm #else #ifdef SCAN_WITH_CUB - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input, - output, - count); + #ifdef __HIP__ + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input, + output, + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + // Run + hipcub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, + output, + count); + + #else + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input, + output, + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); + // Run + cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, + output, + count); - // Run - cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, - output, - count); + #endif #else mgpu::scan(input,count,output,context); diff --git a/src/util/cuda/sort_ofp.cuh b/src/util/cuda/sort_ofp.cuh index 32002952b3c7dca4525f3dca0652fb486b2a15cb..ae2fba9f9acaa47f5259e224dc96f29903df2da8 100644 --- a/src/util/cuda/sort_ofp.cuh +++ b/src/util/cuda/sort_ofp.cuh @@ -16,7 +16,11 @@ #if CUDART_VERSION >= 11000 #ifndef CUDA_ON_CPU // Here we have for sure CUDA >= 11 - #include "cub/cub.cuh" + #ifdef __HIP__ + #include "hipcub/hipcub.hpp" + #else + #include "cub/cub.cuh" + #endif #ifndef SORT_WITH_CUB #define SORT_WITH_CUB #endif @@ -248,66 +252,134 @@ namespace openfpm #ifdef SORT_WITH_CUB - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - - auto & temporal2 = context.getTemporalCUB2(); - temporal2.resize(sizeof(key_t)*count); - - auto & temporal3 = context.getTemporalCUB3(); - temporal3.resize(sizeof(val_t)*count); - - if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true) - { - cub::DeviceRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); - - d_temp_storage = temporal.template getDeviceBuffer<0>(); - - // Run - cub::DeviceRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - } - else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true) - { - cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t 
*)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); - - d_temp_storage = temporal.template getDeviceBuffer<0>(); - - // Run - cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - } - - cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); - cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); + #ifdef __HIP__ + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + auto & temporal2 = context.getTemporalCUB2(); + temporal2.resize(sizeof(key_t)*count); + + auto & temporal3 = context.getTemporalCUB3(); + temporal3.resize(sizeof(val_t)*count); + + if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true) + { + hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true) + { + hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + + cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); + cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); + + + #else + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + auto & temporal2 = context.getTemporalCUB2(); + temporal2.resize(sizeof(key_t)*count); + + auto & temporal3 = context.getTemporalCUB3(); + temporal3.resize(sizeof(val_t)*count); + + if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true) + { + cub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + cub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + else if (std::is_same<mgpu::template 
greater_t<key_t>,comp_t>::value == true) + { + cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + + cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); + cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); + + #endif #else mgpu::mergesort(keys_input,vals_input,count,comp,context); diff --git a/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp b/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp index 744efeacd19e7df2518ec103884e3d1552e6656c..442aa1f3200b075b9266400fdb58855fbdce7771 100644 --- a/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp +++ b/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp @@ -119,7 +119,7 @@ public: inline __host__ __device__ size_type size() const { return boost::mpl::at<vector,boost::mpl::int_<0>>::type::value; } size_type max_size() const { return num_elements(); } bool empty() const { return size() == 0; } - size_type num_dimensions() const { return NumDims; } + inline __device__ __host__ size_type num_dimensions() const { return NumDims; } inline __host__ __device__ const index* strides() const { return strides_; } size_type num_elements() const
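
Note, for context only (not part of the patch): the timer.hpp hunk above and most of the other changes follow one pattern, an `#ifdef __HIP__` branch that swaps each CUDA runtime call for its HIP equivalent so the same "cudified" sources build with hipcc. Below is a minimal, self-contained sketch of that event-based timing pattern under the same assumption; the helper name elapsed_gpu_ms and its structure are illustrative and do not appear in this patch.

    // Sketch of the __HIP__ / CUDA event-timing dispatch used by timer.hpp.
    #ifdef __HIP__
    #include <hip/hip_runtime.h>
    #else
    #include <cuda_runtime.h>
    #endif

    // Returns elapsed GPU time in milliseconds around a (hypothetical) kernel launch.
    inline float elapsed_gpu_ms()
    {
    #ifdef __HIP__
        hipEvent_t start, stop;
        hipEventCreate(&start);
        hipEventCreate(&stop);
        hipEventRecord(start, 0);
        // ... launch kernels here ...
        hipEventRecord(stop, 0);
        hipEventSynchronize(stop);       // wait for the stop event before reading the time
        float ms = 0.0f;
        hipEventElapsedTime(&ms, start, stop);
        hipEventDestroy(start);
        hipEventDestroy(stop);
        return ms;
    #else
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        // ... launch kernels here ...
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);      // wait for the stop event before reading the time
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        return ms;
    #endif
    }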