From 640054a3b219bd7c612a9f8e21eed72cd344e3ac Mon Sep 17 00:00:00 2001 From: Pietro Incardona <incardon@mpi-cbg.de> Date: Thu, 1 Apr 2021 20:27:44 +0200 Subject: [PATCH] HIP moving on --- CMakeLists.txt | 15 +- configure | 4 + src/CMakeLists.txt | 50 ++++- src/Grid/cuda/cuda_grid_gpu_tests.cu | 28 +++ src/Grid/cuda/map_grid_cuda_ker.cuh | 2 +- src/Grid/grid_base_implementation.hpp | 2 +- src/Grid/grid_sm.hpp | 8 +- src/Grid/map_grid.hpp | 14 +- src/NN/CellList/CellList_gpu_test.cu | 3 + src/NN/CellList/CellList_test.hpp | 2 +- src/NN/CellList/cuda/CellList_gpu.hpp | 3 - src/Point_test.hpp | 2 +- src/Space/SpaceBox.hpp | 2 +- src/SparseGrid/SparseGrid_chunk_copy.hpp | 4 +- src/SparseGrid/SparseGrid_conv_opt.hpp | 2 +- src/SparseGridGpu/SparseGridGpu_kernels.cuh | 4 +- .../cuda/map_vector_cuda_funcs_tests.cu | 2 +- src/Vector/cuda/map_vector_cuda_ker.cuh | 18 +- .../cuda/map_vector_sparse_cuda_kernels.cuh | 7 +- src/Vector/map_vector.hpp | 5 +- src/Vector/map_vector_std.hpp | 16 +- src/config/config_cmake.h.in | 6 + src/timer.hpp | 34 ++- src/util/cuda/modern_gpu_tests.cu | 2 +- src/util/cuda/moderngpu/transform.hxx | 2 + src/util/cuda/ofp_context.hxx | 2 +- src/util/cuda/scan_ofp.cuh | 48 +++-- src/util/cuda/sort_ofp.cuh | 194 ++++++++++++------ .../multi_array_ref_subarray_openfpm.hpp | 2 +- 29 files changed, 362 insertions(+), 121 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 57e0ebf8..8acda874 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ if (POLICY CMP0074) cmake_policy(SET CMP0074 OLD) endif () -list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/) +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/ /opt/rocm/hip/cmake) set(BOOST_INCLUDE ${Boost_INCLUDE_DIR} CACHE PATH "Include directory for BOOST") set(LIBHILBERT_ROOT CACHE PATH "LibHilbert root path") @@ -15,6 +15,8 @@ set(ENABLE_GPU CACHE BOOL "Disable the GPU code independently that a cuda compil set(TEST_PERFORMANCE CACHE BOOL "Enable test performance") set(ALPAKA_ROOT CACHE PATH "Alpaka root path") set(CUDA_ON_CPU CACHE BOOL "Make Cuda work on heap") +set(HIP_ENABLE CACHE BOOL "Enable HIP compiler") +set(AMD_ARCH_COMPILE "gfx900" CACHE STRING "AMD gpu architecture used to compile kernels") if (ENABLE_GPU) set(CUDA_ON_CPU OFF) @@ -31,7 +33,7 @@ message("Searching Vc in ${Vc_DIR}") find_package(Boost 1.72.0 REQUIRED COMPONENTS unit_test_framework iostreams program_options system filesystem OPTIONAL_COMPONENTS fiber context) find_package(LibHilbert REQUIRED) find_package(Vc REQUIRED) - +find_package(HIP) ###### CONFIG.h FILE ###### @@ -47,6 +49,11 @@ if(TEST_PERFORMANCE) set(DEFINE_PERFORMANCE_TEST "#define PERFORMANCE_TEST") endif() +if(HIP_FOUND) + set(DEFINE_HIP_GPU "#define HIP_GPU") + set(DEFINE_CUDIFY_USE_HIP "#define CUDIFY_USE_HIP") +endif() + if (Boost_FOUND) set(DEFINE_HAVE_BOOST "#define HAVE_BOOST") set(DEFINE_HAVE_BOOST_IOSTREAMS "#define HAVE_BOOST_IOSTREAMS") @@ -73,6 +80,10 @@ if(CUDA_ON_CPU) set(DEFINE_CUDA_GPU "#define CUDA_GPU") endif() +if(HIP_FOUND) + set(DEFINE_CUDA_GPU "#define CUDA_GPU") +endif() + if(LIBHILBERT_FOUND) set(DEFINE_HAVE_LIBHILBERT "#define HAVE_LIBHILBERT 1") else() diff --git a/configure b/configure index 878cf86f..e00aff0e 100755 --- a/configure +++ b/configure @@ -122,6 +122,7 @@ with_petsc with_eigen with_vcdevel enable_gpu +enable_hip enable_asan ' @@ -258,6 +259,9 @@ do fi conf_options="$conf_options -DENABLE_GPU=ON" ;; + hip) + conf_options="$conf_options -DHIP_ENABLE=ON" + ;; asan) conf_options="$conf_options 
-DENABLE_ASAN=ON" ;; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fc99362d..eca74816 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -21,7 +21,7 @@ if (TEST_PERFORMANCE) Vector/performance/vector_performance_test.cu) endif () -if (CUDA_FOUND OR CUDA_ON_CPU) +if (CUDA_FOUND OR CUDA_ON_CPU OR HIP_FOUND) set(CUDA_SOURCES ${CUDA_SOURCES} Vector/map_vector_sparse_unit_tests.cu Vector/vector_gpu_unit_tests.cu @@ -45,7 +45,8 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") endif() if (CUDA_ON_CPU) - add_definitions(-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000) + + add_definitions(-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 ) set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX) set_source_files_properties(isolation.cu PROPERTIES LANGUAGE CXX) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Intel") @@ -53,18 +54,47 @@ if (CUDA_ON_CPU) endif() endif() -add_executable(mem_map ${CUDA_SOURCES} - main.cpp - data_type/aggregate_unit_tests.cpp - util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp - memory_ly/memory_conf_unit_tests.cpp - Space/tests/SpaceBox_unit_tests.cpp - Space/Shape/Sphere_unit_test.cpp +if ( HIP_ENABLE AND HIP_FOUND ) + + list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) + + if (CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND HIP_HIPCC_FLAGS -O0) + endif() + + add_definitions(-D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0) + set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX) + + set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) + + hip_add_executable(mem_map ${CUDA_SOURCES} + main.cpp + data_type/aggregate_unit_tests.cpp + util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp + memory_ly/memory_conf_unit_tests.cpp + Space/tests/SpaceBox_unit_tests.cpp + Space/Shape/Sphere_unit_test.cpp + SparseGrid/SparseGrid_unit_tests.cpp + SparseGrid/SparseGrid_chunk_copy_unit_tests.cpp + Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp + Grid/Geometry/tests/grid_smb_tests.cpp) + +else() + + add_executable(mem_map ${CUDA_SOURCES} + main.cpp + data_type/aggregate_unit_tests.cpp + util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp + memory_ly/memory_conf_unit_tests.cpp + Space/tests/SpaceBox_unit_tests.cpp + Space/Shape/Sphere_unit_test.cpp SparseGrid/SparseGrid_unit_tests.cpp SparseGrid/SparseGrid_chunk_copy_unit_tests.cpp - Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp + Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp Grid/Geometry/tests/grid_smb_tests.cpp) +endif() + set_property(TARGET mem_map PROPERTY CUDA_ARCHITECTURES 60 75) if (CUDA_FOUND) diff --git a/src/Grid/cuda/cuda_grid_gpu_tests.cu b/src/Grid/cuda/cuda_grid_gpu_tests.cu index d26103e0..bb0bb4af 100644 --- a/src/Grid/cuda/cuda_grid_gpu_tests.cu +++ b/src/Grid/cuda/cuda_grid_gpu_tests.cu @@ -32,6 +32,18 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func) auto gcf = c3.getGPUIterator(k1,k2); +#ifdef __HIP__ + + BOOST_REQUIRE_EQUAL(gcf.thr.x,8ul); + BOOST_REQUIRE_EQUAL(gcf.thr.y,8ul); + BOOST_REQUIRE_EQUAL(gcf.thr.z,4ul); + + BOOST_REQUIRE_EQUAL(gcf.wthr.x,8ul); + BOOST_REQUIRE_EQUAL(gcf.wthr.y,8ul); + BOOST_REQUIRE_EQUAL(gcf.wthr.z,16ul); + +#else + BOOST_REQUIRE_EQUAL(gcf.thr.x,16ul); BOOST_REQUIRE_EQUAL(gcf.thr.y,8ul); BOOST_REQUIRE_EQUAL(gcf.thr.z,8ul); @@ -40,12 +52,26 @@ BOOST_AUTO_TEST_CASE 
(gpu_computation_func) BOOST_REQUIRE_EQUAL(gcf.wthr.y,8ul); BOOST_REQUIRE_EQUAL(gcf.wthr.z,8ul); +#endif + grid_key_dx<3> k3({50,50,50}); grid_key_dx<3> k4({62,62,62}); grid_key_dx<3> k5({60,61,62}); auto gcf2 = c3.getGPUIterator(k3,k4); +#ifdef __HIP__ + + BOOST_REQUIRE_EQUAL(gcf2.thr.x,8ul); + BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul); + BOOST_REQUIRE_EQUAL(gcf2.thr.z,4ul); + + BOOST_REQUIRE_EQUAL(gcf2.wthr.x,2ul); + BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul); + BOOST_REQUIRE_EQUAL(gcf2.wthr.z,4ul); + +#else + BOOST_REQUIRE_EQUAL(gcf2.thr.x,13ul); BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul); BOOST_REQUIRE_EQUAL(gcf2.thr.z,8ul); @@ -54,6 +80,8 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func) BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul); BOOST_REQUIRE_EQUAL(gcf2.wthr.z,2ul); +#endif + gcf2 = c3.getGPUIterator(k3,k4,511); BOOST_REQUIRE_EQUAL(gcf2.thr.x,8ul); diff --git a/src/Grid/cuda/map_grid_cuda_ker.cuh b/src/Grid/cuda/map_grid_cuda_ker.cuh index 6a2f9fbf..d27aeab0 100644 --- a/src/Grid/cuda/map_grid_cuda_ker.cuh +++ b/src/Grid/cuda/map_grid_cuda_ker.cuh @@ -421,7 +421,7 @@ public: * \param stop end point * */ - struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = 1024) const + struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = default_kernel_wg_threads_) const { return getGPUIterator_impl<dim>(g1,key1,key2,n_thr); } diff --git a/src/Grid/grid_base_implementation.hpp b/src/Grid/grid_base_implementation.hpp index 9ff639d6..cb615f90 100644 --- a/src/Grid/grid_base_implementation.hpp +++ b/src/Grid/grid_base_implementation.hpp @@ -730,7 +730,7 @@ public: * \param stop end point * */ - struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim,long int> & key1, grid_key_dx<dim,long int> & key2, size_t n_thr = 1024) const + struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim,long int> & key1, grid_key_dx<dim,long int> & key2, size_t n_thr = default_kernel_wg_threads_) const { return getGPUIterator_impl<dim>(g1,key1,key2,n_thr); } diff --git a/src/Grid/grid_sm.hpp b/src/Grid/grid_sm.hpp index ed1e1d9e..f49a056e 100755 --- a/src/Grid/grid_sm.hpp +++ b/src/Grid/grid_sm.hpp @@ -125,7 +125,7 @@ struct ite_gpu template<unsigned int N, typename T> class grid_sm; template<unsigned int dim, typename T2, typename T> -ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, size_t n_thr = 1024); +ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, size_t n_thr = default_kernel_wg_threads_); //! Declaration print_warning_on_adjustment template <unsigned int dim, typename linearizer> class print_warning_on_adjustment; @@ -733,7 +733,7 @@ public: return grid_key_dx_iterator_sub<N>(*this,start,stop); } -#ifdef CUDA_GPU +#if defined(CUDA_GPU) /*! 
\brief Get an iterator for the GPU * @@ -742,7 +742,7 @@ public: * */ template<typename T2> - struct ite_gpu<N> getGPUIterator(const grid_key_dx<N,T2> & key1, const grid_key_dx<N,T2> & key2, size_t n_thr = 1024) const + struct ite_gpu<N> getGPUIterator(const grid_key_dx<N,T2> & key1, const grid_key_dx<N,T2> & key2, size_t n_thr = default_kernel_wg_threads_) const { return getGPUIterator_impl<N>(*this,key1,key2,n_thr); } @@ -753,7 +753,7 @@ public: * \param stop end point * */ - struct ite_gpu<N> getGPUIterator(size_t n_thr = 1024) const + struct ite_gpu<N> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<N> k1; grid_key_dx<N> k2; diff --git a/src/Grid/map_grid.hpp b/src/Grid/map_grid.hpp index e6965bae..299bb85e 100755 --- a/src/Grid/map_grid.hpp +++ b/src/Grid/map_grid.hpp @@ -190,7 +190,19 @@ public: * \param g grid to copy * */ - grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g) + __device__ grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g) + { + printf("Error grid_base operator= is not defined in device code\n"); + + return *this; + } + + /*! \brief It copy a grid + * + * \param g grid to copy + * + */ + __host__ grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g) { (static_cast<grid_base_impl<dim,T,S, memory_traits_lin> *>(this))->swap(g.duplicate()); diff --git a/src/NN/CellList/CellList_gpu_test.cu b/src/NN/CellList/CellList_gpu_test.cu index 364d2e0b..9f323818 100644 --- a/src/NN/CellList/CellList_gpu_test.cu +++ b/src/NN/CellList/CellList_gpu_test.cu @@ -1363,6 +1363,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props); cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m); + auto & s_t_ns = cl2.getSortToNonSort(); pl.template hostToDevice<0>(); @@ -1371,6 +1372,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di // Domain particles + auto & gdsi = cl2.getDomainSortIds(); gdsi.template deviceToHost<0>(); s_t_ns.template deviceToHost<0>(); @@ -1497,6 +1499,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di } BOOST_REQUIRE_EQUAL(check,true); + } } diff --git a/src/NN/CellList/CellList_test.hpp b/src/NN/CellList/CellList_test.hpp index 1336823e..e93bb76f 100644 --- a/src/NN/CellList/CellList_test.hpp +++ b/src/NN/CellList/CellList_test.hpp @@ -395,7 +395,7 @@ template<unsigned int dim, typename T, typename CellS> void Test_NN_iterator_rad for (size_t i = 0 ; i < dim ; i++) { - vrp.template get<0>(j)[i] = ((float)rand() / RAND_MAX)*(box.getHigh(i) - box.getLow(i)) + box.getLow(i); + vrp.template get<0>(j)[i] = ((float)rand() / (float)RAND_MAX)*(box.getHigh(i) - box.getLow(i)) + box.getLow(i); } } diff --git a/src/NN/CellList/cuda/CellList_gpu.hpp b/src/NN/CellList/cuda/CellList_gpu.hpp index 5537ec0f..4913bd4a 100644 --- a/src/NN/CellList/cuda/CellList_gpu.hpp +++ b/src/NN/CellList/cuda/CellList_gpu.hpp @@ -356,15 +356,12 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> { #ifdef __NVCC__ - CUDA_SAFE() - // Than we construct the ids auto ite_gpu = pl.getGPUIteratorTo(stop-start-1); cl_n.resize(this->gr_cell.size()+1); cl_n.template fill<0>(0); -// CUDA_SAFE(cudaMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(cnt_type))); part_ids.resize(stop - start); diff --git a/src/Point_test.hpp b/src/Point_test.hpp index c41e0429..ede644fa 100755 --- a/src/Point_test.hpp +++ b/src/Point_test.hpp @@ -335,7 +335,7 @@ public: * \return this * 
*/ - inline Point_test<T> operator= (const Point_test<T> & p) + __device__ __host__ inline Point_test<T> & operator= (const Point_test<T> & p) { boost::fusion::at_c<0>(data) = boost::fusion::at_c<0>(p.data); boost::fusion::at_c<1>(data) = boost::fusion::at_c<1>(p.data); diff --git a/src/Space/SpaceBox.hpp b/src/Space/SpaceBox.hpp index 0b4fdb88..b595b3e2 100644 --- a/src/Space/SpaceBox.hpp +++ b/src/Space/SpaceBox.hpp @@ -211,7 +211,7 @@ class SpaceBox : public Box<dim,T> Point<dim,T> p; for (size_t i = 0 ; i < dim ; i++) - p.get(i) = ((T)rand())/RAND_MAX * (this->getHigh(i) - this->getLow(i)) + this->getLow(i); + p.get(i) = ((T)rand())/(T)RAND_MAX * (this->getHigh(i) - this->getLow(i)) + this->getLow(i); return p; } diff --git a/src/SparseGrid/SparseGrid_chunk_copy.hpp b/src/SparseGrid/SparseGrid_chunk_copy.hpp index 41b8ff72..8fe4d603 100644 --- a/src/SparseGrid/SparseGrid_chunk_copy.hpp +++ b/src/SparseGrid/SparseGrid_chunk_copy.hpp @@ -8,9 +8,11 @@ #ifndef SPARSEGRID_CHUNK_COPY_HPP_ #define SPARSEGRID_CHUNK_COPY_HPP_ -#if !defined(__NVCC__) || defined(CUDA_ON_CPU) +#if !defined(__NVCC__) || defined(CUDA_ON_CPU) || defined(__HIP__) +// Nvcc does not like VC ... for some reason #include <Vc/Vc> #endif + #include "util/mathutil.hpp" diff --git a/src/SparseGrid/SparseGrid_conv_opt.hpp b/src/SparseGrid/SparseGrid_conv_opt.hpp index 7d50d97f..334efffd 100644 --- a/src/SparseGrid/SparseGrid_conv_opt.hpp +++ b/src/SparseGrid/SparseGrid_conv_opt.hpp @@ -110,7 +110,7 @@ struct conv_impl } }; -#if !defined(__NVCC__) || defined(CUDA_ON_CPU) +#if !defined(__NVCC__) || defined(CUDA_ON_CPU) || defined(__HIP__) template<unsigned int dir,int p, unsigned int prop_src1,typename chunk_type, typename vect_type, typename ids_type> diff --git a/src/SparseGridGpu/SparseGridGpu_kernels.cuh b/src/SparseGridGpu/SparseGridGpu_kernels.cuh index 33edebbf..bd45286d 100644 --- a/src/SparseGridGpu/SparseGridGpu_kernels.cuh +++ b/src/SparseGridGpu/SparseGridGpu_kernels.cuh @@ -1076,7 +1076,7 @@ namespace SparseGridGpuKernels #ifdef SE_CLASS1 if (numCnt > blockDim.x) - {printf("Error calc_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,blockDim.x);} + {printf("Error calc_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,(int)blockDim.x);} #endif @@ -1164,7 +1164,7 @@ namespace SparseGridGpuKernels #ifdef SE_CLASS1 if (numCnt > blockDim.x) - {printf("Error get_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,blockDim.x);} + {printf("Error get_exist_points_with_boxes assertion failed numCnt >= blockDim.x %d %d \n",numCnt,(int)blockDim.x);} #endif diff --git a/src/Vector/cuda/map_vector_cuda_funcs_tests.cu b/src/Vector/cuda/map_vector_cuda_funcs_tests.cu index 907d29c6..01aec66f 100644 --- a/src/Vector/cuda/map_vector_cuda_funcs_tests.cu +++ b/src/Vector/cuda/map_vector_cuda_funcs_tests.cu @@ -8,7 +8,7 @@ #define BOOST_GPU_ENABLED __host__ __device__ -#include "util/cudify/cudify.hpp" +#include "util/cuda_launch.hpp" #include "config.h" #define BOOST_TEST_DYN_LINK diff --git a/src/Vector/cuda/map_vector_cuda_ker.cuh b/src/Vector/cuda/map_vector_cuda_ker.cuh index d6ebfd8d..d5a0df7c 100644 --- a/src/Vector/cuda/map_vector_cuda_ker.cuh +++ b/src/Vector/cuda/map_vector_cuda_ker.cuh @@ -453,7 +453,7 @@ namespace openfpm * * */ - __host__ ite_gpu<1> getGPUIterator(size_t n_thr = 1024) const + __host__ ite_gpu<1> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<1> start(0); grid_key_dx<1> stop(size()-1); @@ 
-465,7 +465,7 @@ namespace openfpm * * */ - ite_gpu<1> getGPUIteratorTo(size_t stop, size_t n_thr = 1024) const + ite_gpu<1> getGPUIteratorTo(size_t stop, size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<1> start(0); grid_key_dx<1> stop_(stop); @@ -478,7 +478,19 @@ namespace openfpm * \param object to copy * */ - vector_gpu_ker<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v) + __device__ vector_gpu_ker<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v) + { + printf("Error vector_gpu_ker the operator= is not defined in device code\n"); + + return *this; + } + + /*! \brief operator= this operator absorb the pointers, consider that this object wrap device pointers + * + * \param object to copy + * + */ + __host__ vector_gpu_ker<T,layout_base> & operator=(const vector_gpu_ker<T,layout_base> & v) { v_size = v.v_size; base = v.base; diff --git a/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh b/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh index 28d3963c..290e2632 100644 --- a/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh +++ b/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh @@ -10,14 +10,15 @@ #ifdef __NVCC__ +#include "config.h" + #if CUDART_VERSION < 11000 #include "util/cuda/cub_old/util_type.cuh" #include "util/cuda/cub_old/block/block_scan.cuh" #include "util/cuda/moderngpu/operators.hxx" +#include "util/cuda/cuda_launch.hpp" #else - #if !defined(CUDA_ON_CPU) - #include "cub/util_type.cuh" - #include "cub/block/block_scan.cuh" + #if !defined(CUDA_ON_CPU) #include "util/cuda/moderngpu/operators.hxx" #endif #endif diff --git a/src/Vector/map_vector.hpp b/src/Vector/map_vector.hpp index 1118c76e..e0243c53 100644 --- a/src/Vector/map_vector.hpp +++ b/src/Vector/map_vector.hpp @@ -1448,7 +1448,6 @@ namespace openfpm #endif } - return *this; } @@ -1680,7 +1679,7 @@ namespace openfpm * * */ - ite_gpu<1> getGPUIteratorTo(long int stop, size_t n_thr = 1024) const + ite_gpu<1> getGPUIteratorTo(long int stop, size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<1,long int> start(0); grid_key_dx<1,long int> stop_(stop); @@ -1708,7 +1707,7 @@ namespace openfpm * * */ - ite_gpu<1> getGPUIterator(size_t n_thr = 1024) const + ite_gpu<1> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const { grid_key_dx<1> start(0); grid_key_dx<1> stop(size()-1); diff --git a/src/Vector/map_vector_std.hpp b/src/Vector/map_vector_std.hpp index 2d4a875f..98431720 100644 --- a/src/Vector/map_vector_std.hpp +++ b/src/Vector/map_vector_std.hpp @@ -676,7 +676,21 @@ public: * \return itself * */ - vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & operator=(const vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & v) + __device__ vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & operator=(const vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & v) + { + printf("Error openfpm::vector operator= cannot be used on device"); + + return *this; + } + + /*! 
\brief Operator= copy the vector into another + * + * \param v vector to copy + * + * \return itself + * + */ + __host__ vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & operator=(const vector<T,HeapMemory,memory_traits_lin,grow_policy_double,STD_VECTOR> & v) { base = v.base; diff --git a/src/config/config_cmake.h.in b/src/config/config_cmake.h.in index 01c19697..2c956fc2 100644 --- a/src/config/config_cmake.h.in +++ b/src/config/config_cmake.h.in @@ -4,6 +4,12 @@ ${DEFINE_COVERTY_SCAN} /* GPU support */ ${DEFINE_CUDA_GPU} +/* HIP GPU support */ +${DEFINE_HIP_GPU} + +/* HIP Cudify GPU support */ +${DEFINE_CUDIFY_USE_HIP} + /* Debug */ ${DEFINE_DEBUG} /**/ diff --git a/src/timer.hpp b/src/timer.hpp index 787223a8..f87a0e1c 100644 --- a/src/timer.hpp +++ b/src/timer.hpp @@ -42,14 +42,22 @@ class timer clock_t cstop; #if defined(__NVCC__) && !defined(CUDA_ON_CPU) - cudaEvent_t start_g, stop_g; + #ifdef __HIP__ + hipEvent_t start_g, stop_g; + #else + cudaEvent_t start_g, stop_g; + #endif #endif // Fill the stop point void check() { -#if defined(SYNC_BEFORE_TAKE_TIME) && defined(__NVCC__) +#if defined(SYNC_BEFORE_TAKE_TIME) && defined(__NVCC__) + #ifdef __HIP__ + hipDeviceSynchronize(); + #else cudaDeviceSynchronize(); + #endif #endif #ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time @@ -153,21 +161,37 @@ public: void startGPU() { + #ifdef __HIP__ + hipEventCreate(&start_g); + hipEventRecord(start_g,0); + #else cudaEventCreate(&start_g); cudaEventRecord(start_g,0); + #endif } void stopGPU() { - cudaEventCreate(&stop_g); - cudaEventRecord(stop_g,0); - cudaEventSynchronize(stop_g); + #ifdef __HIP__ + hipEventCreate(&stop_g); + hipEventRecord(stop_g,0); + hipEventSynchronize(stop_g); + #else + cudaEventCreate(&stop_g); + cudaEventRecord(stop_g,0); + cudaEventSynchronize(stop_g); + #endif } double getwctGPU() { float elapsedTime; + + #ifdef __HIP__ + hipEventElapsedTime(&elapsedTime, start_g,stop_g); + #else cudaEventElapsedTime(&elapsedTime, start_g,stop_g); + #endif return elapsedTime; } diff --git a/src/util/cuda/modern_gpu_tests.cu b/src/util/cuda/modern_gpu_tests.cu index 167a7103..ddd8d8b4 100644 --- a/src/util/cuda/modern_gpu_tests.cu +++ b/src/util/cuda/modern_gpu_tests.cu @@ -15,7 +15,7 @@ BOOST_AUTO_TEST_SUITE( modern_gpu_tests ) -BOOST_AUTO_TEST_CASE( modern_gpu_transform_lbs ) +BOOST_AUTO_TEST_CASE( modern_gpu_loadbalance_lbs ) { std::cout << "Test modern gpu test tansform_lbs" << "\n"; diff --git a/src/util/cuda/moderngpu/transform.hxx b/src/util/cuda/moderngpu/transform.hxx index 617394d0..10905dcd 100644 --- a/src/util/cuda/moderngpu/transform.hxx +++ b/src/util/cuda/moderngpu/transform.hxx @@ -3,7 +3,9 @@ #include <random> #include <algorithm> +#ifndef __HIP__ #include <cuda.h> +#endif #include "launch_box.hxx" BEGIN_MGPU_NAMESPACE diff --git a/src/util/cuda/ofp_context.hxx b/src/util/cuda/ofp_context.hxx index 47d5327e..cd1607e8 100644 --- a/src/util/cuda/ofp_context.hxx +++ b/src/util/cuda/ofp_context.hxx @@ -151,7 +151,7 @@ namespace mgpu void init(int dev_num, gpu_context_opt opt) { cudaFuncAttributes attr; - cudaError_t result = cudaFuncGetAttributes(&attr, dummy_k<0>); + cudaError_t result = cudaFuncGetAttributes(&attr, (void *)dummy_k<0>); if(cudaSuccess != result) throw cuda_exception_t(result); _ptx_version = attr.ptxVersion; diff --git a/src/util/cuda/scan_ofp.cuh b/src/util/cuda/scan_ofp.cuh index 7e5c7eb6..96cdc884 100644 --- a/src/util/cuda/scan_ofp.cuh +++ b/src/util/cuda/scan_ofp.cuh @@ -15,7 +15,11 @@ #if CUDART_VERSION >= 
11000 #ifndef CUDA_ON_CPU // Here we have for sure CUDA >= 11 - #include "cub/cub.cuh" + #ifdef __HIP__ + #include "hipcub/hipcub.hpp" + #else + #include "cub/cub.cuh" + #endif #ifndef SCAN_WITH_CUB #define SCAN_WITH_CUB #endif @@ -48,19 +52,39 @@ namespace openfpm #else #ifdef SCAN_WITH_CUB - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input, - output, - count); + #ifdef __HIP__ + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input, + output, + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + // Run + hipcub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, + output, + count); + + #else + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input, + output, + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); + // Run + cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, + output, + count); - // Run - cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, - output, - count); + #endif #else mgpu::scan(input,count,output,context); diff --git a/src/util/cuda/sort_ofp.cuh b/src/util/cuda/sort_ofp.cuh index 32002952..ae2fba9f 100644 --- a/src/util/cuda/sort_ofp.cuh +++ b/src/util/cuda/sort_ofp.cuh @@ -16,7 +16,11 @@ #if CUDART_VERSION >= 11000 #ifndef CUDA_ON_CPU // Here we have for sure CUDA >= 11 - #include "cub/cub.cuh" + #ifdef __HIP__ + #include "hipcub/hipcub.hpp" + #else + #include "cub/cub.cuh" + #endif #ifndef SORT_WITH_CUB #define SORT_WITH_CUB #endif @@ -248,66 +252,134 @@ namespace openfpm #ifdef SORT_WITH_CUB - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - - auto & temporal2 = context.getTemporalCUB2(); - temporal2.resize(sizeof(key_t)*count); - - auto & temporal3 = context.getTemporalCUB3(); - temporal3.resize(sizeof(val_t)*count); - - if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true) - { - cub::DeviceRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); - - d_temp_storage = temporal.template getDeviceBuffer<0>(); - - // Run - cub::DeviceRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - } - else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true) - { - cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); - - d_temp_storage = temporal.template getDeviceBuffer<0>(); - - // Run - cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t 
*)temporal3.template getDeviceBuffer<0>(), - count); - } - - cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); - cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); + #ifdef __HIP__ + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + auto & temporal2 = context.getTemporalCUB2(); + temporal2.resize(sizeof(key_t)*count); + + auto & temporal3 = context.getTemporalCUB3(); + temporal3.resize(sizeof(val_t)*count); + + if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true) + { + hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true) + { + hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + + cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); + cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); + + + #else + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + auto & temporal2 = context.getTemporalCUB2(); + temporal2.resize(sizeof(key_t)*count); + + auto & temporal3 = context.getTemporalCUB3(); + temporal3.resize(sizeof(val_t)*count); + + if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true) + { + cub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + cub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true) + { + cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + 
cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + + cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); + cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); + + #endif #else mgpu::mergesort(keys_input,vals_input,count,comp,context); diff --git a/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp b/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp index 744efeac..442aa1f3 100644 --- a/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp +++ b/src/util/multi_array_openfpm/multi_array_ref_subarray_openfpm.hpp @@ -119,7 +119,7 @@ public: inline __host__ __device__ size_type size() const { return boost::mpl::at<vector,boost::mpl::int_<0>>::type::value; } size_type max_size() const { return num_elements(); } bool empty() const { return size() == 0; } - size_type num_dimensions() const { return NumDims; } + inline __device__ __host__ size_type num_dimensions() const { return NumDims; } inline __host__ __device__ const index* strides() const { return strides_; } size_type num_elements() const -- GitLab
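
Usage sketch (not part of the patch): the HIP path added above is driven by two cache options declared in the top-level CMakeLists.txt, HIP_ENABLE (off by default) and AMD_ARCH_COMPILE (the AMD GPU architecture used to compile kernels, defaulting to gfx900). A minimal out-of-tree build, assuming a ROCm installation under /opt/rocm so that find_package(HIP) succeeds; the build directory name and the gfx906 value are illustrative only:

    mkdir build && cd build
    # HIP_ENABLE selects the hip_add_executable branch in src/CMakeLists.txt;
    # AMD_ARCH_COMPILE is the architecture string the kernels are built for.
    cmake .. -DHIP_ENABLE=ON -DAMD_ARCH_COMPILE=gfx906
    make mem_map

The new "hip" option in the configure script forwards the same -DHIP_ENABLE=ON flag, so configuring through ./configure reaches the identical code path.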