Commit a99d2daf authored by Ubuntu's avatar Ubuntu

Fixing HIP compilation

parent 008cfeff
Pipeline #1958 failed with stages
in 1 minute and 29 seconds
......@@ -50,10 +50,9 @@ endif()
set(ENV{PATH} "$ENV{PATH}:${HDF5_ROOT}/bin")
set(HDF5_PREFER_PARALLEL TRUE)
find_package(MPI)
if(ENABLE_GPU)
set(CMAKE_CUDA_HOST_COMPILER mpic++)
enable_language(CUDA)
find_package(CUDA)
if (CMAKE_EXPORT_COMPILE_COMMANDS==OFF)
......@@ -82,7 +81,6 @@ if ( HIP_ENABLE )
endif()
find_package(Boost 1.68.0 COMPONENTS unit_test_framework iostreams program_options system filesystem)
find_package(MPI)
find_package(PETSc)
find_package(HDF5)
find_package(Eigen3)
......
......@@ -236,11 +236,7 @@ do
conf_options="$conf_options -DTEST_PERFORMANCE=ON"
;;
gpu)
if [ x"$CXX" == x"" ]; then
conf_options="$conf_options"
else
conf_options="$conf_options -DCMAKE_CUDA_HOST_COMPILER=$(which $CXX)"
fi
conf_options="$conf_options -DCMAKE_CUDA_HOST_COMPILER=$(which $CXX)"
conf_options="$conf_options -DENABLE_GPU=ON"
;;
*) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig"
......
......@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
########################### Executables
if(CUDA_FOUND)
if(CUDA_FOUND OR HIP_ENABLE)
set(CUDA_SOURCES ../openfpm_devices/src/memory/CudaMemory.cu)
endif()
......@@ -24,6 +24,11 @@ if ( HIP_ENABLE AND HIP_FOUND )
set_property(TARGET dom_box PROPERTY COMPILE_FLAGS "-I${MPI_C_INCLUDE_DIRS}")
set_property(TARGET vector_dist PROPERTY COMPILE_FLAGS "-I${MPI_C_INCLUDE_DIRS}")
target_compile_definitions(cart_dec PUBLIC BOOST_GPU_ENABLED="__device__ __host__")
target_compile_definitions(metis_dec PUBLIC BOOST_GPU_ENABLED="__device__ __host__")
target_compile_definitions(dom_box PUBLIC BOOST_GPU_ENABLED="__device__ __host__")
target_compile_definitions(vector_dist PUBLIC BOOST_GPU_ENABLED="__device__ __host__")
else()
add_executable(cart_dec CartDecomposition_gen_vtk.cpp ../src/lib/pdata.cpp ${CUDA_SOURCES} ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp)
......
openfpm_data @ 2def97fe
Subproject commit d6edb0cb37a75fd33a408d852005da7df3da37ea
Subproject commit 2def97fed01737234c39ef26d8bce83c885f67b0
openfpm_devices @ f73fad9e
Subproject commit 02ff5ac2a84c1c4e05f4181abf35448680ac948a
Subproject commit f73fad9e2424544ade40ccd6b4028b1c3078a7a2
openfpm_io @ 1b4a37f2
Subproject commit f8fe0b5783e4948e521735ee005a27130210edee
Subproject commit 1b4a37f276b6d9708532c1fb6eda48ad558f1587
openfpm_numerics @ 484318ff
Subproject commit 4aee04a90a8d53bb72793652e9f162dac1b8c8fa
Subproject commit 484318ff8c0e4b10fbcd10a18da2ca6e5df849f8
openfpm_vcluster @ 5cb26a39
Subproject commit 16226981e72b9355585b1ba674f218dee7620711
Subproject commit 5cb26a394cad1624f5f79fa2da219e28cb7ef6d0
......@@ -9,13 +9,27 @@ from pathlib import Path
# python script/py/code-transform.py --json "build/compile_commands.json" --compile_command "/usr/local/cuda/bin/nvcc -ccbin=/home/i-bird/MPI/bin/mpic++" --build_path "/home/i-bird/Desktop/MOSAIC/OpenFPM_project/openfpm_bianucci_flush_hip_test/openfpm_pdata/build" --add_includes " -I/home/i-bird/MPI/include -I/home/i-bird/Desktop/MOSAIC/OpenFPM_project/openfpm_bianucci_flush_hip_test/openfpm_pdata/openfpm_numerics/src/ -I/home/ibird/PETSC/include" --directory "/home/i-bird/Desktop/MOSAIC/OpenFPM_project/openfpm_bianucci_flush_hip_test/openfpm_pdata" --exclude_directories="/home/i-bird/Desktop/MOSAIC/OpenFPM_project/openfpm_bianucci_flush_hip_test/openfpm_pdata/openfpm_data/src/util/cuda/cub"
# python script/py/code-transform.py --json "build/compile_commands.json" --compile_command "/usr/local/cuda/bin/nvcc -ccbin=/home/ibird/MPI/bin/mpic++" --build_path "/home/ibird/openfpm_pdata/build" --add_includes " -DSE_CLASS2 -I/home/i-bird/MPI/include -I/home/ibird/openfpm_pdata/openfpm_numerics/src/ -I/home/ibird/PETSC/include" --directory "/home/ibird/openfpm_pdata" --exclude_directories="/home/ibird/openfpm_pdata/openfpm_data/src/util/cuda/cub"
# CREATE COMPILE_COMMANDS
# source openfpm_vars
#cmake ../. -DMPI_VENDOR=openmpi -DPARMETIS_ROOT=/home/ibird/PARMETIS -DMETIS_ROOT=/home/ibird/METIS -DBOOST_ROOT=/home/ibird/BOOST -DHDF5_ROOT=/home/ibird/HDF5/ -DLIBHILBERT_ROOT=/home/ibird/LIBHILBERT -DPETSC_ROOT=/home/ibird/PETSC -DSUITESPARSE_ROOT=/home/ibird/SUITESPARSE -DEIGEN3_ROOT=/home/ibird/EIGEN -DOPENBLAS_ROOT=/home/ibird/OPENBLAS/ -DCMAKE_CUDA_HOST_COMPILER=/home/ibird/MPI/bin/mpic++ -DENABLE_GPU=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
# Transform
# To make it work /home/ibird/BOOST/include/boost/config/detail/select_compiler_config.hpp remove from __clang__ !defined(__CUDA__)
# cmake ../. -DCMAKE_INSTALL_PREFIX=/usr/local/openfpm_sparse_cl -DBOOST_ROOT=/home/ibird/BOOST -DHDF5_ROOT=/home/ibird/HDF5/ -DLIBHILBERT_ROOT=/home/ibird/LIBHILBERT -DMPI_VENDOR=openmpi -DPARMETIS_ROOT=/home/ibird/PARMETIS -DMETIS_ROOT=/home/ibird/METIS -DPETSC_ROOT=/home/ibird/PETSC -DSUITESPARSE_ROOT=/home/ibird/SUITESPARSE -DEIGEN3_ROOT=/home/ibird/EIGEN -DOPENBLAS_ROOT=/home/ibird/OPENBLAS/ -DCMAKE_BUILD_TYPE=Release -DHIP_ENABLE=ON -DAMD_ARCH_COMPILE=gfx900 -DHIPCUB_ROOT=/home/i-bird/Desktop/MOSAIC/OpenFPM_project/HIP/HIPCUB_install/ -DROCMPRIM_ROOT=/opt/rocm-3.1.0/rocprim/ -DENABLE_GPU=OFF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
# python3 script/py/code-transform.py --json "build/compile_commands.json" --compile_command "/usr/local/cuda/bin/nvcc -ccbin=/home/ibird/MPI/bin/mpic++" --build_path "/home/ibird/openfpm_pdata/build" --add_includes " -DSE_CLASS2 -I/home/i-bird/MPI/include -I/home/ibird/openfpm_pdata/openfpm_numerics/src/ -I/home/ibird/PETSC/include" --directory "/home/ibird/openfpm_pdata" --exclude_directories="/home/ibird/openfpm_pdata/openfpm_data/src/util/cuda/cub"
#COMPILE with HIP
# rm files in build
# cmake ../. -DCMAKE_INSTALL_PREFIX=/usr/local/openfpm_sparse_cl -DBOOST_ROOT=/home/ibird/BOOST -DHDF5_ROOT=/home/ibird/HDF5/ -DLIBHILBERT_ROOT=/home/ibird/LIBHILBERT -DMPI_VENDOR=openmpi -DPARMETIS_ROOT=/home/ibird/PARMETIS -DMETIS_ROOT=/home/ibird/METIS -DPETSC_ROOT=/home/ibird/PETSC -DSUITESPARSE_ROOT=/home/ibird/SUITESPARSE -DEIGEN3_ROOT=/home/ibird/EIGEN -DOPENBLAS_ROOT=/home/ibird/OPENBLAS/ -DCMAKE_BUILD_TYPE=Release -DHIP_ENABLE=ON -DAMD_ARCH_COMPILE=gfx900 -DHIPCUB_ROOT=/home/i-bird/Desktop/MOSAIC/OpenFPM_project/HIP/HIPCUB_install/ -DROCMPRIM_ROOT=/opt/rocm-3.1.0/rocprim/ -DENABLE_GPU=OFF
# cmake ../. -DBOOST_ROOT=/home/ibird/BOOST -DHDF5_ROOT=/home/ibird/HDF5/ -DLIBHILBERT_ROOT=/home/ibird/LIBHILBERT -DMPI_VENDOR=openmpi -DPARMETIS_ROOT=/home/ibird/PARMETIS -DMETIS_ROOT=/home/ibird/METIS -DPETSC_ROOT=/home/ibird/PETSC -DSUITESPARSE_ROOT=/home/ibird/SUITESPARSE -DEIGEN3_ROOT=/home/ibird/EIGEN -DOPENBLAS_ROOT=/home/ibird/OPENBLAS/ -DCMAKE_BUILD_TYPE=Release -DHIP_ENABLE=ON -DAMD_ARCH_COMPILE=gfx900 -DHIPCUB_ROOT=/opt/rocm/hipcub -DROCMPRIM_ROOT=/opt/rocm/rocprim/ -DENABLE_GPU=OFF
# SE_CLASS2 must be used to convert CudaMemory.cu
cudaExtension = ".cu"
cudaFileToParse=[]
......
......@@ -11,6 +11,8 @@
* Created on: Oct 5, 2017
* Author: i-bird
*/
#include <hip/hip_runtime.h>
#include "config.h"
#define BOOST_TEST_DYN_LINK
......
......@@ -53,6 +53,8 @@ if ( HIP_ENABLE AND HIP_FOUND )
hip_add_executable(pdata ${OPENFPM_INIT_FILE} ${CUDA_SOURCES} ${CPP_SOURCES})
hip_add_library(ofpm_pdata STATIC lib/pdata.cpp)
target_compile_definitions(pdata PUBLIC BOOST_GPU_ENABLED="__device__ __host__")
target_include_directories (pdata PUBLIC ${HIP_ROOT_DIR}/include)
#target_include_directories (pdata PUBLIC ${MPI_C_INCLUDE_DIRS}) # Well ........ it does not work, work around line below
set_property(TARGET pdata PROPERTY COMPILE_FLAGS "-I${MPI_C_INCLUDE_DIRS}")
......
#define BOOST_TEST_DYN_LINK
#include <hip/hip_runtime.h>
#include "config.h"
#include <boost/test/unit_test.hpp>
#include "Space/Shape/Box.hpp"
......
#define BOOST_TEST_DYN_LINK
#include <hip/hip_runtime.h>
#include "config.h"
#include <boost/test/unit_test.hpp>
......@@ -77,7 +79,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
CudaMemory mem;
mem.allocate(2*sizeof(unsigned int));
test_proc_idbc<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem.getDevicePointer());
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_proc_idbc<decltype(gpudec)>), dim3(1), dim3(1), 0, 0, p1,p2,gpudec,(unsigned int *)mem.getDevicePointer());
mem.deviceToHost();
......@@ -86,7 +88,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
CudaMemory mem2;
mem2.allocate(2*sizeof(unsigned int));
test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_ghost_n<decltype(gpudec)>), dim3(1), dim3(1), 0, 0, p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());
mem2.deviceToHost();
......@@ -94,7 +96,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
openfpm::vector_gpu<aggregate<int,int>> vd;
vd.resize(tot);
test_ghost<decltype(gpudec),decltype(vd.toKernel())><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer(),vd.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_ghost<decltype(gpudec),decltype(vd.toKernel())>), dim3(1), dim3(1), 0, 0, p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer(),vd.toKernel());
if (((unsigned int *)mem.getPointer())[0] != ((unsigned int *)mem.getPointer())[1])
{
......@@ -120,7 +122,7 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
p2.get(j) = std::nextafter(SpaceBox<3,double>(dec.getSubDomains().get(i)).getHigh(j),1.0);
test_proc_idbc<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem.getDevicePointer());
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_proc_idbc<decltype(gpudec)>), dim3(1), dim3(1), 0, 0, p1,p2,gpudec,(unsigned int *)mem.getDevicePointer());
mem.deviceToHost();
......@@ -128,14 +130,14 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
BOOST_REQUIRE(((unsigned int *)mem.getPointer())[1] < vcl.size());
mem2.allocate(2*sizeof(unsigned int));
test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_ghost_n<decltype(gpudec)>), dim3(1), dim3(1), 0, 0, p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());
mem2.deviceToHost();
tot = ((unsigned int *)mem2.getPointer())[0] + ((unsigned int *)mem2.getPointer())[1];
vd.resize(tot);
test_ghost<decltype(gpudec),decltype(vd.toKernel())><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer(),vd.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_ghost<decltype(gpudec),decltype(vd.toKernel())>), dim3(1), dim3(1), 0, 0, p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer(),vd.toKernel());
if (((unsigned int *)mem.getPointer())[0] != ((unsigned int *)mem.getPointer())[1])
{
......
......@@ -326,7 +326,7 @@ public:
// fill properties
boost::mpl::for_each<boost::mpl::range_c<int, 0, sizeof...(pos)> >(flp);
boost::mpl::for_each_ref<boost::mpl::range_c<int, 0, sizeof...(pos)> >(flp);
// set map global to local in the graph, needed because vertex is already created without addVertex method
......@@ -478,7 +478,7 @@ public:
// fill properties
boost::mpl::for_each<boost::mpl::range_c<int, 0, sizeof...(pos)> >(flp);
boost::mpl::for_each_ref<boost::mpl::range_c<int, 0, sizeof...(pos)> >(flp);
// set map global to local in the graph, needed because vertex is already created without addVertex method
......
#define BOOST_TEST_DYN_LINK
#include <hip/hip_runtime.h>
#include "config.h"
#include <boost/test/unit_test.hpp>
......
#define BOOST_TEST_DYN_LINK
#include <hip/hip_runtime.h>
#include "config.h"
#define TEST1
......@@ -100,9 +102,7 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles )
v_prp.hostToDevice<0,1,2>();
// label particle processor
num_shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>
<<<ite.wthr,ite.thr>>>
(box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),v_pos.size());
hipLaunchKernelGGL(HIP_KERNEL_NAME(num_shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>), dim3(ite.wthr), dim3(ite.thr), 0, 0, box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),v_pos.size());
o_part_loc.deviceToHost<0>();
......@@ -161,12 +161,10 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles )
openfpm::vector_gpu<aggregate<unsigned int,unsigned int>> o_part_loc2;
o_part_loc2.resize(tot);
shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
hipLaunchKernelGGL(HIP_KERNEL_NAME(shift_ghost_each_part<3,float,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),
decltype(starts.toKernel()),decltype(shifts.toKernel()),
decltype(o_part_loc2.toKernel())>
<<<ite.wthr,ite.thr>>>
(box_f_dev.toKernel(),box_f_sv.toKernel(),
decltype(o_part_loc2.toKernel())>), dim3(ite.wthr), dim3(ite.thr), 0, 0, box_f_dev.toKernel(),box_f_sv.toKernel(),
v_pos.toKernel(),v_prp.toKernel(),
starts.toKernel(),shifts.toKernel(),o_part_loc2.toKernel(),old,old);
......@@ -351,9 +349,7 @@ BOOST_AUTO_TEST_CASE( vector_ghost_process_local_particles )
ite = o_part_loc2.getGPUIterator();
process_ghost_particles_local<true,3,decltype(o_part_loc2.toKernel()),decltype(v_pos2.toKernel()),decltype(v_prp2.toKernel()),decltype(shifts.toKernel())>
<<<ite.wthr,ite.thr>>>
(o_part_loc2.toKernel(),v_pos2.toKernel(),v_prp2.toKernel(),shifts.toKernel(),old);
hipLaunchKernelGGL(HIP_KERNEL_NAME(process_ghost_particles_local<true,3,decltype(o_part_loc2.toKernel()),decltype(v_pos2.toKernel()),decltype(v_prp2.toKernel()),decltype(shifts.toKernel())>), dim3(ite.wthr), dim3(ite.thr), 0, 0, o_part_loc2.toKernel(),v_pos2.toKernel(),v_prp2.toKernel(),shifts.toKernel(),old);
v_pos2.template deviceToHost<0>();
v_prp2.template deviceToHost<0,1,2>();
......@@ -457,9 +453,7 @@ BOOST_AUTO_TEST_CASE( vector_ghost_fill_send_buffer_test )
auto ite = g_send_prp.get(i).getGPUIterator();
process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),0,1,2>
<<<ite.wthr,ite.thr>>>
(g_opart_device.toKernel(), g_send_prp.get(i).toKernel(),
hipLaunchKernelGGL(HIP_KERNEL_NAME(process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),0,1,2>), dim3(ite.wthr), dim3(ite.thr), 0, 0, g_opart_device.toKernel(), g_send_prp.get(i).toKernel(),
v_prp.toKernel(),offset);
offset += g_send_prp.get(i).size();
......@@ -573,9 +567,7 @@ BOOST_AUTO_TEST_CASE( decomposition_ie_ghost_gpu_test_use )
proc_id_out.template get<0>(proc_id_out.size()-1) = 0;
proc_id_out.template hostToDevice(proc_id_out.size()-1,proc_id_out.size()-1);
num_proc_ghost_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel())>
<<<ite.wthr,ite.thr>>>
(dec.toKernel(),vg.toKernel(),proc_id_out.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(num_proc_ghost_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel())>), dim3(ite.wthr), dim3(ite.thr), 0, 0, dec.toKernel(),vg.toKernel(),proc_id_out.toKernel());
proc_id_out.deviceToHost<0>();
......@@ -623,9 +615,7 @@ BOOST_AUTO_TEST_CASE( decomposition_ie_ghost_gpu_test_use )
ite = vg.getGPUIterator();
// we compute processor id for each particle
proc_label_id_ghost<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(starts.toKernel()),decltype(output.toKernel())>
<<<ite.wthr,ite.thr>>>
(dec.toKernel(),vg.toKernel(),starts.toKernel(),output.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(proc_label_id_ghost<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(starts.toKernel()),decltype(output.toKernel())>), dim3(ite.wthr), dim3(ite.thr), 0, 0, dec.toKernel(),vg.toKernel(),starts.toKernel(),output.toKernel());
output.template deviceToHost<0,1>();
......@@ -741,9 +731,7 @@ BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use )
dev_counter.fill<1>(0);
dev_counter.fill<2>(0);
process_id_proc_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel()),decltype(dev_counter.toKernel())>
<<<ite.wthr,ite.thr>>>
(dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),dev_counter.toKernel(),v_cl.rank());
hipLaunchKernelGGL(HIP_KERNEL_NAME(process_id_proc_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel()),decltype(dev_counter.toKernel())>), dim3(ite.wthr), dim3(ite.thr), 0, 0, dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),dev_counter.toKernel(),v_cl.rank());
proc_id_out.deviceToHost<0>();
......@@ -780,7 +768,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_find_buffer_offsets_test )
auto ite = vgp.getGPUIterator();
vgp.hostToDevice<0,1>();
CUDA_LAUNCH((find_buffer_offsets<1,decltype(vgp.toKernel()),decltype(offs.toKernel())>),ite,vgp.toKernel(),(int *)mem.getDevicePointer(),offs.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME((find_buffer_offsets<1,decltype(vgp.toKernel()),decltype(offs.toKernel())>)), dim3(), dim3(), 0, 0, vgp.toKernel(),(int *)mem.getDevicePointer(),offs.toKernel());
offs.template deviceToHost<0,1>();
......@@ -831,7 +819,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_reorder_lbl)
auto ite = lbl_p.getGPUIterator();
reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())><<<ite.wthr,ite.thr>>>(lbl_p.toKernel(),starts.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())>), dim3(ite.wthr), dim3(ite.thr), 0, 0, lbl_p.toKernel(),starts.toKernel());
starts.template deviceToHost<0>();
lbl_p.template deviceToHost<0,1,2>();
......@@ -906,7 +894,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
auto ite = v_pos.getGPUIterator();
merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel()),0><<<ite.wthr,ite.thr>>>(v_pos.toKernel(),v_prp.toKernel(),
hipLaunchKernelGGL(HIP_KERNEL_NAME(merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel()),0>), dim3(ite.wthr), dim3(ite.thr), 0, 0, v_pos.toKernel(),v_prp.toKernel(),
v_pos_out.toKernel(),v_prp_out.toKernel(),
ns_to_s.toKernel());
......@@ -930,7 +918,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
BOOST_REQUIRE_EQUAL(match,true);
merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel()),1,2><<<ite.wthr,ite.thr>>>(v_pos.toKernel(),v_prp.toKernel(),
hipLaunchKernelGGL(HIP_KERNEL_NAME(merge_sort_part<false,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel()),1,2>), dim3(ite.wthr), dim3(ite.thr), 0, 0, v_pos.toKernel(),v_prp.toKernel(),
v_pos_out.toKernel(),v_prp_out.toKernel(),
ns_to_s.toKernel());
......@@ -958,7 +946,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_merge_sort)
BOOST_REQUIRE_EQUAL(match,true);
merge_sort_part<true,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel())><<<ite.wthr,ite.thr>>>(v_pos.toKernel(),v_prp.toKernel(),
hipLaunchKernelGGL(HIP_KERNEL_NAME(merge_sort_part<true,decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(ns_to_s.toKernel())>), dim3(ite.wthr), dim3(ite.thr), 0, 0, v_pos.toKernel(),v_prp.toKernel(),
v_pos_out.toKernel(),v_prp_out.toKernel(),
ns_to_s.toKernel());
......@@ -1048,10 +1036,8 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test)
{
auto ite = m_pos.get(i).getGPUIterator();
process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
<<<ite.wthr,ite.thr>>>
(m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
hipLaunchKernelGGL(HIP_KERNEL_NAME(process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>), dim3(ite.wthr), dim3(ite.thr), 0, 0, m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
v_pos.toKernel(),v_prp.toKernel(),offset);
m_pos.get(i).deviceToHost<0>();
......
#include <hip/hip_runtime.h>
#include "config.h"
#define BOOST_TEST_DYN_LINK
......@@ -561,7 +563,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_multiphase_kernel_test )
openfpm::vector_gpu<aggregate<float>> output;
output.resize(100 * phases.size());
vdmkt<<<1,1>>>(phases.toKernel(),output.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(vdmkt), dim3(1), dim3(1), 0, 0, phases.toKernel(),output.toKernel());
output.template deviceToHost<0>();
......@@ -653,7 +655,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_multiphase_kernel_test_simplified )
openfpm::vector_gpu<aggregate<float>> output;
output.resize(100 * phases.size());
vdmkt_simple<<<1,1>>>(phases.toKernel(),output.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(vdmkt_simple), dim3(1), dim3(1), 0, 0, phases.toKernel(),output.toKernel());
output.template deviceToHost<0>();
......@@ -762,7 +764,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_multiphase_kernel_cl_test )
output.resize(tot);
output2.resize(tot_g);
vdmkt_simple_cl<<<1,1>>>(phases.toKernel(),output.toKernel(),cl_ph.toKernel(),output2.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(vdmkt_simple_cl), dim3(1), dim3(1), 0, 0, phases.toKernel(),output.toKernel(),cl_ph.toKernel(),output2.toKernel());
output.template deviceToHost<0>();
......
#define BOOST_TEST_DYN_LINK
#include <hip/hip_runtime.h>
#include "config.h"
#include <boost/test/unit_test.hpp>
#include "VCluster/VCluster.hpp"
......@@ -273,7 +275,7 @@ void check_cell_list_cpu_and_gpu(vector_type & vd, CellList_type & NN, CellList_
{
auto it5 = vd.getDomainIteratorGPU(32);
calculate_force<typename vector_type::stype,decltype(NN.toKernel())><<<it5.wthr,it5.thr>>>(vd.toKernel(),vd.toKernel_sorted(),NN.toKernel(),create_vcluster().rank());
hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_force<typename vector_type::stype,decltype(NN.toKernel())>), dim3(it5.wthr), dim3(it5.thr), 0, 0, vd.toKernel(),vd.toKernel_sorted(),NN.toKernel(),create_vcluster().rank());
vd.template deviceToHostProp<1,2>();
......@@ -299,7 +301,7 @@ void check_cell_list_cpu_and_gpu(vector_type & vd, CellList_type & NN, CellList_
// We do exactly the same test as before, but now we completely use the sorted version
calculate_force_full_sort<typename vector_type::stype,decltype(NN.toKernel())><<<it5.wthr,it5.thr>>>(vd.toKernel_sorted(),NN.toKernel(),create_vcluster().rank());
hipLaunchKernelGGL(HIP_KERNEL_NAME(calculate_force_full_sort<typename vector_type::stype,decltype(NN.toKernel())>), dim3(it5.wthr), dim3(it5.thr), 0, 0, vd.toKernel_sorted(),NN.toKernel(),create_vcluster().rank());
vd.template merge_sort<1>(NN);
vd.template deviceToHostProp<1>();
......@@ -385,7 +387,7 @@ void vector_dist_gpu_test_impl()
// offload to device
vd.hostToDevicePos();
initialize_props<<<it3.wthr,it3.thr>>>(vd.toKernel());
hipLaunchKernelGGL(initialize_props, dim3(it3.wthr), dim3(it3.thr), 0, 0, vd.toKernel());
// now we check what we initialized
......@@ -471,7 +473,7 @@ void vector_dist_gpu_make_sort_test_impl()
auto it3 = vd.getDomainIteratorGPU();
initialize_props<<<it3.wthr,it3.thr>>>(vd.toKernel());
hipLaunchKernelGGL(initialize_props, dim3(it3.wthr), dim3(it3.thr), 0, 0, vd.toKernel());
// Here we check make sort does not mess-up particles we use a Cell-List to check that
// the two cell-list constructed are identical
......@@ -697,7 +699,7 @@ void vdist_calc_gpu_test()
{
vd.map(RUN_ON_DEVICE);
CUDA_SAFE(cudaGetLastError());
CUDA_SAFE(hipGetLastError());
vd.deviceToHostPos();
vd.template deviceToHostProp<0,1,2>();
......@@ -843,7 +845,7 @@ void vdist_calc_gpu_test()
// move particles on gpu
auto ite = vd.getDomainIteratorGPU();
move_parts_gpu_test<3,decltype(vd.toKernel())><<<ite.wthr,ite.thr>>>(vd.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(move_parts_gpu_test<3,decltype(vd.toKernel())>), dim3(ite.wthr), dim3(ite.thr), 0, 0, vd.toKernel());
}
}
......@@ -1726,7 +1728,7 @@ BOOST_AUTO_TEST_CASE(vector_dist_overflow_se_class1)
ite.thr.y = 1;
ite.thr.z = 1;
CUDA_LAUNCH(launch_overflow,ite,vdg.toKernel(),vdg2.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(launch_overflow), dim3(), dim3(), 0, 0, vdg.toKernel(),vdg2.toKernel());
std::cout << "****** TEST ERROR MESSAGE END ********" << std::endl;
}
......
......@@ -8,6 +8,7 @@
#ifndef SRC_VECTOR_VECTOR_DIST_PERFORMANCE_UTIL_HPP_
#define SRC_VECTOR_VECTOR_DIST_PERFORMANCE_UTIL_HPP_
#include "util/cuda_util.hpp"
#include <boost/property_tree/ptree.hpp>
#include "Plot/GoogleChart.hpp"
#include "vector_dist_performance_common.hpp"
......
#define PRINT_RANK_TO_GPU
#include <hip/hip_runtime.h>
#include "initialize_wrapper.hpp"
#include "VCluster/VCluster.hpp"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment