Commit 24987fd6 authored by incardon's avatar incardon

Hip compilable version

parent b507c258
......@@ -5,8 +5,6 @@
* Author: i-bird
*/
#include <hip/hip_runtime.h>
#include "config.h"
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
......@@ -633,9 +631,9 @@ BOOST_AUTO_TEST_CASE (gpu_grid_test_se_class1)
int dev_mem[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
test_se1_crash_gt2<<<{32,1,1},{16,1,1}>>>(c3.toKernel(),c2.toKernel());
hipDeviceSynchronize();
cudaDeviceSynchronize();
hipMemcpyFromSymbol(dev_mem,global_cuda_error_array,sizeof(dev_mem));
cudaMemcpyFromSymbol(dev_mem,global_cuda_error_array,sizeof(dev_mem));
BOOST_REQUIRE_EQUAL(dev_mem[0],1);
BOOST_REQUIRE_EQUAL(*(size_t *)(&dev_mem[1]),(size_t)(c3.toKernel().template getPointer<1>()));
......@@ -659,10 +657,10 @@ BOOST_AUTO_TEST_CASE (gpu_grid_test_se_class1)
int dev_mem2[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
test_se1_crash_gt3<<<{32,1,1},{16,1,1}>>>(c2.toKernel(),c3.toKernel());
hipDeviceSynchronize();
cudaDeviceSynchronize();
hipMemcpyFromSymbol(dev_mem2,global_cuda_error_array,sizeof(dev_mem2));
cudaMemcpyFromSymbol(dev_mem2,global_cuda_error_array,sizeof(dev_mem2));
BOOST_REQUIRE_EQUAL(dev_mem2[0],1);
BOOST_REQUIRE_EQUAL(*(size_t *)(&dev_mem2[1]),(size_t)(c2.toKernel().template getPointer<2>()));
......
#include <hip/hip_runtime.h>
#include "config.h"
#include <Grid/map_grid.hpp>
#include "Point_test.hpp"
......@@ -84,7 +82,7 @@ void gpu_grid_3D_one(grid_gpu<3,Point_aggr_test> & g)
float * prp_0 = (float *)g.getDeviceBuffer<0>();
hipLaunchKernelGGL(fill_one, dim3(grid), dim3(threads), 0, 0, prp_0,64);
fill_one<<< grid, threads >>>(prp_0,64);
}
// call compute
......@@ -97,7 +95,7 @@ void gpu_grid_3D_compute(grid_gpu<3,Point_aggr_test> & g)
float * prp_0 = (float *)g.getDeviceBuffer<0>();
hipLaunchKernelGGL(fill_count, dim3(grid), dim3(threads), 0, 0, prp_0,64);
fill_count<<< grid, threads >>>(prp_0,64);
}
void gpu_grid_3D_compute_stencil(grid_gpu<3,Point_aggr_test> & g1, grid_gpu<3,Point_aggr_test> & g2,
......@@ -110,7 +108,7 @@ void gpu_grid_3D_compute_stencil(grid_gpu<3,Point_aggr_test> & g1, grid_gpu<3,Po
auto gpu_it = g2.getGPUIterator(start,stop);
hipLaunchKernelGGL(compute_stencil, dim3(gpu_it.thr), dim3(gpu_it.wthr), 0, 0, prp_0,prp_1,64,start,stop);
compute_stencil<<< gpu_it.thr, gpu_it.wthr >>>(prp_0,prp_1,64,start,stop);
}
void gpu_grid_3D_compute_grid_stencil(grid_gpu<3,Point_aggr_test> & g1, grid_gpu<3,Point_aggr_test> & g2,
......@@ -121,27 +119,27 @@ void gpu_grid_3D_compute_grid_stencil(grid_gpu<3,Point_aggr_test> & g1, grid_gpu
auto g1k = g1.toKernel();
auto g2k = g2.toKernel();
hipLaunchKernelGGL(compute_stencil_grid, dim3(gpu_it.thr), dim3(gpu_it.wthr), 0, 0, g1k,g2k,gpu_it);
compute_stencil_grid<<< gpu_it.thr, gpu_it.wthr >>>(g1k,g2k,gpu_it);
}
void gpu_grid_fill_vector(grid_gpu<3,Point_aggr_test> & g1, grid_key_dx<3> & start, grid_key_dx<3> & stop)
{
auto gpu_it = g1.getGPUIterator(start,stop);
hipLaunchKernelGGL(grid_fill_vector, dim3(gpu_it.thr), dim3(gpu_it.wthr), 0, 0, g1.toKernel(),gpu_it);
grid_fill_vector<<< gpu_it.thr, gpu_it.wthr >>>(g1.toKernel(),gpu_it);
}
void gpu_grid_fill_vector2(grid_gpu<3,Point_aggr_test> & g1, grid_key_dx<3> & start, grid_key_dx<3> & stop)
{
auto gpu_it = g1.getGPUIterator(start,stop);
hipLaunchKernelGGL(grid_fill_vector2, dim3(gpu_it.thr), dim3(gpu_it.wthr), 0, 0, g1.toKernel(),gpu_it);
grid_fill_vector2<<< gpu_it.thr, gpu_it.wthr >>>(g1.toKernel(),gpu_it);
}
void gpu_grid_gradient_vector(grid_gpu<3,Point_aggr_test> & g1, grid_gpu<3,Point_aggr_test> & g2, grid_key_dx<3> & start, grid_key_dx<3> & stop)
{
auto gpu_it = g1.getGPUIterator(start,stop);
hipLaunchKernelGGL(grid_gradient_vector, dim3(gpu_it.thr), dim3(gpu_it.wthr), 0, 0, g1.toKernel(),g2.toKernel(),gpu_it);
grid_gradient_vector<<< gpu_it.thr, gpu_it.wthr >>>(g1.toKernel(),g2.toKernel(),gpu_it);
}
......@@ -7,8 +7,6 @@
#define BOOST_GPU_ENABLED __host__ __device__
#include <hip/hip_runtime.h>
#include "config.h"
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
......@@ -69,7 +67,7 @@ void test_sub_index()
}
cl_n.resize(17*17*17);
CUDA_SAFE(hipMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(cnt_type)));
CUDA_SAFE(cudaMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(cnt_type)));
part_ids.resize(pl.size());
......@@ -85,7 +83,7 @@ void test_sub_index()
no_transform_only<dim,T> t(mt,pt);
hipLaunchKernelGGL(HIP_KERNEL_NAME(subindex<false,dim,T,cnt_type,ids_type,no_transform_only<dim,T>>), dim3(ite.wthr), dim3(ite.thr), 0, 0, div,
subindex<false,dim,T,cnt_type,ids_type,no_transform_only<dim,T>><<<ite.wthr,ite.thr>>>(div,
spacing,
off,
t,
......@@ -218,7 +216,7 @@ void test_sub_index2()
}
cl_n.resize(17*17*17);
CUDA_SAFE(hipMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(cnt_type)));
CUDA_SAFE(cudaMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(cnt_type)));
part_ids.resize(pl.size());
......@@ -233,7 +231,7 @@ void test_sub_index2()
shift_only<dim,T> t(mt,pt);
hipLaunchKernelGGL(HIP_KERNEL_NAME(subindex<false,dim,T,cnt_type,ids_type,shift_only<dim,T>>), dim3(ite.wthr), dim3(ite.thr), 0, 0, div,
subindex<false,dim,T,cnt_type,ids_type,shift_only<dim,T>><<<ite.wthr,ite.thr>>>(div,
spacing,
off,
t,
......@@ -443,7 +441,7 @@ void test_fill_cell()
part_ids.template hostToDevice<0>();
// Here we test fill cell
hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>), dim3(itgg.wthr), dim3(itgg.thr), 0, 0, 0,
fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>><<<itgg.wthr,itgg.thr>>>(0,
div_c,
off,
part_ids.size(),
......@@ -451,7 +449,7 @@ void test_fill_cell()
0,
static_cast<cnt_type *>(starts.template getDeviceBuffer<0>()),
static_cast<ids_type *>(part_ids.template getDeviceBuffer<0>()),
static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()));
static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()) );
cells.template deviceToHost<0>();
......@@ -523,7 +521,7 @@ void test_cell_count_n()
size_t sz[] = {17,17,17};
grid_sm<3,void> gs(sz);
hipLaunchKernelGGL(HIP_KERNEL_NAME(construct_cells), dim3(1), dim3(1), 0, 0, vs.toKernel(),gs);
construct_cells<<<1,1>>>(vs.toKernel(),gs);
mgpu::ofp_context_t ctx;
......@@ -553,7 +551,7 @@ void test_cell_count_n()
cells_nn_test.template hostToDevice<0>();
auto itgg = vs.getGPUIterator();
hipLaunchKernelGGL(HIP_KERNEL_NAME((count_nn_cells)), dim3(), dim3(), 0, 0, vs.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel());
CUDA_LAUNCH((count_nn_cells),itgg,vs.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel());
cells_nn.deviceToHost<0>();
......@@ -574,7 +572,7 @@ void test_cell_count_n()
openfpm::vector_gpu<aggregate<unsigned int,unsigned int>> cell_nn_list;
cell_nn_list.resize(7*8 + 9 + 2 + 1);
hipLaunchKernelGGL(HIP_KERNEL_NAME((fill_nn_cells)), dim3(), dim3(), 0, 0, vs.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel(),cell_nn_list.toKernel(),200);
CUDA_LAUNCH((fill_nn_cells),itgg,vs.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel(),cell_nn_list.toKernel(),200);
cell_nn_list.deviceToHost<0>();
......@@ -722,11 +720,11 @@ void test_reorder_parts(size_t n_part)
parts_prp.template hostToDevice<0,1,2,3>();
// Here we test fill cell
hipLaunchKernelGGL(HIP_KERNEL_NAME(reorder_parts<decltype(parts_prp.toKernel()),
reorder_parts<decltype(parts_prp.toKernel()),
decltype(pl.toKernel()),
decltype(sort_to_not_sort.toKernel()),
cnt_type,
shift_ph<0,cnt_type>>), dim3(ite.wthr), dim3(ite.thr), 0, 0, pl.size(),
shift_ph<0,cnt_type>><<<ite.wthr,ite.thr>>>(pl.size(),
parts_prp.toKernel(),
parts_prp_out.toKernel(),
pl.toKernel(),
......@@ -1203,7 +1201,7 @@ struct execute_cl_test
{
auto ite = pl.getGPUIterator();
hipLaunchKernelGGL(HIP_KERNEL_NAME((calc_force_number)), dim3(), dim3(), 0, 0, pl.toKernel(),s_t_ns.toKernel(),
CUDA_LAUNCH((calc_force_number),ite,pl.toKernel(),s_t_ns.toKernel(),
cl2.toKernel(),n_out.toKernel());
}
......@@ -1212,7 +1210,7 @@ struct execute_cl_test
{
auto ite = pl.getGPUIterator();
hipLaunchKernelGGL(HIP_KERNEL_NAME((calc_force_list)), dim3(), dim3(), 0, 0, pl.toKernel(),
CUDA_LAUNCH((calc_force_list),ite,pl.toKernel(),
s_t_ns.toKernel(),
cl2.toKernel(),
n_out_scan.toKernel(),
......@@ -1241,10 +1239,11 @@ struct execute_cl_test<1>
{
auto ite = pl.getGPUIterator();
hipLaunchKernelGGL(HIP_KERNEL_NAME((calc_force_number_rad<decltype(pl.toKernel()),
CUDA_LAUNCH((calc_force_number_rad<decltype(pl.toKernel()),
decltype(s_t_ns.toKernel()),
decltype(cl2.toKernel()),
decltype(n_out.toKernel())>)), dim3(), dim3(), 0, 0, pl.toKernel(),
decltype(n_out.toKernel())>),
ite,pl.toKernel(),
s_t_ns.toKernel(),
cl2.toKernel(),
n_out.toKernel());
......@@ -1255,10 +1254,11 @@ struct execute_cl_test<1>
{
auto ite = pl.getGPUIterator();
hipLaunchKernelGGL(HIP_KERNEL_NAME((calc_force_list_rad<decltype(pl.toKernel()),
CUDA_LAUNCH((calc_force_list_rad<decltype(pl.toKernel()),
decltype(s_t_ns.toKernel()),
decltype(cl2.toKernel()),
decltype(nn_list.toKernel())>)), dim3(), dim3(), 0, 0, pl.toKernel(),
decltype(nn_list.toKernel())>),
ite,pl.toKernel(),
s_t_ns.toKernel(),
cl2.toKernel(),
n_out_scan.toKernel(),
......@@ -1287,10 +1287,11 @@ struct execute_cl_test<2>
{
auto ite = s_t_ns.getGPUIterator();
hipLaunchKernelGGL(HIP_KERNEL_NAME((calc_force_number_box_noato<decltype(pl.toKernel()),
CUDA_LAUNCH((calc_force_number_box_noato<decltype(pl.toKernel()),
decltype(s_t_ns.toKernel()),
decltype(cl2.toKernel()),
decltype(n_out.toKernel())>)), dim3(), dim3(), 0, 0, pl.toKernel(),
decltype(n_out.toKernel())>),
ite,pl.toKernel(),
s_t_ns.toKernel(),
cl2.toKernel(),
n_out.toKernel(),
......@@ -1302,10 +1303,12 @@ struct execute_cl_test<2>
{
auto ite = s_t_ns.getGPUIterator();
hipLaunchKernelGGL(HIP_KERNEL_NAME((calc_force_number_box<decltype(pl.toKernel()),
CUDA_LAUNCH((calc_force_number_box<decltype(pl.toKernel()),
decltype(s_t_ns.toKernel()),
decltype(cl2.toKernel()),
decltype(n_out.toKernel())>)), dim3(), dim3(), 0, 0, pl.toKernel(),
decltype(n_out.toKernel())>),
ite,
pl.toKernel(),
s_t_ns.toKernel(),
cl2.toKernel(),
n_out.toKernel(),
......@@ -1317,10 +1320,11 @@ struct execute_cl_test<2>
{
auto ite = s_t_ns.getGPUIterator();
hipLaunchKernelGGL(HIP_KERNEL_NAME((calc_force_list_box<decltype(pl.toKernel()),
CUDA_LAUNCH((calc_force_list_box<decltype(pl.toKernel()),
decltype(s_t_ns.toKernel()),
decltype(cl2.toKernel()),
decltype(nn_list.toKernel())>)), dim3(), dim3(), 0, 0, pl.toKernel(),
decltype(nn_list.toKernel())>),
ite,pl.toKernel(),
s_t_ns.toKernel(),
cl2.toKernel(),
n_out_scan.toKernel(),
......@@ -1337,7 +1341,7 @@ struct execute_cl_test<2>
{
auto ite = s_t_ns.getGPUIterator();
hipLaunchKernelGGL(HIP_KERNEL_NAME((calc_force_list_box_partial)), dim3(), dim3(), 0, 0, pl.toKernel(),
CUDA_LAUNCH((calc_force_list_box_partial),ite,pl.toKernel(),
s_t_ns.toKernel(),
cl2.toKernel(),
n_out_scan.toKernel(),
......@@ -2019,7 +2023,7 @@ BOOST_AUTO_TEST_CASE( CellList_use_cpu_offload_test )
cl1.hostToDevice();
v.hostToDevice<0>();
hipLaunchKernelGGL(HIP_KERNEL_NAME(cl_offload_gpu<decltype(cl1.toKernel()),decltype(v.toKernel()),decltype(os.toKernel())>), dim3(ite.wthr), dim3(ite.thr), 0, 0, cl1.toKernel(),v.toKernel(),os.toKernel());
cl_offload_gpu<decltype(cl1.toKernel()),decltype(v.toKernel()),decltype(os.toKernel())><<<ite.wthr,ite.thr>>>(cl1.toKernel(),v.toKernel(),os.toKernel());
os.deviceToHost<0>();
......@@ -2048,8 +2052,9 @@ BOOST_AUTO_TEST_CASE( CellList_use_cpu_offload_test )
openfpm::vector_gpu<aggregate<int>> os_list;
os_list.resize(size_list);
hipLaunchKernelGGL(HIP_KERNEL_NAME(cl_offload_gpu_list<decltype(cl1.toKernel()),decltype(v.toKernel()),
decltype(os_scan.toKernel()),decltype(os_list.toKernel())>), dim3(ite.wthr), dim3(ite.thr), 0, 0, cl1.toKernel(),v.toKernel(),os_scan.toKernel(),os_list.toKernel());
cl_offload_gpu_list<decltype(cl1.toKernel()),decltype(v.toKernel()),
decltype(os_scan.toKernel()),decltype(os_list.toKernel())><<<ite.wthr,ite.thr>>>
(cl1.toKernel(),v.toKernel(),os_scan.toKernel(),os_list.toKernel());
os_list.deviceToHost<0>();
......
#include <hip/hip_runtime.h>
#include "config.h"
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
......@@ -37,14 +35,14 @@ BOOST_AUTO_TEST_CASE( CellDecomposer_gpu_test_use )
openfpm::vector_gpu<aggregate<grid_key_dx<3,unsigned int>>> output(8);
hipLaunchKernelGGL(HIP_KERNEL_NAME(check), dim3(1), dim3(1), 0, 0, output.toKernel(),clk,0,Point<3,float>({0.2,0.2,0.2}));
hipLaunchKernelGGL(HIP_KERNEL_NAME(check), dim3(1), dim3(1), 0, 0, output.toKernel(),clk,1,Point<3,float>({0.1,0.2,0.3}));
hipLaunchKernelGGL(HIP_KERNEL_NAME(check), dim3(1), dim3(1), 0, 0, output.toKernel(),clk,2,Point<3,float>({0.25,0.55,0.45}));
hipLaunchKernelGGL(HIP_KERNEL_NAME(check), dim3(1), dim3(1), 0, 0, output.toKernel(),clk,3,Point<3,float>({0.15,0.15,0.95}));
hipLaunchKernelGGL(HIP_KERNEL_NAME(check), dim3(1), dim3(1), 0, 0, output.toKernel(),clk,4,Point<3,float>({1.05,1.05,1.05}));
hipLaunchKernelGGL(HIP_KERNEL_NAME(check), dim3(1), dim3(1), 0, 0, output.toKernel(),clk,5,Point<3,float>({1.15,1.15,1.15}));
hipLaunchKernelGGL(HIP_KERNEL_NAME(check), dim3(1), dim3(1), 0, 0, output.toKernel(),clk,6,Point<3,float>({-0.05,-0.05,-0.05}));
hipLaunchKernelGGL(HIP_KERNEL_NAME(check), dim3(1), dim3(1), 0, 0, output.toKernel(),clk,7,Point<3,float>({-0.15,-0.15,-0.15}));
check<<<1,1>>>(output.toKernel(),clk,0,Point<3,float>({0.2,0.2,0.2}));
check<<<1,1>>>(output.toKernel(),clk,1,Point<3,float>({0.1,0.2,0.3}));
check<<<1,1>>>(output.toKernel(),clk,2,Point<3,float>({0.25,0.55,0.45}));
check<<<1,1>>>(output.toKernel(),clk,3,Point<3,float>({0.15,0.15,0.95}));
check<<<1,1>>>(output.toKernel(),clk,4,Point<3,float>({1.05,1.05,1.05}));
check<<<1,1>>>(output.toKernel(),clk,5,Point<3,float>({1.15,1.15,1.15}));
check<<<1,1>>>(output.toKernel(),clk,6,Point<3,float>({-0.05,-0.05,-0.05}));
check<<<1,1>>>(output.toKernel(),clk,7,Point<3,float>({-0.15,-0.15,-0.15}));
output.template deviceToHost<0>();
......
......@@ -1373,7 +1373,7 @@ public:
}
hipDeviceSynchronize();
cudaDeviceSynchronize();
}
template<typename NNtype = NNStar<dim>>
......
......@@ -3,8 +3,6 @@
//
#define BOOST_TEST_DYN_LINK
#include <hip/hip_runtime.h>
#include "config.h"
#include <boost/test/unit_test.hpp>
#include "SparseGridGpu/BlockMapGpu.hpp"
......@@ -132,7 +130,8 @@ BOOST_AUTO_TEST_CASE(testSegreduce_total)
#ifndef __HIPCC__
hipLaunchKernelGGL(HIP_KERNEL_NAME((BlockMapGpuKernels::segreduce_total<BLOCK, 0, BITMASK, 2, mgpu::plus_t<ScalarT>>)), dim3(segments.size()-1), dim3(2*BlockT::size), 0, 0, data_new.toKernel(),
CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<BLOCK, 0, BITMASK, 2, mgpu::plus_t<ScalarT>>),segments.size()-1, 2*BlockT::size,
data_new.toKernel(),
data_old.toKernel(),
segments.toKernel(),
segment_dataMap.toKernel(),
......@@ -141,7 +140,8 @@ BOOST_AUTO_TEST_CASE(testSegreduce_total)
outputData.toKernel());
// Segreduce on mask
hipLaunchKernelGGL(HIP_KERNEL_NAME((BlockMapGpuKernels::segreduce_total<BITMASK, 0, BITMASK, 2, mgpu::maximum_t<unsigned char>>)), dim3(segments.size()-1), dim3(2*BlockT::size), 0, 0, data_new.toKernel(),
CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<BITMASK, 0, BITMASK, 2, mgpu::maximum_t<unsigned char>>),segments.size()-1, 2*BlockT::size,
data_new.toKernel(),
data_old.toKernel(),
segments.toKernel(),
segment_dataMap.toKernel(),
......@@ -284,7 +284,7 @@ BOOST_AUTO_TEST_CASE(test_maps_create)
auto ite = merge_indexes.getGPUIterator();
hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockMapGpuKernels::compute_predicate), dim3(), dim3(), 0, 0, merge_keys.toKernel(),merge_indexes.toKernel(),9,p_ids.toKernel());
CUDA_LAUNCH(BlockMapGpuKernels::compute_predicate,ite,merge_keys.toKernel(),merge_indexes.toKernel(),9,p_ids.toKernel());
mgpu::ofp_context_t context(false);
openfpm::scan((int *)p_ids.template getDeviceBuffer<0>(),
......@@ -324,7 +324,7 @@ BOOST_AUTO_TEST_CASE(test_maps_create)
copy_old_src.resize(copy_old_size);
copy_old_dst.resize(copy_old_size);
hipLaunchKernelGGL(HIP_KERNEL_NAME(BlockMapGpuKernels::maps_create), dim3(), dim3(), 0, 0, s_ids.toKernel(),p_ids.toKernel(),segments_oldData.toKernel(),outputMap.toKernel(),copy_old_dst.toKernel(),copy_old_src.toKernel());
CUDA_LAUNCH(BlockMapGpuKernels::maps_create,ite,s_ids.toKernel(),p_ids.toKernel(),segments_oldData.toKernel(),outputMap.toKernel(),copy_old_dst.toKernel(),copy_old_src.toKernel());
segments_oldData.template deviceToHost<0>();
outputMap.template deviceToHost<0>();
......
......@@ -3,8 +3,6 @@
//
#define BOOST_TEST_DYN_LINK
#include <hip/hip_runtime.h>
#include "config.h"
#include <boost/test/unit_test.hpp>
#include "SparseGridGpu/BlockMapGpu.hpp"
......@@ -108,7 +106,7 @@ BOOST_AUTO_TEST_CASE(testBackground)
// Get output
openfpm::vector_gpu<AggregateOutT> output;
output.resize(gridSize * blockSize);
hipLaunchKernelGGL(HIP_KERNEL_NAME(copyBlocksToOutput<0>), dim3(gridSize), dim3(blockSize), 0, 0, sparseGrid.toKernel(), output.toKernel());
copyBlocksToOutput<0> <<< gridSize, blockSize >>> (sparseGrid.toKernel(), output.toKernel());
output.template deviceToHost<0>();
sparseGrid.template deviceToHost<0>();
......@@ -141,7 +139,7 @@ BOOST_AUTO_TEST_CASE(testInsert)
blockMap.setGPUInsertBuffer(gridSize, bufferPoolSize);
// Insert values
hipLaunchKernelGGL(HIP_KERNEL_NAME(insertValues<0>), dim3(gridSize), dim3(blockSizeInsert), 0, 0, blockMap.toKernel());
insertValues<0> <<< gridSize, blockSizeInsert >>> (blockMap.toKernel());
// Flush inserts
mgpu::ofp_context_t ctx;
......@@ -151,7 +149,7 @@ BOOST_AUTO_TEST_CASE(testInsert)
openfpm::vector_gpu<AggregateOutT> output;
output.resize(gridSizeRead * blockSizeRead);
hipLaunchKernelGGL(HIP_KERNEL_NAME(copyBlocksToOutput<0>), dim3(gridSizeRead), dim3(blockSizeRead), 0, 0, blockMap.toKernel(), output.toKernel());
copyBlocksToOutput<0> <<< gridSizeRead, blockSizeRead >>> (blockMap.toKernel(), output.toKernel());
output.template deviceToHost<0>();
blockMap.template deviceToHost<0>();
......@@ -191,7 +189,7 @@ BOOST_AUTO_TEST_CASE(testInsert_halfBlock) //todo
blockMap.setGPUInsertBuffer(gridSize, bufferPoolSize);
// Insert values
hipLaunchKernelGGL(HIP_KERNEL_NAME(insertValuesHalfBlock<0>), dim3(gridSize), dim3(blockSizeInsert), 0, 0, blockMap.toKernel());
insertValuesHalfBlock<0> <<< gridSize, blockSizeInsert >>> (blockMap.toKernel());
// Flush inserts
mgpu::ofp_context_t ctx;
......@@ -201,7 +199,7 @@ BOOST_AUTO_TEST_CASE(testInsert_halfBlock) //todo
openfpm::vector_gpu<AggregateOutT> output;
output.resize(gridSizeRead * blockSizeRead);
hipLaunchKernelGGL(HIP_KERNEL_NAME(copyBlocksToOutput<0>), dim3(gridSizeRead), dim3(blockSizeRead), 0, 0, blockMap.toKernel(), output.toKernel());
copyBlocksToOutput<0> <<< gridSizeRead, blockSizeRead >>> (blockMap.toKernel(), output.toKernel());
output.template deviceToHost<0>();
blockMap.template deviceToHost<0>();
......@@ -255,7 +253,7 @@ BOOST_AUTO_TEST_CASE(testInsert_blocked)
sparseGrid.setGPUInsertBuffer(gridSize, bufferPoolSize);
// Insert values
hipLaunchKernelGGL(HIP_KERNEL_NAME(insertValuesBlocked<0, 2>), dim3(gridSize), dim3(blockSizeInsert), 0, 0, sparseGrid.toKernel());
insertValuesBlocked<0, 2> <<< gridSize, blockSizeInsert >>> (sparseGrid.toKernel());
// Flush inserts
mgpu::ofp_context_t ctx;
......@@ -265,7 +263,7 @@ BOOST_AUTO_TEST_CASE(testInsert_blocked)
openfpm::vector_gpu<AggregateOutT> output;
output.resize(gridSizeRead * blockSizeRead);
hipLaunchKernelGGL(HIP_KERNEL_NAME(copyBlocksToOutput<0>), dim3(gridSizeRead), dim3(blockSizeRead), 0, 0, sparseGrid.toKernel(), output.toKernel());
copyBlocksToOutput<0> <<< gridSizeRead, blockSizeRead >>> (sparseGrid.toKernel(), output.toKernel());
output.template deviceToHost<0>();
sparseGrid.template deviceToHost<0>();
......
......@@ -529,7 +529,7 @@ void testConv3x3x3_perf(std::string testName)
for (unsigned int iter=0; iter<iterations; ++iter)
{
hipDeviceSynchronize();
cudaDeviceSynchronize();
conv_coeff cc;
for (int i = 0 ; i < 3 ; i++)
......@@ -548,7 +548,7 @@ void testConv3x3x3_perf(std::string testName)
sparseGrid.template applyStencils<Conv3x3x3<dim,0,1>>(STENCIL_MODE_INPLACE,cc);
hipDeviceSynchronize();
cudaDeviceSynchronize();
ts.stop();
measures_tm.add(ts.getwct());
......@@ -624,7 +624,7 @@ static void testConv3x3x3_no_shared_perf(std::string testName)
for (unsigned int iter=0; iter<iterations; ++iter)
{
hipDeviceSynchronize();
cudaDeviceSynchronize();
conv_coeff cc;
for (int i = 0 ; i < 3 ; i++)
......@@ -643,7 +643,7 @@ static void testConv3x3x3_no_shared_perf(std::string testName)
sparseGrid.template applyStencils<Conv3x3x3_noshared<SparseGridZ::dims,0,1>>(STENCIL_MODE_INPLACE_NO_SHARED,cc);
hipDeviceSynchronize();
cudaDeviceSynchronize();
ts.stop();
measures_tm.add(ts.getwct());
......
......@@ -8,8 +8,6 @@
#define BOOST_GPU_ENABLED __host__ __device__
#include <hip/hip_runtime.h>
#include "config.h"
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
......@@ -169,7 +167,7 @@ BOOST_AUTO_TEST_CASE( vector_cuda_to_kernel_recursive2_test_toKernel )
openfpm::vector_gpu<aggregate<long int>> vg;
vg.resize(9);
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel_recursive_check), dim3(1), dim3(1), 0, 0, tt2.toKernel(),vg.toKernel());
kernel_recursive_check<<<1,1>>>(tt2.toKernel(),vg.toKernel());
vg.template deviceToHost<0>();
......
......@@ -7,8 +7,6 @@
#define BOOST_TEST_DYN_LINK
#include <hip/hip_runtime.h>
#include "config.h"
#include <boost/test/unit_test.hpp>
#include "Vector/map_vector_sparse.hpp"
......@@ -203,18 +201,18 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu )
vs.setGPUInsertBuffer(10,1024);
// we launch a kernel to insert data
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse), dim3(10), dim3(100), 0, 0, vs.toKernel());
test_insert_sparse<<<10,100>>>(vs.toKernel());
mgpu::ofp_context_t ctx;
vs.flush<sadd_<0>,smin_<1>,smax_<2> >(ctx,flush_type::FLUSH_ON_DEVICE);
vs.setGPUInsertBuffer(10,1024);
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse2), dim3(10), dim3(100), 0, 0, vs.toKernel());
test_insert_sparse2<<<10,100>>>(vs.toKernel());
vs.flush<sadd_<0>,smin_<1>,smax_<2> >(ctx,flush_type::FLUSH_ON_DEVICE);
vs.setGPUInsertBuffer(4000,512);
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse3), dim3(4000), dim3(256), 0, 0, vs.toKernel());
test_insert_sparse3<<<4000,256>>>(vs.toKernel());
vs.flush<sadd_<0>,smin_<1>,smax_<2> >(ctx,flush_type::FLUSH_ON_DEVICE);
......@@ -222,7 +220,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu )
output.resize(1500);
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sparse_get_test), dim3(10), dim3(150), 0, 0, vs.toKernel(),output.toKernel());
test_sparse_get_test<<<10,150>>>(vs.toKernel(),output.toKernel());
output.template deviceToHost<0,1,2>();
vs.template deviceToHost<0,1,2>();
......@@ -239,7 +237,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu )
vs.clear();
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_sparse_get_test), dim3(10), dim3(150), 0, 0, vs.toKernel(),output.toKernel());
test_sparse_get_test<<<10,150>>>(vs.toKernel(),output.toKernel());
output.template deviceToHost<0,1,2>();
vs.template deviceToHost<0,1,2>();
......@@ -328,10 +326,10 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_incremental_add )
vs.setGPUInsertBuffer(10,1024);
// we launch a kernel to insert data
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse), dim3(10), dim3(100), 0, 0, vs.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse2_inc), dim3(10), dim3(100), 0, 0, vs.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse2_inc), dim3(10), dim3(100), 0, 0, vs.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse2_inc), dim3(10), dim3(100), 0, 0, vs.toKernel());
test_insert_sparse<<<10,100>>>(vs.toKernel());
test_insert_sparse2_inc<<<10,100>>>(vs.toKernel());
test_insert_sparse2_inc<<<10,100>>>(vs.toKernel());
test_insert_sparse2_inc<<<10,100>>>(vs.toKernel());
mgpu::ofp_context_t ctx;
......@@ -379,7 +377,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_get )
vs.setGPUInsertBuffer(10,1024);
// we launch a kernel to insert data
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse), dim3(10), dim3(100), 0, 0, vs.toKernel());
test_insert_sparse<<<10,100>>>(vs.toKernel());
mgpu::ofp_context_t ctx;
......@@ -398,7 +396,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_get )
BOOST_REQUIRE_EQUAL(match,true);
vs.setGPUInsertBuffer(10,1024);
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse2), dim3(10), dim3(100), 0, 0, vs.toKernel());
test_insert_sparse2<<<10,100>>>(vs.toKernel());
vs.flush<sadd_<0>,smin_<1>,smax_<2> >(ctx,flush_type::FLUSH_ON_DEVICE);
......@@ -436,7 +434,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_get )
BOOST_REQUIRE_EQUAL(match,true);
vs.setGPUInsertBuffer(4000,512);
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse3), dim3(4000), dim3(256), 0, 0, vs.toKernel());
test_insert_sparse3<<<4000,256>>>(vs.toKernel());
vs.flush<sadd_<0>,smin_<1>,smax_<2> >(ctx,flush_type::FLUSH_ON_DEVICE);
vs.template deviceToHost<0,1,2>();
......@@ -491,10 +489,10 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_special_function )
vs.setGPUInsertBuffer(10,1024);
// we launch a kernel to insert data
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse), dim3(10), dim3(100), 0, 0, vs.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse2_inc), dim3(10), dim3(100), 0, 0, vs.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse2_inc), dim3(10), dim3(100), 0, 0, vs.toKernel());
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse2_inc), dim3(10), dim3(100), 0, 0, vs.toKernel());
test_insert_sparse<<<10,100>>>(vs.toKernel());
test_insert_sparse2_inc<<<10,100>>>(vs.toKernel());
test_insert_sparse2_inc<<<10,100>>>(vs.toKernel());
test_insert_sparse2_inc<<<10,100>>>(vs.toKernel());
mgpu::ofp_context_t ctx;
......@@ -623,24 +621,24 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_remove )
vs.setGPUInsertBuffer(10,1024);
// we launch a kernel to insert data
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse), dim3(10), dim3(100), 0, 0, vs.toKernel());
test_insert_sparse<<<10,100>>>(vs.toKernel());
mgpu::ofp_context_t ctx;
vs.flush<sadd_<0>,smin_<1>,smax_<2> >(ctx,flush_type::FLUSH_ON_DEVICE);
vs.setGPUInsertBuffer(10,1024);
hipLaunchKernelGGL(HIP_KERNEL_NAME(test_insert_sparse2), dim3(10), dim3(100), 0, 0, vs.toKernel());
test_insert_sparse2<<<10,100>>>(vs.toKernel());
vs.flush<sadd_<0>,smin_<1>,smax_<2> >(ctx,flush_type::FLUSH_ON_DEVICE);
vs.setGPUInsertBuffer(4000,512);