From 29305a648f37f6e1f8adf2d0f719406044ea1565 Mon Sep 17 00:00:00 2001 From: Serhii Yaskovets <yaskovet@mpi-cbg.de> Date: Mon, 31 Jul 2023 18:48:29 +0200 Subject: [PATCH] Move parallel primitives lib: moderngpu to CUB --- src/CMakeLists.txt | 46 +- src/Grid/cuda/cuda_grid_gpu_int.cu | 2 +- src/NN/CellList/CellList_gpu_test.cu | 16 +- src/NN/CellList/CellList_util.hpp | 14 +- src/NN/CellList/cuda/CellList_gpu.hpp | 28 +- src/NN/VerletList/VerletListFast.hpp | 2 +- src/SparseGridGpu/BlockMapGpu.hpp | 2 +- src/SparseGridGpu/BlockMapGpu_kernels.cuh | 6 +- src/SparseGridGpu/SparseGridGpu.hpp | 34 +- .../SparseGridGpu_performance_get_nn.cu | 2 +- .../SparseGridGpu_performance_get_single.cu | 2 +- .../SparseGridGpu_performance_heat_stencil.cu | 2 +- ...arseGridGpu_performance_heat_stencil_3d.cu | 4 +- ...GridGpu_performance_heat_stencil_sparse.cu | 2 +- .../SparseGridGpu_performance_insert_block.cu | 2 +- ...SparseGridGpu_performance_insert_single.cu | 2 +- .../SparseGridGpu_performance_tests.cu | 4 +- .../tests/BlockMapGpu_kernels_tests.cu | 8 +- src/SparseGridGpu/tests/BlockMapGpu_tests.cu | 6 +- .../tests/SparseGridGpu_tests.cu | 46 +- .../tests/utils/SparseGridGpu_util_test.cuh | 4 +- .../map_vector_sparse_cuda_ker_unit_tests.cu | 12 +- .../cuda/map_vector_sparse_cuda_kernels.cuh | 50 +- ...p_vector_sparse_cuda_kernels_unit_tests.cu | 14 +- src/Vector/map_vector_sparse.hpp | 71 ++- src/Vector/map_vector_sparse_unit_tests.cu | 2 +- src/util/cuda/merge_ofp.cuh | 13 +- src/util/cuda/modern_gpu_tests.cu | 222 --------- src/util/cuda/moderngpu/context.hxx | 221 --------- src/util/cuda/moderngpu/context_reduced.hxx | 107 ----- src/util/cuda/moderngpu/cpp11.hxx | 154 ------ src/util/cuda/moderngpu/cta_load_balance.hxx | 263 ----------- src/util/cuda/moderngpu/cta_merge.hxx | 209 --------- src/util/cuda/moderngpu/cta_mergesort.hxx | 140 ------ src/util/cuda/moderngpu/cta_reduce.hxx | 134 ------ src/util/cuda/moderngpu/cta_scan.hxx | 231 --------- src/util/cuda/moderngpu/cta_search.hxx | 100 ---- src/util/cuda/moderngpu/cta_segscan.hxx | 119 ----- src/util/cuda/moderngpu/cta_segsort.hxx | 226 --------- src/util/cuda/moderngpu/intrinsics.hxx | 363 -------------- src/util/cuda/moderngpu/kernel_bulkinsert.hxx | 18 - src/util/cuda/moderngpu/kernel_bulkremove.hxx | 91 ---- src/util/cuda/moderngpu/kernel_compact.hxx | 139 ------ .../cuda/moderngpu/kernel_intervalmove.hxx | 67 --- src/util/cuda/moderngpu/kernel_join.hxx | 50 -- .../cuda/moderngpu/kernel_load_balance.hxx | 88 ---- src/util/cuda/moderngpu/kernel_merge.hxx | 92 ---- src/util/cuda/moderngpu/kernel_mergesort.hxx | 150 ------ src/util/cuda/moderngpu/kernel_reduce.hxx | 70 --- src/util/cuda/moderngpu/kernel_scan.hxx | 198 -------- src/util/cuda/moderngpu/kernel_segreduce.hxx | 406 ---------------- src/util/cuda/moderngpu/kernel_segsort.hxx | 444 ------------------ .../cuda/moderngpu/kernel_sortedsearch.hxx | 64 --- src/util/cuda/moderngpu/kernel_workcreate.hxx | 272 ----------- src/util/cuda/moderngpu/launch_box.hxx | 93 ---- src/util/cuda/moderngpu/launch_params.hxx | 152 ------ src/util/cuda/moderngpu/loadstore.hxx | 188 -------- src/util/cuda/moderngpu/memory.hxx | 131 ------ src/util/cuda/moderngpu/meta.hxx | 249 ---------- src/util/cuda/moderngpu/operators.hxx | 347 -------------- src/util/cuda/moderngpu/search.hxx | 53 --- src/util/cuda/moderngpu/sort_networks.hxx | 57 --- src/util/cuda/moderngpu/transform.hxx | 107 ----- src/util/cuda/moderngpu/tuple.hxx | 393 ---------------- src/util/cuda/moderngpu/types.hxx | 147 ------ 
src/util/cuda/moderngpu/util.hxx | 30 -- src/util/cuda/ofp_context.hxx | 322 ------------- src/util/cuda/reduce_ofp.cuh | 77 +-- src/util/cuda/scan_ofp.cuh | 70 +-- src/util/cuda/scan_sort_cuda_unit_tests.cu | 24 +- src/util/cuda/segreduce_ofp.cuh | 112 ++--- src/util/cuda/sort_ofp.cuh | 276 ++++++----- .../cuda/test/segreduce_block_cuda_tests.cu | 6 +- 73 files changed, 405 insertions(+), 7463 deletions(-) delete mode 100644 src/util/cuda/modern_gpu_tests.cu delete mode 100644 src/util/cuda/moderngpu/context.hxx delete mode 100644 src/util/cuda/moderngpu/context_reduced.hxx delete mode 100644 src/util/cuda/moderngpu/cpp11.hxx delete mode 100644 src/util/cuda/moderngpu/cta_load_balance.hxx delete mode 100644 src/util/cuda/moderngpu/cta_merge.hxx delete mode 100644 src/util/cuda/moderngpu/cta_mergesort.hxx delete mode 100644 src/util/cuda/moderngpu/cta_reduce.hxx delete mode 100644 src/util/cuda/moderngpu/cta_scan.hxx delete mode 100644 src/util/cuda/moderngpu/cta_search.hxx delete mode 100644 src/util/cuda/moderngpu/cta_segscan.hxx delete mode 100644 src/util/cuda/moderngpu/cta_segsort.hxx delete mode 100644 src/util/cuda/moderngpu/intrinsics.hxx delete mode 100644 src/util/cuda/moderngpu/kernel_bulkinsert.hxx delete mode 100644 src/util/cuda/moderngpu/kernel_bulkremove.hxx delete mode 100644 src/util/cuda/moderngpu/kernel_compact.hxx delete mode 100644 src/util/cuda/moderngpu/kernel_intervalmove.hxx delete mode 100644 src/util/cuda/moderngpu/kernel_join.hxx delete mode 100644 src/util/cuda/moderngpu/kernel_load_balance.hxx delete mode 100644 src/util/cuda/moderngpu/kernel_merge.hxx delete mode 100644 src/util/cuda/moderngpu/kernel_mergesort.hxx delete mode 100644 src/util/cuda/moderngpu/kernel_reduce.hxx delete mode 100644 src/util/cuda/moderngpu/kernel_scan.hxx delete mode 100644 src/util/cuda/moderngpu/kernel_segreduce.hxx delete mode 100644 src/util/cuda/moderngpu/kernel_segsort.hxx delete mode 100644 src/util/cuda/moderngpu/kernel_sortedsearch.hxx delete mode 100644 src/util/cuda/moderngpu/kernel_workcreate.hxx delete mode 100644 src/util/cuda/moderngpu/launch_box.hxx delete mode 100644 src/util/cuda/moderngpu/launch_params.hxx delete mode 100644 src/util/cuda/moderngpu/loadstore.hxx delete mode 100644 src/util/cuda/moderngpu/memory.hxx delete mode 100644 src/util/cuda/moderngpu/meta.hxx delete mode 100644 src/util/cuda/moderngpu/operators.hxx delete mode 100644 src/util/cuda/moderngpu/search.hxx delete mode 100644 src/util/cuda/moderngpu/sort_networks.hxx delete mode 100644 src/util/cuda/moderngpu/transform.hxx delete mode 100644 src/util/cuda/moderngpu/tuple.hxx delete mode 100644 src/util/cuda/moderngpu/types.hxx delete mode 100644 src/util/cuda/moderngpu/util.hxx delete mode 100644 src/util/cuda/ofp_context.hxx diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cc9991a0..2aa84f19 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -3,8 +3,6 @@ cmake_minimum_required(VERSION 3.8 FATAL_ERROR) ########################### Executables -add_definitions(-DSCAN_WITH_CUB) - if (TEST_PERFORMANCE) set(CUDA_SOURCES SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu @@ -31,7 +29,6 @@ if (NOT CUDA_ON_BACKEND STREQUAL "None") NN/CellList/CellList_gpu_test.cu util/cuda/scan_sort_cuda_unit_tests.cu Grid/cuda/cuda_grid_unit_tests_func.cu - util/cuda/modern_gpu_tests.cu Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu NN/CellList/tests/CellDecomposer_gpu_ker_unit_test.cu SparseGridGpu/tests/BlockMapGpu_tests.cu @@ -448,8 +445,7 @@ install(FILES 
util/multi_array_openfpm/array_openfpm.hpp COMPONENT OpenFPM) -install(FILES util/cuda/ofp_context.hxx - util/cuda/kernels.cuh +install(FILES util/cuda/kernels.cuh util/cuda/scan_ofp.cuh util/cuda/sort_ofp.cuh util/cuda/reduce_ofp.cuh @@ -490,46 +486,6 @@ install (FILES SparseGridGpu/TemplateUtils/mathUtils.hpp DESTINATION openfpm_data/include/SparseGridGpu/TemplateUtils/ COMPONENT OpenFPM) -install(FILES util/cuda/moderngpu/context.hxx - util/cuda/moderngpu/context_reduced.hxx - util/cuda/moderngpu/cpp11.hxx - util/cuda/moderngpu/cta_load_balance.hxx - util/cuda/moderngpu/cta_merge.hxx - util/cuda/moderngpu/cta_mergesort.hxx - util/cuda/moderngpu/cta_reduce.hxx - util/cuda/moderngpu/cta_scan.hxx - util/cuda/moderngpu/cta_search.hxx - util/cuda/moderngpu/cta_segscan.hxx - util/cuda/moderngpu/intrinsics.hxx - util/cuda/moderngpu/kernel_bulkinsert.hxx - util/cuda/moderngpu/kernel_bulkremove.hxx - util/cuda/moderngpu/kernel_compact.hxx - util/cuda/moderngpu/kernel_intervalmove.hxx - util/cuda/moderngpu/kernel_join.hxx - util/cuda/moderngpu/kernel_load_balance.hxx - util/cuda/moderngpu/kernel_merge.hxx - util/cuda/moderngpu/kernel_mergesort.hxx - util/cuda/moderngpu/kernel_reduce.hxx - util/cuda/moderngpu/kernel_scan.hxx - util/cuda/moderngpu/kernel_segreduce.hxx - util/cuda/moderngpu/kernel_segsort.hxx - util/cuda/moderngpu/kernel_sortedsearch.hxx - util/cuda/moderngpu/kernel_workcreate.hxx - util/cuda/moderngpu/launch_box.hxx - util/cuda/moderngpu/launch_params.hxx - util/cuda/moderngpu/loadstore.hxx - util/cuda/moderngpu/memory.hxx - util/cuda/moderngpu/meta.hxx - util/cuda/moderngpu/operators.hxx - util/cuda/moderngpu/search.hxx - util/cuda/moderngpu/sort_networks.hxx - util/cuda/moderngpu/transform.hxx - util/cuda/moderngpu/tuple.hxx - util/cuda/moderngpu/types.hxx - util/cuda/moderngpu/util.hxx - DESTINATION openfpm_data/include/util/cuda/moderngpu/ - COMPONENT OpenFPM) - #if(BUILD_TESTING) # add_executable(particle_test test.cu) diff --git a/src/Grid/cuda/cuda_grid_gpu_int.cu b/src/Grid/cuda/cuda_grid_gpu_int.cu index aaab8292..74976b8e 100644 --- a/src/Grid/cuda/cuda_grid_gpu_int.cu +++ b/src/Grid/cuda/cuda_grid_gpu_int.cu @@ -172,7 +172,7 @@ BOOST_AUTO_TEST_CASE (gpu_p2m) size_t g_m = pos.size(); - mgpu::ofp_context_t context(false); + gpu::ofp_context_t context(false); const size_t (& sz)[3] = grid.getGrid().getSize(); diff --git a/src/NN/CellList/CellList_gpu_test.cu b/src/NN/CellList/CellList_gpu_test.cu index 2d19dd2a..d6cd4844 100644 --- a/src/NN/CellList/CellList_gpu_test.cu +++ b/src/NN/CellList/CellList_gpu_test.cu @@ -464,7 +464,7 @@ void test_cell_count_n() CUDA_LAUNCH_DIM3(construct_cells,1,1,vs.toKernel(),gs); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; vs.flush<sadd_<0>>(ctx,flush_type::FLUSH_ON_DEVICE); @@ -786,8 +786,8 @@ template<unsigned int dim, typename T, typename CellS> void Test_cell_gpu(SpaceB pl.template hostToDevice<0>(); pl_prp.template hostToDevice<0,1,2>(); - // create an mgpu context - mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props); + // create an gpu context + gpu::ofp_context_t context(gpu::gpu_context_opt::no_print_props); cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context); // Check @@ -1358,7 +1358,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di size_t g_m = pl.size() / 2; - mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props); + gpu::ofp_context_t context(gpu::gpu_context_opt::no_print_props); cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m); auto & s_t_ns = 
cl2.getSortToNonSort(); @@ -1564,7 +1564,7 @@ void Test_cell_gpu_force_split(SpaceBox<dim,T> & box, size_t npart, const size_t size_t g_m = pl.size() / 2; - mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props); + gpu::ofp_context_t context(gpu::gpu_context_opt::no_print_props); cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size()/2); cl2_split2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,pl.size()/2,pl.size()); auto & s_t_ns_s1 = cl2_split1.getSortToNonSort(); @@ -1809,7 +1809,7 @@ BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_split) size_t g_m = pl.size() / 2; - mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props); + gpu::ofp_context_t context(gpu::gpu_context_opt::no_print_props); cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size()/2); cl2_split2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,pl.size()/2,pl.size()); @@ -2013,7 +2013,7 @@ BOOST_AUTO_TEST_CASE( CellList_use_cpu_offload_test ) openfpm::vector_gpu<aggregate<int>> os_scan; os_scan.resize(v.size()); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; openfpm::scan((int *)os.template getDeviceBuffer<0>(),os.size(),(int *)os_scan.template getDeviceBuffer<0>(),ctx); os_scan.deviceToHost<0>(); @@ -2089,7 +2089,7 @@ BOOST_AUTO_TEST_CASE( CellList_swap_test ) size_t g_m = pl.size() / 2; - mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props); + gpu::ofp_context_t context(gpu::gpu_context_opt::no_print_props); cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m); cl4.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m); diff --git a/src/NN/CellList/CellList_util.hpp b/src/NN/CellList/CellList_util.hpp index 63caffbf..9f63cb0c 100644 --- a/src/NN/CellList/CellList_util.hpp +++ b/src/NN/CellList/CellList_util.hpp @@ -17,7 +17,7 @@ enum cl_construct_opt Only_reorder }; -#include "util/cuda/ofp_context.hxx" +#include "util/ofp_context.hpp" /*! \brief populate the Cell-list with particles non symmetric case on GPU @@ -48,7 +48,7 @@ struct populate_cell_list_no_sym_impl openfpm::vector<prop,Memory,layout_base > & v_prp, openfpm::vector<prop,Memory,layout_base > & v_prp_out, CellList & cli, - mgpu::ofp_context_t & context, + gpu::ofp_context_t & context, size_t g_m, cl_construct_opt optc) { @@ -70,7 +70,7 @@ struct populate_cell_list_no_sym_impl<true> openfpm::vector<prop,Memory,layout_base > & v_prp, openfpm::vector<prop,Memory,layout_base > & v_prp_out, CellList & cli, - mgpu::ofp_context_t & context, + gpu::ofp_context_t & context, size_t g_m, cl_construct_opt optc) { @@ -138,12 +138,12 @@ void populate_cell_list_no_sym(openfpm::vector<Point<dim,T>,Memory,layout_base > openfpm::vector<prop,Memory,layout_base > & v_prp, openfpm::vector<prop,Memory,layout_base > & v_prp_out, CellList & cli, - mgpu::ofp_context_t & mgpu, + gpu::ofp_context_t & gpu, size_t g_m, cl_construct_opt optc) { populate_cell_list_no_sym_impl<is_gpu_celllist<CellList>::value> - ::template populate<dim,T,prop,Memory,layout_base,CellList, prp ...>(pos,v_pos_out,v_prp,v_prp_out,cli,mgpu,g_m,optc); + ::template populate<dim,T,prop,Memory,layout_base,CellList, prp ...>(pos,v_pos_out,v_prp,v_prp_out,cli,gpu,g_m,optc); } /*! 
\brief populate the Cell-list with particles symmetric case @@ -189,7 +189,7 @@ void populate_cell_list(openfpm::vector<Point<dim,T>,Memory,layout_base> & pos, openfpm::vector<prop,Memory,layout_base > & v_prp, openfpm::vector<prop,Memory,layout_base > & v_prp_out, CellList & cli, - mgpu::ofp_context_t & context, + gpu::ofp_context_t & context, size_t g_m, size_t opt, cl_construct_opt optc) @@ -222,7 +222,7 @@ template<unsigned int dim, unsigned int ... prp> void populate_cell_list(openfpm::vector<Point<dim,T>,Memory,layout_base> & pos, CellList & cli, - mgpu::ofp_context_t & context, + gpu::ofp_context_t & context, size_t g_m, size_t opt, cl_construct_opt optc) diff --git a/src/NN/CellList/cuda/CellList_gpu.hpp b/src/NN/CellList/cuda/CellList_gpu.hpp index 5ee0e2b0..886a7f8a 100644 --- a/src/NN/CellList/cuda/CellList_gpu.hpp +++ b/src/NN/CellList/cuda/CellList_gpu.hpp @@ -237,7 +237,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> vector & pl_out, vector_prp & pl_prp, vector_prp & pl_prp_out, - mgpu::ofp_context_t & mgpuContext, + gpu::ofp_context_t & gpuContext, size_t g_m, size_t start, size_t stop, @@ -276,7 +276,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> cl_sparse.template setBackground<0>((cnt_type)-1); cl_sparse.setGPUInsertBuffer(ite_gpu.wthr.x,ite_gpu.thr.x); CUDA_LAUNCH((fill_cells_sparse),ite_gpu,cl_sparse.toKernel(),starts.toKernel()); - cl_sparse.template flush_vd<sstart_<0>>(cells,mgpuContext,FLUSH_ON_DEVICE); + cl_sparse.template flush_vd<sstart_<0>>(cells,gpuContext,FLUSH_ON_DEVICE); cells_nn.resize(cl_sparse.size()+1); cells_nn.template fill<0>(0); @@ -286,7 +286,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> CUDA_LAUNCH((count_nn_cells),itgg,cl_sparse.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel()); // now we scan - openfpm::scan((cnt_type *)cells_nn.template getDeviceBuffer<0>(), cells_nn.size(), (cnt_type *)cells_nn.template getDeviceBuffer<0>() , mgpuContext); + openfpm::scan((cnt_type *)cells_nn.template getDeviceBuffer<0>(), cells_nn.size(), (cnt_type *)cells_nn.template getDeviceBuffer<0>() , gpuContext); cells_nn.template deviceToHost<0>(cells_nn.size() - 1, cells_nn.size() - 1); size_t n_nn_cells = cells_nn.template get<0>(cells_nn.size() - 1); @@ -316,7 +316,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> if (opt == cl_construct_opt::Full) { - construct_domain_ids(mgpuContext,start,stop,g_m); + construct_domain_ids(gpuContext,start,stop,g_m); } #else @@ -328,10 +328,10 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> /*! 
\brief Construct the ids of the particles domain in the sorted array * - * \param mgpuContext mgpu context + * \param gpuContext gpu context * */ - void construct_domain_ids(mgpu::ofp_context_t & mgpuContext, size_t start, size_t stop, size_t g_m) + void construct_domain_ids(gpu::ofp_context_t & gpuContext, size_t start, size_t stop, size_t g_m) { #ifdef __NVCC__ sorted_domain_particles_dg.resize(stop-start+1); @@ -341,7 +341,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> CUDA_LAUNCH((mark_domain_particles),ite,sorted_to_not_sorted.toKernel(),sorted_domain_particles_dg.toKernel(),g_m); // lets scan - openfpm::scan((unsigned int *)sorted_domain_particles_dg.template getDeviceBuffer<0>(),sorted_domain_particles_dg.size(),(unsigned int *)sorted_domain_particles_dg.template getDeviceBuffer<0>(),mgpuContext); + openfpm::scan((unsigned int *)sorted_domain_particles_dg.template getDeviceBuffer<0>(),sorted_domain_particles_dg.size(),(unsigned int *)sorted_domain_particles_dg.template getDeviceBuffer<0>(),gpuContext); sorted_domain_particles_dg.template deviceToHost<0>(sorted_domain_particles_dg.size()-1,sorted_domain_particles_dg.size()-1); auto sz = sorted_domain_particles_dg.template get<0>(sorted_domain_particles_dg.size()-1); @@ -361,7 +361,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> vector & pl_out, vector_prp & pl_prp, vector_prp & pl_prp_out, - mgpu::ofp_context_t & mgpuContext, + gpu::ofp_context_t & gpuContext, size_t g_m, size_t start, size_t stop, @@ -398,7 +398,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> // now we scan starts.resize(cl_n.size()); - openfpm::scan((cnt_type *)cl_n.template getDeviceBuffer<0>(), cl_n.size(), (cnt_type *)starts.template getDeviceBuffer<0>() , mgpuContext); + openfpm::scan((cnt_type *)cl_n.template getDeviceBuffer<0>(), cl_n.size(), (cnt_type *)starts.template getDeviceBuffer<0>() , gpuContext); // now we construct the cells @@ -414,7 +414,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> // sort - mgpu::mergesort(static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()),pl.size(),mgpu::less_t<cnt_type>(),mgpuContext); + gpu::mergesort(static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()),pl.size(),gpu::less_t<cnt_type>(),gpuContext); #else @@ -470,7 +470,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform> if (opt == cl_construct_opt::Full) { - construct_domain_ids(mgpuContext,start,stop,g_m); + construct_domain_ids(gpuContext,start,stop,g_m); } #else @@ -630,7 +630,7 @@ public: vector & pl_out, vector_prp & pl_prp, vector_prp & pl_prp_out, - mgpu::ofp_context_t & mgpuContext, + gpu::ofp_context_t & gpuContext, size_t g_m = 0, size_t start = 0, size_t stop = (size_t)-1, @@ -640,8 +640,8 @@ public: if (stop == (size_t)-1) {stop = pl.size();} - if (is_sparse == false) {construct_dense<vector,vector_prp,prp...>(pl,pl_out,pl_prp,pl_prp_out,mgpuContext,g_m,start,stop,opt);} - else {construct_sparse<vector,vector_prp,prp...>(pl,pl_out,pl_prp,pl_prp_out,mgpuContext,g_m,start,stop,opt);} + if (is_sparse == false) {construct_dense<vector,vector_prp,prp...>(pl,pl_out,pl_prp,pl_prp_out,gpuContext,g_m,start,stop,opt);} + else {construct_sparse<vector,vector_prp,prp...>(pl,pl_out,pl_prp,pl_prp_out,gpuContext,g_m,start,stop,opt);} } CellList_gpu_ker<dim,T,cnt_type,ids_type,transform,is_sparse> toKernel() diff --git 
a/src/NN/VerletList/VerletListFast.hpp b/src/NN/VerletList/VerletListFast.hpp index c6869025..0e74faab 100644 --- a/src/NN/VerletList/VerletListFast.hpp +++ b/src/NN/VerletList/VerletListFast.hpp @@ -322,7 +322,7 @@ private: */ void initCl(CellListImpl & cli, vector_pos_type & pos, size_t g_m, size_t opt) { - mgpu::ofp_context_t context(mgpu::gpu_context_opt::dummy); + gpu::ofp_context_t context(gpu::gpu_context_opt::dummy); if (opt & VL_SYMMETRIC || opt & VL_CRS_SYMMETRIC) {populate_cell_list(pos,cli,context,g_m,CL_SYMMETRIC,cl_construct_opt::Full);} else diff --git a/src/SparseGridGpu/BlockMapGpu.hpp b/src/SparseGridGpu/BlockMapGpu.hpp index b8c3a65b..9de735da 100644 --- a/src/SparseGridGpu/BlockMapGpu.hpp +++ b/src/SparseGridGpu/BlockMapGpu.hpp @@ -287,7 +287,7 @@ public: } template<typename ... v_reduce> - void flush(mgpu::ofp_context_t &context, flush_type opt = FLUSH_ON_HOST) + void flush(gpu::ofp_context_t &context, flush_type opt = FLUSH_ON_HOST) { #ifdef SE_CLASS1 diff --git a/src/SparseGridGpu/BlockMapGpu_kernels.cuh b/src/SparseGridGpu/BlockMapGpu_kernels.cuh index 525cabde..dd0a8174 100644 --- a/src/SparseGridGpu/BlockMapGpu_kernels.cuh +++ b/src/SparseGridGpu/BlockMapGpu_kernels.cuh @@ -584,7 +584,7 @@ struct sparse_vector_reduction_solve_conflict vector_segolddata_type & segments_oldData; //! gpu context - mgpu::ofp_context_t & context; + gpu::ofp_context_t & context; /*! \brief constructor * @@ -600,7 +600,7 @@ struct sparse_vector_reduction_solve_conflict vector_segoffset_type & segment_offset, vector_outmap_type & out_map, vector_segolddata_type & segments_oldData, - mgpu::ofp_context_t & context) + gpu::ofp_context_t & context) :vector_data_red(vector_data_red), vector_data(vector_data), vector_data_unsorted(vector_data_unsorted), @@ -701,7 +701,7 @@ namespace BlockMapGpuFunctors bool solve_conflicts(vector_index_type &keys, vector_index_type &mergeIndices, vector_index_type2 &segments_new, vector_index_type &data_map, vector_data_type &dataOld, vector_data_type &dataNew, vector_index_type &keysOut, vector_data_type &dataOut, - mgpu::ofp_context_t & context) + gpu::ofp_context_t & context) { #ifdef __NVCC__ typedef ValueTypeOf<vector_data_type> AggregateT; diff --git a/src/SparseGridGpu/SparseGridGpu.hpp b/src/SparseGridGpu/SparseGridGpu.hpp index a315f7f7..b872a746 100644 --- a/src/SparseGridGpu/SparseGridGpu.hpp +++ b/src/SparseGridGpu/SparseGridGpu.hpp @@ -770,7 +770,7 @@ public: } template<typename ... v_reduce> - void flush(mgpu::ofp_context_t &context, flush_type opt = FLUSH_ON_HOST) + void flush(gpu::ofp_context_t &context, flush_type opt = FLUSH_ON_HOST) { BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base> ::template flush<v_reduce ...>(context, opt); @@ -1094,7 +1094,7 @@ private: } template<typename MemType, unsigned int ... prp> - void preUnpack(ExtPreAlloc<MemType> * prAlloc_prp, mgpu::ofp_context_t & ctx, int opt) + void preUnpack(ExtPreAlloc<MemType> * prAlloc_prp, gpu::ofp_context_t & ctx, int opt) { if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false) { @@ -1114,14 +1114,14 @@ private: template<unsigned int ... prp> - void removeCopyToFinalize_phase1(mgpu::ofp_context_t & ctx, int opt) + void removeCopyToFinalize_phase1(gpu::ofp_context_t & ctx, int opt) { if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false) {removePoints(ctx);} } template<unsigned int ... 
prp> - void removeCopyToFinalize_phase2(mgpu::ofp_context_t & ctx, int opt) + void removeCopyToFinalize_phase2(gpu::ofp_context_t & ctx, int opt) { // Pack information Pack_stat sts; @@ -1173,7 +1173,7 @@ private: } template<unsigned int ... prp> - void removeCopyToFinalize_phase3(mgpu::ofp_context_t & ctx, int opt, bool is_unpack_remote) + void removeCopyToFinalize_phase3(gpu::ofp_context_t & ctx, int opt, bool is_unpack_remote) { ite_gpu<1> ite; @@ -1417,7 +1417,7 @@ private: void addAndConvertPackedChunkToTmp(ExtPreAlloc<S2> & mem, SparseGridGpu_iterator_sub<dim,self> & sub_it, Unpack_stat & ps, - mgpu::ofp_context_t &context) + gpu::ofp_context_t &context) { sparsegridgpu_pack_request<AggregateT,prp ...> spq; boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq); @@ -2019,7 +2019,7 @@ public: * \param grid_dw grid level down * */ - void construct_link(self & grid_up, self & grid_dw, mgpu::ofp_context_t &context) + void construct_link(self & grid_up, self & grid_dw, gpu::ofp_context_t &context) { /* // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers! auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer(); @@ -2106,7 +2106,7 @@ public: * \param gpu context * */ - void construct_link_dw(self & grid_dw, const Box<dim,int> & db_, Point<dim,int> p_dw, mgpu::ofp_context_t &context) + void construct_link_dw(self & grid_dw, const Box<dim,int> & db_, Point<dim,int> p_dw, gpu::ofp_context_t &context) { Box<dim,int> db = db_; @@ -2184,7 +2184,7 @@ public: * \praram grid_up grid level up * */ - void construct_link_up(self & grid_up, const Box<dim,int> & db_, Point<dim,int> p_up, mgpu::ofp_context_t &context) + void construct_link_up(self & grid_up, const Box<dim,int> & db_, Point<dim,int> p_up, gpu::ofp_context_t &context) { Box<dim,int> db = db_; @@ -2284,7 +2284,7 @@ public: } template<typename stencil_type = NNStar<dim>, typename checker_type = No_check> - void tagBoundaries(mgpu::ofp_context_t &context, checker_type chk = checker_type(), tag_boundaries opt = tag_boundaries::NO_CALCULATE_EXISTING_POINTS) + void tagBoundaries(gpu::ofp_context_t &context, checker_type chk = checker_type(), tag_boundaries opt = tag_boundaries::NO_CALCULATE_EXISTING_POINTS) { // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers! auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer(); @@ -2977,7 +2977,7 @@ public: * */ template<int ... prp> inline - void packRequest(size_t & req, mgpu::ofp_context_t &context) const + void packRequest(size_t & req, gpu::ofp_context_t &context) const { ite_gpu<1> ite; @@ -3070,7 +3070,7 @@ public: * */ template<int ... prp> inline - void packCalculate(size_t & req, mgpu::ofp_context_t &context) + void packCalculate(size_t & req, gpu::ofp_context_t &context) { ite_gpu<1> ite; pack_subs.template hostToDevice<0,1>(); @@ -3308,7 +3308,7 @@ public: * */ template<unsigned int ... 
prp> - void removeCopyToFinalize(mgpu::ofp_context_t & ctx, int opt) + void removeCopyToFinalize(gpu::ofp_context_t & ctx, int opt) { if ((opt & 0x3) == rem_copy_opt::PHASE1) { @@ -3410,7 +3410,7 @@ public: * \param context modern gpu context * */ - void removePoints(mgpu::ofp_context_t& context) + void removePoints(gpu::ofp_context_t& context) { auto & indexBuffer = private_get_index_array(); auto & dataBuffer = private_get_data_array(); @@ -3485,7 +3485,7 @@ public: * */ template<unsigned int ... prp> - void removeAddUnpackFinalize(mgpu::ofp_context_t& context, int opt) + void removeAddUnpackFinalize(gpu::ofp_context_t& context, int opt) { if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false) {removePoints(context);} @@ -3593,7 +3593,7 @@ public: header_type & headers, int ih, Unpack_stat & ps, - mgpu::ofp_context_t &context, + gpu::ofp_context_t &context, rem_copy_opt opt = rem_copy_opt::NONE_OPT) { //////////////////////////////////////////////////////////// @@ -3661,7 +3661,7 @@ public: void unpack(ExtPreAlloc<S2> & mem, SparseGridGpu_iterator_sub<dim,self> & sub_it, Unpack_stat & ps, - mgpu::ofp_context_t &context, + gpu::ofp_context_t &context, rem_copy_opt opt = rem_copy_opt::NONE_OPT) { //////////////////////////////////////////////////////////// diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_get_nn.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_get_nn.cu index 768f9ea8..d673edd4 100644 --- a/src/SparseGridGpu/performance/SparseGridGpu_performance_get_nn.cu +++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_get_nn.cu @@ -43,7 +43,7 @@ void testGetNeighbourhood(std::string testURI, unsigned int i) dim3 blockSizeBlockedInsert(1, 1); grid_smb<dim, blockEdgeSize> blockGeometry(gridSize); SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize> sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // Now fill the grid once diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_get_single.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_get_single.cu index a2db1e64..20104776 100644 --- a/src/SparseGridGpu/performance/SparseGridGpu_performance_get_single.cu +++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_get_single.cu @@ -44,7 +44,7 @@ void testGetSingle(std::string testURI, unsigned int i) dim3 blockSizeBlockedInsert(1, 1); grid_smb<dim, blockEdgeSize> blockGeometry(gridSize); SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize> sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // Now fill the grid once diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil.cu index d3d5164d..86105e05 100644 --- a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil.cu +++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil.cu @@ -41,7 +41,7 @@ void testStencilHeat_perf(unsigned int i, std::string base) dim3 blockSize(SparseGridZ::blockEdgeSize_,SparseGridZ::blockEdgeSize_); typename SparseGridZ::grid_info blockGeometry(gridSize); SparseGridZ sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_*gridEdgeSize*SparseGridZ::blockEdgeSize_; diff --git 
a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_3d.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_3d.cu index 73eed9db..8431728c 100644 --- a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_3d.cu +++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_3d.cu @@ -42,7 +42,7 @@ void testStencilHeat3D_perf(unsigned int i, std::string base) typename SparseGridZ::grid_info blockGeometry(gridSize); SparseGridZ sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_ @@ -166,7 +166,7 @@ void testStencilHeat3DSparse_perf(unsigned int i, std::string base, float fillMu size_t sz[3] = {spatialEdgeSize, spatialEdgeSize, spatialEdgeSize}; typename SparseGridZ::grid_info blockGeometry(sz); SparseGridZ sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); ///// Insert sparse content, a set of concentric spheres ///// diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu index abec6ef5..901a3439 100644 --- a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu +++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu @@ -45,7 +45,7 @@ void testStencilHeatSparse_perf(unsigned int i, std::string base, float fillMult size_t sz[2] = {spatialEdgeSize, spatialEdgeSize}; typename SparseGridZ::grid_info blockGeometry(sz); SparseGridZ sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); ///// Insert sparse content, a set of concentric spheres ///// diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_block.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_block.cu index 40270931..0e281e71 100644 --- a/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_block.cu +++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_block.cu @@ -46,7 +46,7 @@ void test_insert_block(std::string testURI, unsigned int i) dim3 blockSizeBlockedInsert(1, 1); grid_smb<dim, blockEdgeSize> blockGeometry(gridSize); SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize> sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // Warmup diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_single.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_single.cu index afacc2d3..aeb58b1b 100644 --- a/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_single.cu +++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_single.cu @@ -43,7 +43,7 @@ void testInsertSingle(std::string testURI, unsigned int i) dim3 blockSize(blockEdgeSize, blockEdgeSize); grid_smb<dim, blockEdgeSize> blockGeometry(gridSize); SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize> sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); if (prePopulateGrid) diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_tests.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_tests.cu index af6dcf8e..7f432300 100644 --- 
a/src/SparseGridGpu/performance/SparseGridGpu_performance_tests.cu +++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_tests.cu @@ -56,7 +56,7 @@ void testStencilHeatGet_perf(unsigned int i, std::string base) dim3 blockSize(SparseGridZ::blockEdgeSize_,SparseGridZ::blockEdgeSize_); typename SparseGridZ::grid_info blockGeometry(gridSize); SparseGridZ sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_*gridEdgeSize*SparseGridZ::blockEdgeSize_; @@ -180,7 +180,7 @@ void testStencilSkeleton_perf(unsigned int i, std::string base) dim3 blockSize(SparseGridZ::blockEdgeSize_,SparseGridZ::blockEdgeSize_); typename SparseGridZ::grid_info blockGeometry(gridSize); SparseGridZ sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_*gridEdgeSize*SparseGridZ::blockEdgeSize_; diff --git a/src/SparseGridGpu/tests/BlockMapGpu_kernels_tests.cu b/src/SparseGridGpu/tests/BlockMapGpu_kernels_tests.cu index f94cc157..037a270b 100644 --- a/src/SparseGridGpu/tests/BlockMapGpu_kernels_tests.cu +++ b/src/SparseGridGpu/tests/BlockMapGpu_kernels_tests.cu @@ -128,7 +128,7 @@ BOOST_AUTO_TEST_CASE(testSegreduce_total) openfpm::vector_gpu<aggregate<MaskBlockT, BlockT>> outputData; outputData.resize(100); - CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<BLOCK, 0, BITMASK, 2, mgpu::plus_t<ScalarT>>),segments.size()-1, 2*BlockT::size, + CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<BLOCK, 0, BITMASK, 2, gpu::plus_t<ScalarT>>),segments.size()-1, 2*BlockT::size, data_new.toKernel(), data_old.toKernel(), segments.toKernel(), @@ -138,7 +138,7 @@ BOOST_AUTO_TEST_CASE(testSegreduce_total) outputData.toKernel()); // Segreduce on mask - CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<BITMASK, 0, BITMASK, 2, mgpu::maximum_t<unsigned char>>),segments.size()-1, 2*BlockT::size, + CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<BITMASK, 0, BITMASK, 2, gpu::maximum_t<unsigned char>>),segments.size()-1, 2*BlockT::size, data_new.toKernel(), data_old.toKernel(), segments.toKernel(), @@ -258,7 +258,7 @@ BOOST_AUTO_TEST_CASE(test_maps_create) CUDA_LAUNCH(BlockMapGpuKernels::compute_predicate,ite,merge_keys.toKernel(),merge_indexes.toKernel(),9,p_ids.toKernel()); - mgpu::ofp_context_t context; + gpu::ofp_context_t context; openfpm::scan((int *)p_ids.template getDeviceBuffer<0>(), s_ids.size(), (int *)s_ids.template getDeviceBuffer<0>(), @@ -350,7 +350,7 @@ BOOST_AUTO_TEST_CASE (testSolve_conflicts) openfpm::vector_gpu<aggregate<unsigned int>> keys, mergeIndices, tmpIndices, keysOut, trivial_map; openfpm::vector_gpu<aggregate<unsigned int,unsigned int>> segments_new; openfpm::vector_gpu<aggregate<BlockT, MaskBlockT>> dataOld, dataNew, tmpData, dataOut; - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; // Keys keys.resize(14); diff --git a/src/SparseGridGpu/tests/BlockMapGpu_tests.cu b/src/SparseGridGpu/tests/BlockMapGpu_tests.cu index 0628108c..fb16d290 100644 --- a/src/SparseGridGpu/tests/BlockMapGpu_tests.cu +++ b/src/SparseGridGpu/tests/BlockMapGpu_tests.cu @@ -143,7 +143,7 @@ BOOST_AUTO_TEST_CASE(testInsert) CUDA_LAUNCH_DIM3((insertValues<0>), gridSize, blockSizeInsert ,blockMap.toKernel()); // Flush inserts - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; blockMap.flush<smax_<0>>(ctx, 
flush_type::FLUSH_ON_DEVICE); // Get output @@ -188,7 +188,7 @@ BOOST_AUTO_TEST_CASE(testInsert_halfBlock) CUDA_LAUNCH_DIM3((insertValuesHalfBlock<0>), gridSize, blockSizeInsert, blockMap.toKernel()); // Flush inserts - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; blockMap.flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE); // Get output @@ -239,7 +239,7 @@ BOOST_AUTO_TEST_CASE(testInsert_blocked) CUDA_LAUNCH_DIM3((insertValuesBlocked<0, 2>), gridSize, blockSizeInsert,sparseGrid.toKernel()); // Flush inserts - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE); // Get output diff --git a/src/SparseGridGpu/tests/SparseGridGpu_tests.cu b/src/SparseGridGpu/tests/SparseGridGpu_tests.cu index efe86503..4b0e5fc4 100644 --- a/src/SparseGridGpu/tests/SparseGridGpu_tests.cu +++ b/src/SparseGridGpu/tests/SparseGridGpu_tests.cu @@ -225,7 +225,7 @@ BOOST_AUTO_TEST_CASE(testInsert) CUDA_LAUNCH_DIM3((insertValues<0>),gridSize, blockSizeInsert,sparseGrid.toKernel()); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE); sparseGrid.template deviceToHost<0>(); @@ -263,7 +263,7 @@ BOOST_AUTO_TEST_CASE(testInsert3D) CUDA_LAUNCH_DIM3((insertValues<0>),gridSize, blockSizeInsert,sparseGrid.toKernel()); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE); sparseGrid.template deviceToHost<0>(); @@ -295,7 +295,7 @@ BOOST_AUTO_TEST_CASE(testTagBoundaries) SparseGridGpu<dim, AggregateT, blockEdgeSize, 64> sparseGrid(blockGeometry); sparseGrid.template setBackgroundValue<0>(666); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.setGPUInsertBuffer(gridSize, blockSizeInsert); dim3 pt1(0, 0, 0); @@ -382,7 +382,7 @@ BOOST_AUTO_TEST_CASE(testTagBoundaries2) SparseGridGpu<dim, AggregateT, blockEdgeSize, 64> sparseGrid(blockGeometry); sparseGrid.template setBackgroundValue<0>(666); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; /////// { @@ -480,7 +480,7 @@ BOOST_AUTO_TEST_CASE(testStencilHeat) grid_smb<dim, blockEdgeSize> blockGeometry(gridSize); SparseGridGpu<dim, AggregateT, blockEdgeSize, 64> sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // Insert values on the grid @@ -529,7 +529,7 @@ BOOST_AUTO_TEST_CASE(testStencil_lap_simplified) grid_smb<dim, blockEdgeSize> blockGeometry(gridSize); SparseGridGpu<dim, AggregateT, blockEdgeSize, 64> sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // Insert values on the grid @@ -586,7 +586,7 @@ BOOST_AUTO_TEST_CASE(testStencil_lap_no_cross_simplified) grid_smb<dim, blockEdgeSize> blockGeometry(gridSize); SparseGridGpu<dim, AggregateT, blockEdgeSize, 64> sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // Insert values on the grid @@ -662,7 +662,7 @@ BOOST_AUTO_TEST_CASE(testStencil_lap_no_cross_simplified2) grid_smb<dim, blockEdgeSize> blockGeometry(gridSize); SparseGridGpu<dim, AggregateT, blockEdgeSize, 64> sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // Insert values on the grid @@ -751,7 +751,7 @@ BOOST_AUTO_TEST_CASE(testStencil_lap_no_cross_simplified_subset) grid_smb<dim, blockEdgeSize> blockGeometry(gridSize); SparseGridGpu<dim, AggregateT, 
blockEdgeSize, 64> sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // Insert values on the grid @@ -809,7 +809,7 @@ BOOST_AUTO_TEST_CASE(testFlushInsert) size_t sz[] = {137,100,57}; SparseGridGpu<dim, AggregateT, blockEdgeSize, 64> sparseGrid(sz); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); sparseGrid.insertFlush<0>(grid_key_dx<3>({3,6,7})) = 2.0; @@ -1015,7 +1015,7 @@ void test_convolution_3x3x3() size_t sz[] = {1000,1000,1000}; SparseGridZ sparseGrid(sz); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // now create 3 3D sphere @@ -1082,7 +1082,7 @@ void test_convolution_3x3x3_no_shared() size_t sz[] = {1000,1000,1000}; SparseGridZ sparseGrid(sz); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // now create 3 3D sphere @@ -1187,7 +1187,7 @@ BOOST_AUTO_TEST_CASE(test_sparse_grid_iterator_sub_host) grid_smb<dim, blockEdgeSize> blockGeometry(sz); SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); ///// Insert sparse content, a set of 3 hollow spheres ///// @@ -1242,7 +1242,7 @@ BOOST_AUTO_TEST_CASE(test_sparse_grid_iterator_host) grid_smb<dim, blockEdgeSize> blockGeometry(sz); SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); ///// Insert sparse content, a set of 3 hollow spheres ///// @@ -1291,7 +1291,7 @@ BOOST_AUTO_TEST_CASE(test_pack_request) typedef SparseGridGpu<dim, aggregate<float>, blockEdgeSize, 64, long int> SparseGridZ; SparseGridZ sparseGrid(sz); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // now create a 3D sphere @@ -1331,7 +1331,7 @@ BOOST_AUTO_TEST_CASE(test_MergeIndexMap) typedef SparseGridGpu<dim, aggregate<float>, blockEdgeSize, 64, long int> SparseGridZ; SparseGridZ sparseGrid(sz); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // now create a 3D sphere @@ -1389,7 +1389,7 @@ BOOST_AUTO_TEST_CASE(test_pack_request_with_iterator) typedef SparseGridGpu<dim, aggregate<float>, blockEdgeSize, 64, long int> SparseGridZ; SparseGridZ sparseGrid(sz); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // now create a 3D sphere @@ -1482,7 +1482,7 @@ BOOST_AUTO_TEST_CASE(sparsegridgpu_remove_test) typedef SparseGridGpu<dim, aggregate<float>, blockEdgeSize, 64, long int> SparseGridZ; SparseGridZ sparseGrid(sz); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // now create a 3D sphere @@ -1555,7 +1555,7 @@ void pack_unpack_test(SG_type & sparseGridDst, SG_type & sparseGridSrc, Box<3,size_t> & box2_dst, Box<3,size_t> & box3_dst, Box<3,size_t> & box4_dst, - mgpu::ofp_context_t & ctx, + gpu::ofp_context_t & ctx, bool test_pack) { Box<3,size_t> box1_src({256,256,256},{273,390,390}); @@ -1704,7 +1704,7 @@ BOOST_AUTO_TEST_CASE(sparsegridgpu_pack_unpack) SparseGridZ sparseGridSrc(sz); SparseGridZ sparseGridDst(sz); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGridSrc.template setBackgroundValue<0>(0); sparseGridDst.template setBackgroundValue<0>(0); @@ -1947,7 +1947,7 @@ 
BOOST_AUTO_TEST_CASE(testSparseGridGpuOutput3DHeatStencil) grid_smb<dim, blockEdgeSize> blockGeometry(sz); SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); ///// Insert sparse content, a set of 3 hollow spheres ///// @@ -2021,7 +2021,7 @@ BOOST_AUTO_TEST_CASE(testSparseGridGpuOutput) grid_smb<dim, blockEdgeSize> blockGeometry(sz); SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); grid_key_dx<2,int> start({500000,500000}); @@ -2051,7 +2051,7 @@ BOOST_AUTO_TEST_CASE(testSparseGridGpuOutput3D) grid_smb<dim, blockEdgeSize> blockGeometry(sz); SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); grid_key_dx<3,int> start({256,256,256}); diff --git a/src/SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh b/src/SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh index 73d904ce..0f380112 100644 --- a/src/SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh +++ b/src/SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh @@ -498,7 +498,7 @@ void testConv3x3x3_perf(std::string testName) size_t sz[] = {1000,1000,1000}; SparseGridZ sparseGrid(sz); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // now create 3 3D sphere @@ -592,7 +592,7 @@ static void testConv3x3x3_no_shared_perf(std::string testName) size_t sz[] = {1000,1000,1000}; SparseGridZ sparseGrid(sz); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; sparseGrid.template setBackgroundValue<0>(0); // now create 3 3D sphere diff --git a/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu b/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu index 6a9a1537..38d4af70 100644 --- a/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu +++ b/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu @@ -204,7 +204,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu ) // we launch a kernel to insert data CUDA_LAUNCH_DIM3(test_insert_sparse,10,100,vs.toKernel()); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; vs.flush<sadd_<0>,smin_<1>,smax_<2> >(ctx,flush_type::FLUSH_ON_DEVICE); vs.setGPUInsertBuffer(10,1024); @@ -285,7 +285,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_incremental_add ) CUDA_LAUNCH_DIM3(test_insert_sparse2_inc,10,100,vs.toKernel()); CUDA_LAUNCH_DIM3(test_insert_sparse2_inc,10,100,vs.toKernel()); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; vs.flush<sadd_<0>,smin_<1>,smax_<2> >(ctx,flush_type::FLUSH_ON_DEVICE); @@ -352,7 +352,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_get ) CUDA_LAUNCH_DIM3(test_insert_sparse,10,100,vs.toKernel()); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; vs.flush<sadd_<0>,smin_<1>,smax_<2> >(ctx,flush_type::FLUSH_ON_DEVICE); vs.template deviceToHost<0,1,2>(); @@ -475,7 +475,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_special_function ) CUDA_LAUNCH_DIM3(test_insert_sparse2_inc,10,100,vs.toKernel()); CUDA_LAUNCH_DIM3(test_insert_sparse2_inc,10,100,vs.toKernel()); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; vs.flush<sstart_<0>>(ctx,flush_type::FLUSH_ON_DEVICE); @@ -604,7 +604,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_remove ) // we launch a kernel to insert data 
CUDA_LAUNCH_DIM3(test_insert_sparse,10,100,vs.toKernel()); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; vs.flush<sadd_<0>,smin_<1>,smax_<2> >(ctx,flush_type::FLUSH_ON_DEVICE); vs.setGPUInsertBuffer(10,1024); @@ -686,7 +686,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_remove_incremental ) CUDA_LAUNCH_DIM3(test_insert_sparse2_inc,10,100,vs.toKernel()); CUDA_LAUNCH_DIM3(test_insert_sparse2_inc,10,100,vs.toKernel()); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; vs.flush<sadd_<0>,sadd_<1>,sadd_<2>>(ctx,flush_type::FLUSH_ON_DEVICE); diff --git a/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh b/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh index 33b0a68a..0079b452 100644 --- a/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh +++ b/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh @@ -12,19 +12,34 @@ #include "config.h" +#include <limits> + #if CUDART_VERSION < 11000 #include "util/cuda/cub_old/util_type.cuh" #include "util/cuda/cub_old/block/block_scan.cuh" -#include "util/cuda/moderngpu/operators.hxx" #include "util/cuda_launch.hpp" -#else - #if !defined(CUDA_ON_CPU) - #include "util/cuda/moderngpu/operators.hxx" - #endif +#endif + +#if !defined(CUDA_ON_CPU) +#include "util/cudify/cuda/operators.hpp" #endif #endif +template<typename type_t> +struct zero_t { + __device__ __host__ type_t operator()() const { + return 0; + } +}; + +template<typename type_t> +struct limit_max_t { + __device__ __host__ type_t operator()() const { + return std::numeric_limits<type_t>::max(); + } +}; + template<typename type_t> struct rightOperand_t : public std::binary_function<type_t, type_t, type_t> { __device__ __host__ type_t operator()(type_t a, type_t b) const { @@ -93,7 +108,8 @@ struct sadd_ typedef boost::mpl::int_<prp> prop; #ifdef __NVCC__ - template<typename red_type> using op_red = mgpu::plus_t<red_type>; + template<typename red_type> using op_red = gpu::plus_t<red_type>; + template<typename red_type> using op_initial_value = zero_t<red_type>; #endif template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2) @@ -147,6 +163,7 @@ struct sadd_block_ #ifdef __NVCC__ template<typename red_type> using op_red = plus_block_t<red_type, blockLength>; + template<typename red_type> using op_initial_value = zero_t<red_type>; #endif template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2) @@ -176,7 +193,8 @@ struct smax_ typedef boost::mpl::int_<prp> prop; #ifdef __NVCC__ - template<typename red_type> using op_red = mgpu::maximum_t<red_type>; + template<typename red_type> using op_red = gpu::maximum_t<red_type>; + template<typename red_type> using op_initial_value = zero_t<red_type>; #endif template<typename red_type> @@ -200,7 +218,7 @@ struct smax_ template<typename type_t, unsigned int blockLength> struct maximum_block_t : public std::binary_function<type_t, type_t, type_t> { - MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const { + __forceinline__ __device__ __host__ type_t operator()(type_t a, type_t b) const { type_t res; for (int i=0; i<blockLength; ++i) { @@ -219,6 +237,7 @@ struct smax_block_ #ifdef __NVCC__ template<typename red_type> using op_red = maximum_block_t<red_type, blockLength>; + template<typename red_type> using op_initial_value = zero_t<red_type>; #endif template<typename red_type> @@ -251,7 +270,8 @@ struct smin_ typedef boost::mpl::int_<prp> prop; #ifdef __NVCC__ - template<typename red_type> using op_red = mgpu::minimum_t<red_type>; + template<typename red_type> using 
op_red = gpu::minimum_t<red_type>; + template<typename red_type> using op_initial_value = limit_max_t<red_type>; #endif template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2) @@ -274,7 +294,7 @@ struct smin_ template<typename type_t, unsigned int blockLength> struct minimum_block_t : public std::binary_function<type_t, type_t, type_t> { - MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const { + __forceinline__ __device__ __host__ type_t operator()(type_t a, type_t b) const { type_t res; for (int i=0; i<blockLength; ++i) { @@ -293,6 +313,7 @@ struct smin_block_ #ifdef __NVCC__ template<typename red_type> using op_red = minimum_block_t<red_type, blockLength>; + template<typename red_type> using op_initial_value = limit_max_t<red_type>; #endif template<typename red_type> @@ -322,7 +343,7 @@ struct smin_block_ template<typename type_t> struct bitwiseOr_t : public std::binary_function<type_t, type_t, type_t> { - MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const { + __forceinline__ __device__ __host__ type_t operator()(type_t a, type_t b) const { return a|b; } }; @@ -357,7 +378,8 @@ struct sstart_ { typedef boost::mpl::int_<prp> prop; - template<typename red_type> using op_red = mgpu::minimum_t<red_type>; + template<typename red_type> using op_red = gpu::minimum_t<red_type>; + template<typename red_type> using op_initial_value = zero_t<red_type>; template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2) { @@ -382,7 +404,7 @@ struct sstop_ { typedef boost::mpl::int_<prp> prop; - template<typename red_type> using op_red = mgpu::minimum_t<red_type>; + template<typename red_type> using op_red = gpu::minimum_t<red_type>; template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2) { @@ -407,7 +429,7 @@ struct snum_ { typedef boost::mpl::int_<prp> prop; - template<typename red_type> using op_red = mgpu::minimum_t<red_type>; + template<typename red_type> using op_red = gpu::minimum_t<red_type>; template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2) { diff --git a/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu b/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu index 0dd883bc..bbffab38 100644 --- a/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu +++ b/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu @@ -35,7 +35,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_use ) block_insert.template hostToDevice<0>(); block_n.template hostToDevice<0>(); - mgpu::ofp_context_t context; + gpu::ofp_context_t context; openfpm::scan((int *)block_n.template getDeviceBuffer<0>(), block_n.size(), (int *)block_n_scan.template getDeviceBuffer<0>() , context); block_n_scan.template deviceToHost<0>(block_n_scan.size()-1,block_n_scan.size()-1); @@ -105,7 +105,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_use_small_pool ) block_insert.template hostToDevice<0>(); block_n.template hostToDevice<0>(); - mgpu::ofp_context_t context; + gpu::ofp_context_t context; openfpm::scan((int *)block_n.template getDeviceBuffer<0>(), block_n.size(), (int *)block_n_scan.template getDeviceBuffer<0>() , context); block_n_scan.template deviceToHost<0>(block_n_scan.size()-1,block_n_scan.size()-1); @@ -168,7 +168,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_merge_use ) vct_index.resize(vct_add_index.size() + vct_index_old.size()); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; // host to device 
vct_index_old.template hostToDevice<0,1>(); @@ -176,7 +176,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_merge_use ) openfpm::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(), (int *)vct_add_index.template getDeviceBuffer<0>(),(int *)vct_add_index.template getDeviceBuffer<1>(),vct_add_index.size(), - (int *)vct_index.template getDeviceBuffer<0>(),(int *)vct_index.template getDeviceBuffer<1>(),mgpu::less_t<int>(),ctx); + (int *)vct_index.template getDeviceBuffer<0>(),(int *)vct_index.template getDeviceBuffer<1>(),gpu::less_t<int>(),ctx); vct_index.template deviceToHost<0,1>(); @@ -262,7 +262,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_solve_conflicts_use ) vct_index.resize(vct_add_index.size() + vct_index_old.size()); merge_indexes.resize(vct_index.size()); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; // host to device vct_index_old.template hostToDevice<0,1>(); @@ -272,7 +272,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_solve_conflicts_use ) openfpm::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(), (int *)vct_add_index.template getDeviceBuffer<0>(),(int *)vct_add_index.template getDeviceBuffer<1>(),vct_add_index.size(), - (int *)vct_index.template getDeviceBuffer<0>(),(int *)merge_indexes.template getDeviceBuffer<0>(),mgpu::less_t<int>(),ctx); + (int *)vct_index.template getDeviceBuffer<0>(),(int *)merge_indexes.template getDeviceBuffer<0>(),gpu::less_t<int>(),ctx); constexpr int bdim = 128; @@ -378,7 +378,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_realign_use ) vct_data.template hostToDevice<0,1,2>(); vct_tot_out.template hostToDevice<0,2>(); - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; openfpm::scan((int *)vct_tot_out.getDeviceBuffer<0>(),vct_tot_out.size(),(int *)vct_tot_out.getDeviceBuffer<1>(),ctx); vct_tot_out.deviceToHost<0,1>(); diff --git a/src/Vector/map_vector_sparse.hpp b/src/Vector/map_vector_sparse.hpp index c82e41db..3d782ae1 100644 --- a/src/Vector/map_vector_sparse.hpp +++ b/src/Vector/map_vector_sparse.hpp @@ -12,15 +12,11 @@ #include "Vector/map_vector.hpp" #include "Vector/cuda/map_vector_sparse_cuda_ker.cuh" #include "Vector/cuda/map_vector_sparse_cuda_kernels.cuh" -#include "util/cuda/ofp_context.hxx" +#include "util/ofp_context.hpp" #include <iostream> #include <limits> #if defined(__NVCC__) - #if !defined(CUDA_ON_CPU) && !defined(__HIP__) - #include "util/cuda/moderngpu/kernel_segreduce.hxx" - #include "util/cuda/moderngpu/kernel_merge.hxx" - #endif #include "util/cuda/kernels.cuh" #endif @@ -128,7 +124,10 @@ namespace openfpm static void extendSegments(vector_index_type & segments, size_t dataSize) { #ifdef __NVCC__ - // Pass as there is nothing to append for mgpu + // Append trailing element to segment (marks end of last segment) + segments.resize(segments.size()+1); + segments.template get<p>(segments.size() - 1) = dataSize; + segments.template hostToDevice<p>(segments.size() - 1, segments.size() - 1); #else // __NVCC__ std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl; #endif // __NVCC__ @@ -141,23 +140,22 @@ namespace openfpm vector_index_type2 & segment_offset, vector_data_type & vector_data_red, block_functor & blf, - mgpu::ofp_context_t & context) + gpu::ofp_context_t & context) { #ifdef __NVCC__ typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type; typedef 
typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type; typedef typename reduction_type::template op_red<red_type> red_op; typedef typename boost::mpl::at<typename vector_index_type::value_type::type,boost::mpl::int_<0>>::type seg_type; - red_type init; - init = 0; + typename reduction_type::template op_initial_value<red_type> initial_value_functor; assert((std::is_same<seg_type,int>::value == true)); openfpm::segreduce( (red_type *)vector_data.template getDeviceBuffer<reduction_type::prop::value>(), vector_data.size(), - (int *)segment_offset.template getDeviceBuffer<1>(), segment_offset.size(), + (int *)segment_offset.template getDeviceBuffer<1>(), segment_offset.size()-1, (red_type *)vector_data_red.template getDeviceBuffer<reduction_type::prop::value>(), - red_op(), init, context); + red_op(), initial_value_functor(), context); #else // __NVCC__ std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl; #endif // __NVCC__ @@ -200,7 +198,7 @@ namespace openfpm vector_data_type & vct_data_out, ite_gpu<1> & itew, block_functor & blf, - mgpu::ofp_context_t & context + gpu::ofp_context_t & context ) { #ifdef __NVCC__ @@ -268,7 +266,7 @@ namespace openfpm vector_index_type2 & segment_offset, vector_data_type & vector_data_red, block_functor & blf, - mgpu::ofp_context_t & context) + gpu::ofp_context_t & context) { } @@ -294,7 +292,7 @@ namespace openfpm vector_data_type & vct_data_out, ite_gpu<1> & itew, block_functor & blf, - mgpu::ofp_context_t & context + gpu::ofp_context_t & context ) { #ifdef __NVCC__ @@ -634,7 +632,7 @@ namespace openfpm block_functor & blf; //! gpu context - mgpu::ofp_context_t & context; + gpu::ofp_context_t & context; /*! \brief constructor * @@ -648,7 +646,7 @@ namespace openfpm vector_index_type & vector_data_map, vector_index_type2 & segment_offset, block_functor & blf, - mgpu::ofp_context_t & context) + gpu::ofp_context_t & context) :vector_data_red(vector_data_red), vector_data(vector_data), vector_data_unsorted(vector_data_unsorted), @@ -697,7 +695,7 @@ namespace openfpm vector_data_type &data1, vector_data_type &data2, vector_index_type &indices_tmp, vector_data_type &data_tmp, vector_index_type &keysOut, vector_data_type &dataOut, - mgpu::ofp_context_t & context) + gpu::ofp_context_t & context) { return true; } @@ -738,7 +736,7 @@ namespace openfpm vector_index_type & segment_offset; //! gpu context - mgpu::ofp_context_t & context; + gpu::ofp_context_t & context; /*! 
\brief constructor * @@ -749,7 +747,7 @@ namespace openfpm inline sparse_vector_special(vector_data_type & vector_data_red, vector_data_type & vector_data, vector_index_type & segment_offset, - mgpu::ofp_context_t & context) + gpu::ofp_context_t & context) :vector_data_red(vector_data_red),vector_data(vector_data),segment_offset(segment_offset),context(context) {}; @@ -881,7 +879,7 @@ namespace openfpm * \param vct_add_cont_index output continuos array of inserted indexes * \param vct_add_data array of added data * \param vct_add_data_cont continuos array of inserted data - * \param contect mgpu context + * \param contect gpu context * */ size_t make_continuos(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_nadd_index, @@ -890,7 +888,7 @@ namespace openfpm vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index_map, vector<T,Memory,layout_base,grow_p> & vct_add_data, vector<T,Memory,layout_base,grow_p> & vct_add_data_cont, - mgpu::ofp_context_t & context) + gpu::ofp_context_t & context) { #ifdef __NVCC__ @@ -970,7 +968,7 @@ namespace openfpm vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index_map, vector<T,Memory,layout_base,grow_p> & vct_add_data_reord, vector<T,Memory,layout_base,grow_p> & vct_add_data_cont, - mgpu::ofp_context_t & context) + gpu::ofp_context_t & context) { #ifdef __NVCC__ ite_gpu<1> itew; @@ -990,7 +988,7 @@ namespace openfpm (Ti *)vct_add_cont_index.template getDeviceBuffer<0>(), (Ti *)vct_add_cont_index_map.template getDeviceBuffer<0>(), vct_add_cont_index.size(), - mgpu::template less_t<Ti>(), + gpu::template less_t<Ti>(), context); auto ite = vct_add_cont_index.getGPUIterator(); @@ -1017,7 +1015,7 @@ namespace openfpm vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & vct_add_index_unique, vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index, vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index_map, - mgpu::ofp_context_t & context) + gpu::ofp_context_t & context) { #ifdef __NVCC__ @@ -1112,7 +1110,7 @@ namespace openfpm openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(), (Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_add_index_unique.size(), - (Ti *)vct_merge_index.template getDeviceBuffer<0>(),(Ti *)vct_merge_index_map.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context); + (Ti *)vct_merge_index.template getDeviceBuffer<0>(),(Ti *)vct_merge_index_map.template getDeviceBuffer<0>(),gpu::less_t<Ti>(),context); #endif @@ -1125,7 +1123,7 @@ namespace openfpm vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & segments_new, vector<T,Memory,layout_base,grow_p> & vct_add_data, vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_data_reord_map, - mgpu::ofp_context_t & context) + gpu::ofp_context_t & context) { #ifdef __NVCC__ ite_gpu<1> itew; @@ -1159,6 +1157,7 @@ namespace openfpm context); boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr); + vct_add_index_unique.remove(vct_add_index_unique.size()-1); } sparse_vector_special<typename std::remove_reference<decltype(vct_add_data)>::type, @@ -1204,7 +1203,7 @@ namespace openfpm void flush_on_gpu_insert(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0, vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1, vector<T,Memory,layout_base,grow_p> & vct_add_data_reord, - mgpu::ofp_context_t & context) + gpu::ofp_context_t & context) { #ifdef __NVCC__ @@ 
-1237,7 +1236,7 @@ namespace openfpm void flush_on_gpu_remove( - mgpu::ofp_context_t & context) + gpu::ofp_context_t & context) { #ifdef __NVCC__ @@ -1275,7 +1274,7 @@ namespace openfpm // now we sort openfpm::sort((Ti *)vct_add_index_cont_0.template getDeviceBuffer<0>(),(Ti *)vct_add_index_cont_1.template getDeviceBuffer<0>(), - vct_add_index_cont_0.size(), mgpu::template less_t<Ti>(), context); + vct_add_index_cont_0.size(), gpu::template less_t<Ti>(), context); auto ite = vct_add_index_cont_0.getGPUIterator(); @@ -1297,7 +1296,7 @@ namespace openfpm vct_add_index_unique.resize(n_ele_unique); openfpm::sort((Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(), - vct_add_index_unique.size(),mgpu::template less_t<Ti>(),context); + vct_add_index_unique.size(),gpu::template less_t<Ti>(),context); // Then we merge the two list vct_index and vct_add_index_unique @@ -1329,7 +1328,7 @@ namespace openfpm // openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(), (Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),vct_add_index_unique.size(), - (Ti *)vct_index_tmp.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp2.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context); + (Ti *)vct_index_tmp.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp2.template getDeviceBuffer<0>(),gpu::less_t<Ti>(),context); vct_index_tmp3.resize(128*itew.wthr.x); @@ -1377,7 +1376,7 @@ namespace openfpm void flush_on_gpu(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0, vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1, vector<T,Memory,layout_base,grow_p> & vct_add_data_reord, - mgpu::ofp_context_t & context) + gpu::ofp_context_t & context) { flush_on_gpu_insert<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context); } @@ -1747,7 +1746,7 @@ namespace openfpm */ template<typename ... v_reduce> void flush_v(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0, - mgpu::ofp_context_t & context, + gpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST, int i = 0) { @@ -1771,7 +1770,7 @@ namespace openfpm */ template<typename ... v_reduce> void flush_vd(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord, - mgpu::ofp_context_t & context, + gpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST) { // Eliminate background @@ -1791,7 +1790,7 @@ namespace openfpm * */ template<typename ... 
v_reduce> - void flush(mgpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST) + void flush(gpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST) { // Eliminate background vct_data.resize(vct_index.size()); @@ -1809,7 +1808,7 @@ namespace openfpm * \param opt options * */ - void flush_remove(mgpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST) + void flush_remove(gpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST) { vct_data.resize(vct_data.size()-1); diff --git a/src/Vector/map_vector_sparse_unit_tests.cu b/src/Vector/map_vector_sparse_unit_tests.cu index 474005b8..1b6e4f06 100644 --- a/src/Vector/map_vector_sparse_unit_tests.cu +++ b/src/Vector/map_vector_sparse_unit_tests.cu @@ -34,7 +34,7 @@ BOOST_AUTO_TEST_CASE ( test_sparse_vector_use ) vs.template insert<0>(35) = 35; vs.template insert<0>(28) = 28; - mgpu::ofp_context_t ctx; + gpu::ofp_context_t ctx; vs.template flush<sadd_<0>>(ctx); BOOST_REQUIRE_EQUAL(vs.get<0>(5),5); diff --git a/src/util/cuda/merge_ofp.cuh b/src/util/cuda/merge_ofp.cuh index 367da428..f5a917cb 100644 --- a/src/util/cuda/merge_ofp.cuh +++ b/src/util/cuda/merge_ofp.cuh @@ -13,8 +13,7 @@ #include "Vector/map_vector.hpp" #include "util/cuda_launch.hpp" - #if CUDART_VERSION >= 11000 - #ifndef CUDA_ON_CPU + #ifndef CUDA_ON_CPU // Here we have for sure CUDA >= 11 #ifdef __HIP__ #undef __CUDACC__ @@ -27,13 +26,7 @@ #include <thrust/merge.h> #include <thrust/execution_policy.h> #endif - #endif - #else - #include <thrust/merge.h> - #include <thrust/execution_policy.h> -// #include "util/cuda/moderngpu/kernel_merge.hxx" #endif - #include "util/cuda/ofp_context.hxx" namespace openfpm @@ -101,10 +94,6 @@ #else -// It seems broken on some CUDA on some hardware. Anyway is not anymore supported -// on some hardware ... 
we move to thrust -// mgpu::merge(a_keys,a_vals,a_count,b_keys,b_vals,b_count,c_keys,c_vals,comp,context); - thrust::merge_by_key(thrust::device, a_keys,a_keys + a_count, b_keys,b_keys + b_count, a_vals,b_vals, diff --git a/src/util/cuda/modern_gpu_tests.cu b/src/util/cuda/modern_gpu_tests.cu deleted file mode 100644 index 383d46ab..00000000 --- a/src/util/cuda/modern_gpu_tests.cu +++ /dev/null @@ -1,222 +0,0 @@ -#include "config.h" -#define BOOST_TEST_DYN_LINK -#include <boost/test/unit_test.hpp> - -#include "util/cuda_util.hpp" -#include "Vector/map_vector.hpp" - -#ifndef CUDA_ON_CPU - -#ifndef __HIP__ -#include "util/cuda/moderngpu/kernel_load_balance.hxx" -#include "util/cuda/moderngpu/kernel_mergesort.hxx" -#include "util/cuda/moderngpu/kernel_reduce.hxx" -#include "util/cuda/moderngpu/kernel_segreduce.hxx" - - -BOOST_AUTO_TEST_SUITE( modern_gpu_tests ) - -BOOST_AUTO_TEST_CASE( modern_gpu_loadbalance_lbs ) -{ - std::cout << "Test modern gpu test tansform_lbs" << "\n"; - - mgpu::standard_context_t context(false); - - int count = 200030; - int spacing = 100; - - int num_segments = mgpu::div_up(count, spacing); - openfpm::vector_gpu<aggregate<int>> segments(num_segments); - for(int i = 0; i < num_segments; ++i) - {segments.template get<0>(i) = i * spacing;} - - openfpm::vector_gpu<aggregate<int>> lbs(count); - - segments.template hostToDevice<0>(); - - mgpu::load_balance_search(count, (int *)segments.template getDeviceBuffer<0>(), num_segments, (int *)lbs.template getDeviceBuffer<0>(),context); - - lbs.deviceToHost<0>(); - - bool check = true; - for(size_t i = 0; i < lbs.size(); ++i) - { - check &= lbs.template get<0>(i) == i / spacing; - } - - BOOST_REQUIRE_EQUAL(check,true); - - std::cout << "End test modern gpu test tansform_lbs" << "\n"; - - // Test the cell list -} - -BOOST_AUTO_TEST_CASE( modern_gpu_sort ) -{ - std::cout << "Test modern gpu test tansform_lbs" << "\n"; - - mgpu::standard_context_t context(false); - - int count = 200030; - - openfpm::vector_gpu<aggregate<unsigned int,unsigned int>> vgpu; - openfpm::vector_gpu<aggregate<unsigned int>> gpu_ns; - - vgpu.resize(count); - gpu_ns.resize(count); - - for (size_t i = 0 ; i < count ; i++) - { - vgpu.template get<0>(i) = ((float)rand() / (float)RAND_MAX) * 17; - vgpu.template get<1>(i) = i; - - gpu_ns.template get<0>(i) = vgpu.template get<0>(i); - } - - vgpu.hostToDevice<0,1>(); - - mergesort((unsigned int *)vgpu.getDeviceBuffer<0>(),(unsigned int *)vgpu.getDeviceBuffer<1>(), count, mgpu::less_t<unsigned int>(), context); - - vgpu.deviceToHost<0,1>(); - - // print - - bool match = true; - for (int i = 0 ; i < count - 1 ; i++) - { - match &= vgpu.template get<0>(i) <= vgpu.template get<0>(i+1); - match &= gpu_ns.template get<0>(vgpu.template get<1>(i)) == vgpu.template get<0>(i); - } - - BOOST_REQUIRE_EQUAL(match,true); - - std::cout << "End test modern gpu test tansform_lbs" << "\n"; - - // Test the cell list -} - -BOOST_AUTO_TEST_CASE( modern_gpu_reduce ) -{ - std::cout << "Test modern gpu reduce" << "\n"; - - mgpu::standard_context_t context(false); - - int count = 200030; - - openfpm::vector_gpu<aggregate<int>> vgpu; - - vgpu.resize(count); - - for (size_t i = 0 ; i < count ; i++) - { - vgpu.template get<0>(i) = ((float)rand() / (float)RAND_MAX) * 17; - } - - vgpu.hostToDevice<0>(); - - CudaMemory mem; - mem.allocate(sizeof(int)); - mgpu::reduce((int *)vgpu.template getDeviceBuffer<0>(), count, (int *)mem.getDevicePointer(), mgpu::plus_t<int>(), context); - - mem.deviceToHost(); - int red_p = *(int *)mem.getPointer(); - - 
// print - - int red = 0; - for (int i = 0 ; i < count ; i++) - { - red += vgpu.template get<0>(i); - } - - BOOST_REQUIRE_EQUAL(red,red_p); - - std::cout << "End test modern gpu test reduce" << "\n"; - - // Test the cell list -} - - -BOOST_AUTO_TEST_CASE( modern_gpu_seg_reduce ) -{ - std::cout << "Test modern gpu segmented reduce" << "\n"; - - mgpu::standard_context_t context(false); - - int count = 130; - - openfpm::vector_gpu<aggregate<int>> vgpu; - openfpm::vector_gpu<aggregate<int>> segment_offset; - openfpm::vector_gpu<aggregate<int>> output; - int init = 0; - - vgpu.resize(count); - - for (size_t i = 0 ; i < count ; i++) - { - vgpu.template get<0>(i) = ((float)rand() / (float)RAND_MAX) * 17; - } - - segment_offset.add(); - segment_offset.template get<0>(0) = 0; - size_t base = 0; - while (1) - { - int c = ((float)rand() / (float)RAND_MAX) * 17; - - if (c + base >= count) - {break;} - - segment_offset.add(); - segment_offset.template get<0>(segment_offset.size() - 1) = c + segment_offset.template get<0>(segment_offset.size() - 2); - - base += c; - } - - vgpu.hostToDevice<0>(); - segment_offset.hostToDevice<0>(); - output.resize(segment_offset.size()); - - mgpu::segreduce((int *)vgpu.template getDeviceBuffer<0>(), vgpu.size(), - (int *)segment_offset.template getDeviceBuffer<0>(), segment_offset.size(), - (int *)output.template getDeviceBuffer<0>(), - mgpu::plus_t<int>(), init, context); - - - output.template deviceToHost<0>(); - - bool match = true; - size_t i = 0; - for ( ; i < segment_offset.size()-1 ; i++) - { - size_t red = 0; - for (size_t j = 0 ; j < segment_offset.template get<0>(i+1) - segment_offset.template get<0>(i) ; j++) - { - red += vgpu.template get<0>(segment_offset.template get<0>(i) + j); - } - match &= red == output.template get<0>(i); - } - - BOOST_REQUIRE_EQUAL(match,true); - - size_t red2 = 0; - for (size_t j = 0 ; j < vgpu.size() - segment_offset.template get<0>(i) ; j++) - { - red2 += vgpu.template get<0>(segment_offset.template get<0>(i) + j); - } - match &= red2 == output.template get<0>(i); - - BOOST_REQUIRE_EQUAL(match,true); - - std::cout << "End test modern gpu test reduce" << "\n"; - - // Test the cell list -} - - -BOOST_AUTO_TEST_SUITE_END() - -#endif - -#endif - diff --git a/src/util/cuda/moderngpu/context.hxx b/src/util/cuda/moderngpu/context.hxx deleted file mode 100644 index 93af53d9..00000000 --- a/src/util/cuda/moderngpu/context.hxx +++ /dev/null @@ -1,221 +0,0 @@ -#pragma once - -#include <vector> -#include <memory> -#include <cassert> -#include <exception> -#include "util.hxx" -#include "launch_params.hxx" - -BEGIN_MGPU_NAMESPACE - -enum memory_space_t { - memory_space_device = 0, - memory_space_host = 1 -}; - - -inline std::string device_prop_string(cudaDeviceProp prop) { - int ordinal; - cudaGetDevice(&ordinal); - - size_t freeMem, totalMem; - cudaError_t result = cudaMemGetInfo(&freeMem, &totalMem); - if(cudaSuccess != result) throw cuda_exception_t(result); - - double memBandwidth = (prop.memoryClockRate * 1000.0) * - (prop.memoryBusWidth / 8 * 2) / 1.0e9; - - std::string s = detail::stringprintf( - "%s : %8.3lf Mhz (Ordinal %d)\n" - "%d SMs enabled. 
Compute Capability sm_%d%d\n" - "FreeMem: %6dMB TotalMem: %6dMB %2d-bit pointers.\n" - "Mem Clock: %8.3lf Mhz x %d bits (%5.1lf GB/s)\n" - "ECC %s\n\n", - prop.name, prop.clockRate / 1000.0, ordinal, - prop.multiProcessorCount, prop.major, prop.minor, - (int)(freeMem / (1<< 20)), (int)(totalMem / (1<< 20)), 8 * sizeof(int*), - prop.memoryClockRate / 1000.0, prop.memoryBusWidth, memBandwidth, - prop.ECCEnabled ? "Enabled" : "Disabled"); - return s; -} - -//////////////////////////////////////////////////////////////////////////////// -// context_t -// Derive context_t to add support for streams and a custom allocator. - -struct context_t { - context_t() = default; - - // Disable copy ctor and assignment operator. We don't want to let the - // user copy only a slice. - context_t(const context_t& rhs) = delete; - context_t& operator=(const context_t& rhs) = delete; - - virtual const cudaDeviceProp& props() const = 0; - virtual int ptx_version() const = 0; - virtual cudaStream_t stream() = 0; - - // Alloc GPU memory. - virtual void* alloc(size_t size, memory_space_t space) = 0; - virtual void free(void* p, memory_space_t space) = 0; - - // cudaStreamSynchronize or cudaDeviceSynchronize for stream 0. - virtual void synchronize() = 0; - - virtual cudaEvent_t event() = 0; - virtual void timer_begin() = 0; - virtual double timer_end() = 0; -}; - -//////////////////////////////////////////////////////////////////////////////// -// standard_context_t is a trivial implementation of context_t. Users can -// derive this type to provide a custom allocator. - -class standard_context_t : public context_t { -protected: - cudaDeviceProp _props; - int _ptx_version; - cudaStream_t _stream; - - cudaEvent_t _timer[2]; - cudaEvent_t _event; - - // Making this a template argument means we won't generate an instance - // of dummy_k for each translation unit. - template<int dummy_arg = 0> - void init() { - cudaFuncAttributes attr; - cudaError_t result = cudaFuncGetAttributes(&attr, (void *)dummy_k<0>); - if(cudaSuccess != result) throw cuda_exception_t(result); - _ptx_version = attr.ptxVersion; - - int ord; - cudaGetDevice(&ord); - cudaGetDeviceProperties(&_props, ord); - - cudaEventCreate(&_timer[0]); - cudaEventCreate(&_timer[1]); - cudaEventCreate(&_event); - } - -public: - standard_context_t(bool print_prop = true, cudaStream_t stream_ = 0) : - context_t(), _stream(stream_) { - - init(); - if(print_prop) { - printf("%s\n", device_prop_string(_props).c_str()); - } - } - ~standard_context_t() { - cudaEventDestroy(_timer[0]); - cudaEventDestroy(_timer[1]); - cudaEventDestroy(_event); - } - - virtual const cudaDeviceProp& props() const { return _props; } - virtual int ptx_version() const { return _ptx_version; } - virtual cudaStream_t stream() { return _stream; } - - // Alloc GPU memory. - virtual void* alloc(size_t size, memory_space_t space) { - void* p = nullptr; - if(size) { - cudaError_t result = (memory_space_device == space) ? - cudaMalloc(&p, size) : - cudaMallocHost(&p, size); - if(cudaSuccess != result) throw cuda_exception_t(result); - } - return p; - } - - virtual void free(void* p, memory_space_t space) { - if(p) { - cudaError_t result = (memory_space_device == space) ? - cudaFree(p) : - cudaFreeHost(p); - if(cudaSuccess != result) throw cuda_exception_t(result); - } - } - - virtual void synchronize() { - cudaError_t result = _stream ? 
- cudaStreamSynchronize(_stream) : - cudaDeviceSynchronize(); - if(cudaSuccess != result) throw cuda_exception_t(result); - } - - virtual cudaEvent_t event() { - return _event; - } - virtual void timer_begin() { - cudaEventRecord(_timer[0], _stream); - } - virtual double timer_end() { - cudaEventRecord(_timer[1], _stream); - cudaEventSynchronize(_timer[1]); - float ms; - cudaEventElapsedTime(&ms, _timer[0], _timer[1]); - return ms / 1.0e3; - } -}; - -//////////////////////////////////////////////////////////////////////////////// -// mem_t - -template<typename type_t> -class mem_t { - context_t* _context; - type_t* _pointer; - size_t _size; - memory_space_t _space; - -public: - void swap(mem_t& rhs) { - std::swap(_context, rhs._context); - std::swap(_pointer, rhs._pointer); - std::swap(_size, rhs._size); - std::swap(_space, rhs._space); - } - - mem_t() : _context(nullptr), _pointer(nullptr), _size(0), - _space(memory_space_device) { } - mem_t& operator=(const mem_t& rhs) = delete; - mem_t(const mem_t& rhs) = delete; - - mem_t(size_t size, context_t& context, - memory_space_t space = memory_space_device) : - _context(&context), _pointer(nullptr), _size(size), _space(space) { - _pointer = (type_t*)context.alloc(sizeof(type_t) * size, space); - } - - mem_t(mem_t&& rhs) : mem_t() { - swap(rhs); - } - mem_t& operator=(mem_t&& rhs) { - swap(rhs); - return *this; - } - - ~mem_t() { - if(_context && _pointer) _context->free(_pointer, _space); - _pointer = nullptr; - _size = 0; - } - - context_t& context() { return *_context; } - size_t size() const { return _size; } - type_t* data() const { return _pointer; } - memory_space_t space() const { return _space; } - - // Return a deep copy of this container. - mem_t clone() { - mem_t cloned(size(), context(), space()); - if(memory_space_device) dtod(cloned.data(), data(), size()); - else htoh(cloned.data(), data(), size()); - return cloned; - } -}; - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/context_reduced.hxx b/src/util/cuda/moderngpu/context_reduced.hxx deleted file mode 100644 index 88544a7e..00000000 --- a/src/util/cuda/moderngpu/context_reduced.hxx +++ /dev/null @@ -1,107 +0,0 @@ -/* - * context_reduced.hxx - * - * Created on: Dec 27, 2018 - * Author: i-bird - */ - -#ifndef CONTEXT_REDUCED_HXX_ -#define CONTEXT_REDUCED_HXX_ - -#include <cstdarg> -#include <string> - - -namespace mgpu { - -enum memory_space_t { - memory_space_device = 0, - memory_space_host = 1 -}; - -struct cuda_exception_t : std::exception { - cudaError_t result; - - cuda_exception_t(cudaError_t result_) : result(result_) { } - virtual const char* what() const noexcept { - return cudaGetErrorString(result); - } -}; - -namespace detail { - -inline std::string stringprintf(const char* format, ...) { - va_list args; - va_start(args, format); - int len = vsnprintf(0, 0, format, args); - va_end(args); - - // allocate space. - std::string text; - text.resize(len); - - va_start(args, format); - vsnprintf(&text[0], len + 1, format, args); - va_end(args); - - return text; -} - -} // namespace detail - -inline std::string device_prop_string(cudaDeviceProp prop) { - int ordinal; - cudaGetDevice(&ordinal); - - size_t freeMem, totalMem; - cudaError_t result = cudaMemGetInfo(&freeMem, &totalMem); - if(cudaSuccess != result) throw cuda_exception_t(result); - - double memBandwidth = (prop.memoryClockRate * 1000.0) * - (prop.memoryBusWidth / 8 * 2) / 1.0e9; - - std::string s = detail::stringprintf( - "%s : %8.3lf Mhz (Ordinal %d)\n" - "%d SMs enabled. 
Compute Capability sm_%d%d\n" - "FreeMem: %6dMB TotalMem: %6dMB %2d-bit pointers.\n" - "Mem Clock: %8.3lf Mhz x %d bits (%5.1lf GB/s)\n" - "ECC %s\n\n", - prop.name, prop.clockRate / 1000.0, ordinal, - prop.multiProcessorCount, prop.major, prop.minor, - (int)(freeMem / (1<< 20)), (int)(totalMem / (1<< 20)), 8 * sizeof(int*), - prop.memoryClockRate / 1000.0, prop.memoryBusWidth, memBandwidth, - prop.ECCEnabled ? "Enabled" : "Disabled"); - return s; -} - -//////////////////////////////////////////////////////////////////////////////// -// context_t -// Derive context_t to add support for streams and a custom allocator. - -struct context_t { - context_t() = default; - - // Disable copy ctor and assignment operator. We don't want to let the - // user copy only a slice. - context_t(const context_t& rhs) = delete; - context_t& operator=(const context_t& rhs) = delete; - - virtual const cudaDeviceProp& props() const = 0; - virtual int ptx_version() const = 0; - virtual cudaStream_t stream() = 0; - - // Alloc GPU memory. - virtual void* alloc(size_t size, memory_space_t space) = 0; - virtual void free(void* p, memory_space_t space) = 0; - - // cudaStreamSynchronize or cudaDeviceSynchronize for stream 0. - virtual void synchronize() = 0; - - virtual cudaEvent_t event() = 0; - virtual void timer_begin() = 0; - virtual double timer_end() = 0; -}; - -} - -#endif /* CONTEXT_REDUCED_HXX_ */ diff --git a/src/util/cuda/moderngpu/cpp11.hxx b/src/util/cuda/moderngpu/cpp11.hxx deleted file mode 100644 index 7b0dad23..00000000 --- a/src/util/cuda/moderngpu/cpp11.hxx +++ /dev/null @@ -1,154 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once -#include "tuple.hxx" - -BEGIN_MGPU_NAMESPACE - -/////////////////////// -// tuple_iterator_value - -template<typename tpl_t> -struct tuple_iterator_value; - -template<typename... args_t> -struct tuple_iterator_value<tuple<args_t...> > { - typedef tuple<typename std::iterator_traits<args_t>::value_type...> type; -}; - -template<typename tpl_t> -using tuple_iterator_value_t = typename tuple_iterator_value<tpl_t>::type; - -//////////////////////////////////// -// load and store to pointer tuples. - -namespace detail { - -template<typename int_t, typename... pointers_t, size_t... seq_i> -MGPU_HOST_DEVICE auto _lvalue_dereference(tuple<pointers_t...> pointers, - index_sequence<seq_i...> seq, int_t index) -> - decltype(forward_as_tuple(get<seq_i>(pointers)[0]...)) { - - return forward_as_tuple(get<seq_i>(pointers)[index]...); -} - -} - -// Returns lvalues for each of the dereferenced pointers in the tuple. -template<typename int_t, typename... pointers_t> -MGPU_HOST_DEVICE auto dereference(tuple<pointers_t...> pointers, - int_t index) -> decltype(detail::_lvalue_dereference(pointers, - make_index_sequence<sizeof...(pointers_t)>(), index)) { - - return detail::_lvalue_dereference(pointers, - make_index_sequence<sizeof...(pointers_t)>(), index); -} - -template<typename int_t, typename... pointers_t> -MGPU_HOST_DEVICE void store(tuple<pointers_t...> pointers, - tuple_iterator_value_t<tuple<pointers_t...> > values, - int_t index) { - - dereference(pointers, index) = values; -} - -template<typename int_t, typename... 
pointers_t> -tuple_iterator_value_t<tuple<pointers_t...> > -MGPU_HOST_DEVICE load(tuple<pointers_t...> pointers, int_t index) { - typedef tuple_iterator_value_t<tuple<pointers_t...> > value_t; - return value_t(dereference(pointers, index)); -} - -///////////////////////////// -// Tuple comparison operators - -namespace detail { -template<size_t i, size_t count> -struct _tuple_compare { - template<typename tpl_t> - MGPU_HOST_DEVICE static bool eq(const tpl_t a, const tpl_t b) { - return get<i>(a) == get<i>(b) && _tuple_compare<i + 1, count>::eq(a, b); - } - - template<typename tpl_t> - MGPU_HOST_DEVICE static bool less(const tpl_t a, const tpl_t b) { - return get<i>(a) < get<i>(b) || - (!(get<i>(b) < get<i>(a)) && _tuple_compare<i + 1, count>::less(a, b)); - } -}; - -template<size_t count> -struct _tuple_compare<count, count> { - template<typename tpl_t> - MGPU_HOST_DEVICE static bool eq(const tpl_t, const tpl_t) { - return true; - } - - template<typename tpl_t> - MGPU_HOST_DEVICE static bool less(const tpl_t, const tpl_t) { - return false; - } -}; - -} // namespace detail - -////////////////////////////////////////////// -// Size of the largest component in the tuple. - -template<size_t... values> -struct var_max; - -template<size_t value_, size_t... values_> -struct var_max<value_, values_...> { - constexpr static size_t value = max(value_, var_max<values_...>::value); -}; - -template<size_t value_> -struct var_max<value_> { - constexpr static size_t value = value_; -}; - -template<> struct var_max<> { - constexpr static size_t value = 0; -}; - -template<typename tpl_t> -struct tuple_union_size; - -template<typename... args_t> -struct tuple_union_size<tuple<args_t...> > { - constexpr static size_t value = var_max<sizeof(args_t)...>::value; -}; - -END_MGPU_NAMESPACE - -// Putting comparison operators back into global namespace. -template<typename... args_t> -MGPU_HOST_DEVICE bool operator<(const mgpu::tuple<args_t...>& a, - const mgpu::tuple<args_t...>& b) { - return mgpu::detail::_tuple_compare<0, sizeof...(args_t)>::less(a, b); -} -template<typename... args_t> -MGPU_HOST_DEVICE bool operator<=(const mgpu::tuple<args_t...>& a, - const mgpu::tuple<args_t...>& b) { - return !(b < a); -} -template<typename... args_t> -MGPU_HOST_DEVICE bool operator>(const mgpu::tuple<args_t...>& a, - const mgpu::tuple<args_t...>& b) { - return b < a; -} -template<typename... args_t> -MGPU_HOST_DEVICE bool operator>=(const mgpu::tuple<args_t...>& a, - const mgpu::tuple<args_t...>& b) { - return !(a < b); -} -template<typename... args_t> -MGPU_HOST_DEVICE bool operator==(const mgpu::tuple<args_t...>& a, - const mgpu::tuple<args_t...>& b) { - return mgpu::detail::_tuple_compare<0, sizeof...(args_t)>::eq(a, b); -} -template<typename... args_t> -MGPU_HOST_DEVICE bool operator!=(const mgpu::tuple<args_t...>& a, - const mgpu::tuple<args_t...>& b) { - return !(a == b); -} diff --git a/src/util/cuda/moderngpu/cta_load_balance.hxx b/src/util/cuda/moderngpu/cta_load_balance.hxx deleted file mode 100644 index c397b789..00000000 --- a/src/util/cuda/moderngpu/cta_load_balance.hxx +++ /dev/null @@ -1,263 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once -#include "cta_merge.hxx" -#include "operators.hxx" -#include "cpp11.hxx" - -BEGIN_MGPU_NAMESPACE - -struct lbs_placement_t { - merge_range_t range; // The merge range of *loaded* values. - // May extend b_range one element in each direction. - int a_index; // Starting A index for merge. 
- int b_index; // Starting B index for merge. -}; - -template<int nt, int vt, typename segments_it> -MGPU_DEVICE lbs_placement_t cta_load_balance_place(int tid, - merge_range_t range, int count, segments_it segments, int num_segments, - int* b_shared) { - - // We want to know the value of the segment ID for the segment starting - // this tile. Load it by decrementing range.b_begin. - int load_preceding = 0 < range.b_begin; - range.b_begin -= load_preceding; - - // Load a trailing member of the segment ID array. This lets us read one past - // the last member: b_key = b_shared[++b0]. Note the use of prefix increment, - // which gets the beginning of the next identifier, not the current one. - if(range.b_end < num_segments && range.a_end < count) - ++range.b_end; - - int load_count = range.b_count(); - int fill_count = nt * vt + 1 + load_preceding - load_count - range.a_count(); - - // Fill the end of the array with dest_count. - for(int i = tid; i < fill_count; i += nt) - b_shared[load_count + i] = count; - - // Load the segments descriptors into the front of the indices array. - // TODO: SUBTRACT OUT A_BEGIN FROM B_BEGIN SO WE CAN DO 32-BIT COMPARISONS! - for(int i = tid; i < load_count; i += nt) - b_shared[i] = segments[range.b_begin + i]; - __syncthreads(); - - // Run a merge path search to find the start of the serial merge for - // each thread. If we loaded a preceding value from B, increment the - // cross-diagonal so that we don't redundantly process it. - int diag = vt * tid + load_preceding; - int mp = merge_path<bounds_upper>(counting_iterator_t<int>(range.a_begin), - range.a_count(), b_shared, load_count + fill_count, diag, less_t<int>()); - __syncthreads(); - - // Get the starting points for the merge for A and B. Why do we subtract 1 - // from B? At the start of the array, we are pointing to output 0 and - // segment 0. But we don't really start merging A until we've encountered - // its start flag at B. That is, the first iteration should increment b_index - // to 0, then start merging from the first segment of A, so b_index needs to - // start at -1. - int a_index = range.a_begin + mp; - int b_index = range.b_begin + (diag - mp) - 1; - - return lbs_placement_t { - range, a_index, b_index - }; -} - -struct lbs_fill_t { - merge_range_t range; - int b_offset; -}; - -template<int nt, int vt, typename segments_it, typename partition_it> -MGPU_DEVICE lbs_fill_t cta_load_balance_fill(int count, - segments_it segments, int num_segments, int tid, int cta, - partition_it partitions, int* shared) { - - merge_range_t range = compute_merge_range(count, num_segments, cta, - nt * vt, partitions[cta], partitions[cta + 1]); - - int* a_shared = shared - range.a_begin; - int* b_shared = shared + range.a_count(); - - lbs_placement_t placement = cta_load_balance_place<nt, vt>(tid, range, - count, segments, num_segments, b_shared); - - // Adjust the b pointer by the loaded b_begin. This lets us dereference it - // directly with the segment index. - b_shared -= placement.range.b_begin; - - // Fill shared memory with the segment IDs of the in-range values. 
- int cur_item = placement.a_index; - int cur_segment = placement.b_index; - - iterate<vt>([&](int i) { - bool p = cur_item < b_shared[cur_segment + 1]; - if(p) a_shared[cur_item++] = cur_segment; - else ++cur_segment; - }); - __syncthreads(); - - return lbs_fill_t { - range, - range.a_count() - placement.range.b_begin - }; -} - -template<int nt, int vt> -struct cta_load_balance_t { - enum { nv = nt * vt }; - struct storage_t { - int indices[nv + 2]; - }; - - struct result_t { - lbs_placement_t placement; - merge_range_t merge_range; - - // thread-order data. - int merge_flags; - - // strided-order data. - array_t<int, vt> indices; - array_t<int, vt> segments; - array_t<int, vt> ranks; - }; - - template<typename segments_it, typename partition_it> - MGPU_DEVICE result_t load_balance(int count, segments_it segments, - int num_segments, int tid, int cta, partition_it partitions, - storage_t& storage) const { - - merge_range_t range = compute_merge_range(count, num_segments, cta, - nv, partitions[cta], partitions[cta + 1]); - - int* a_shared = storage.indices - range.a_begin; - int* b_shared = storage.indices + range.a_count(); - - lbs_placement_t placement = cta_load_balance_place<nt, vt>(tid, range, - count, segments, num_segments, b_shared); - - // Adjust the b pointer by the loaded b_begin. This lets us dereference it - // directly with the segment index. - b_shared -= placement.range.b_begin; - - // Store the segment of each element in A. - int cur_item = placement.a_index; - int cur_segment = placement.b_index; - int merge_flags = 0; - - // Fill shared memory with the segment IDs of the in-range values. - iterate<vt + 1>([&](int i) { - // Compare the output index to the starting position of the next segment. - bool p = cur_item < b_shared[cur_segment + 1]; - if(p && i < vt) // Advance A (the needle). - a_shared[cur_item++] = cur_segment; - else // Advance B (the haystack) - ++cur_segment; - merge_flags |= (int)p<< i; - }); - __syncthreads(); - - // Load the segment indices in strided order. Use the segment ID to compute - // rank of each element. These strided-order (index, seg, rank) tuples - // will be passed to the lbs functor. - array_t<int, vt> indices, seg, ranks; - iterate<vt>([&](int i) { - int j = nt * i + tid; - indices[i] = range.a_begin + j; - if(j < range.a_count()) { - seg[i] = storage.indices[j]; - ranks[i] = indices[i] - b_shared[seg[i]]; - } else { - seg[i] = range.b_begin; - ranks[i] = -1; - } - }); - __syncthreads(); - - return result_t { - placement, range, merge_flags, - indices, seg, ranks - }; - } -}; - - -namespace detail { - -template<int nt, typename pointers_t> -struct cached_segment_load_t { - - enum { size = tuple_size<pointers_t>:: value }; - typedef make_index_sequence<size> seq_t; - typedef tuple_iterator_value_t<pointers_t> value_t; - - template<typename seq_t> - struct load_storage_t; - - template<size_t... seq_i> - struct load_storage_t<index_sequence<seq_i...> > { - tuple< - array_t<typename tuple_element<seq_i, value_t>::type, nt>... 
- > data; - - MGPU_HOST_DEVICE void store_value(const value_t& value, int index) { - swallow(get<seq_i>(data)[index] = get<seq_i>(value)...); - } - - MGPU_HOST_DEVICE value_t load_value(int index) const { - return make_tuple(get<seq_i>(data)[index]...); - } - }; - - typedef load_storage_t<seq_t> storage_t; - - template<int vt0, int vt> - MGPU_DEVICE static array_t<value_t, vt> load(int tid, int count, - range_t range, array_t<int, vt> segments, storage_t& storage, - pointers_t iterators) { - - array_t<value_t, vt> loaded; - if(range.count() <= nt) { - // Cached load through shared memory. - if(tid < range.count()) { - value_t value = mgpu::load(iterators, range.begin + tid); - storage.store_value(value, tid); - } - __syncthreads(); - - // Load the values into register. - strided_iterate<nt, vt, vt0>([&](int i, int j) { - loaded[i] = storage.load_value(segments[i] - range.begin); - }, tid, count); - __syncthreads(); - - } else { - // Direct load. - strided_iterate<nt, vt, vt0>([&](int i, int j) { - loaded[i] = mgpu::load(iterators, segments[i]); - }, tid, count); - } - - return loaded; - } -}; - -template<int nt> -struct cached_segment_load_t<nt, tuple<> > { - typedef empty_t storage_t; - typedef tuple<> value_t; - - template<int vt0, int vt> - MGPU_DEVICE static array_t<value_t, vt> load(int tid, int count, - range_t range, array_t<int, vt> segments, storage_t& storage, - tuple<> iterators) { - - return array_t<value_t, vt>(); - } -}; - -} // namespace detail - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/cta_merge.hxx b/src/util/cuda/moderngpu/cta_merge.hxx deleted file mode 100644 index 9ff38db3..00000000 --- a/src/util/cuda/moderngpu/cta_merge.hxx +++ /dev/null @@ -1,209 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "loadstore.hxx" - -BEGIN_MGPU_NAMESPACE - -template<bounds_t bounds = bounds_lower, typename a_keys_it, - typename b_keys_it, typename int_t, typename comp_t> -MGPU_HOST_DEVICE int_t merge_path(a_keys_it a_keys, int_t a_count, - b_keys_it b_keys, int_t b_count, int_t diag, comp_t comp) { - - typedef typename std::iterator_traits<a_keys_it>::value_type type_t; - int_t begin = max(0, diag - b_count); - int_t end = min(diag, a_count); - - while(begin < end) { - int_t mid = (begin + end) / 2; - type_t a_key = a_keys[mid]; - type_t b_key = b_keys[diag - 1 - mid]; - bool pred = (bounds_upper == bounds) ? - comp(a_key, b_key) : - !comp(b_key, a_key); - - if(pred) begin = mid + 1; - else end = mid; - } - return begin; -} - -template<bounds_t bounds, typename keys_it, typename comp_t> -MGPU_HOST_DEVICE int merge_path(keys_it keys, merge_range_t range, - int diag, comp_t comp) { - - return merge_path<bounds>( - keys + range.a_begin, range.a_count(), - keys + range.b_begin, range.b_count(), - diag, comp); -} - -template<bounds_t bounds, bool range_check, typename type_t, typename comp_t> -MGPU_HOST_DEVICE bool merge_predicate(type_t a_key, type_t b_key, - merge_range_t range, comp_t comp) { - - bool p; - if(range_check && !range.a_valid()) p = false; - else if(range_check && !range.b_valid()) p = true; - else p = (bounds_upper == bounds) ? 
comp(a_key, b_key) : !comp(b_key, a_key); - return p; -} - -MGPU_HOST_DEVICE merge_range_t compute_merge_range(int a_count, int b_count, - int partition, int spacing, int mp0, int mp1) { - - int diag0 = spacing * partition; - int diag1 = min(a_count + b_count, diag0 + spacing); - - return merge_range_t { mp0, mp1, diag0 - mp0, diag1 - mp1 }; -} - - -// Specialization that emits just one LD instruction. Can only reliably used -// with raw pointer types. Fixed not to use pointer arithmetic so that -// we don't get undefined behaviors with unaligned types. -template<int nt, int vt, typename type_t> -MGPU_DEVICE array_t<type_t, vt> -load_two_streams_reg(const type_t* a, int a_count, - const type_t* b, int b_count, int tid) { - - b -= a_count; - array_t<type_t, vt> x; - strided_iterate<nt, vt>([&](int i, int index) { - const type_t* p = (index >= a_count) ? b : a; - x[i] = p[index]; - }, tid, a_count + b_count); - - return x; -} - -template<int nt, int vt, typename type_t, typename a_it, typename b_it> -MGPU_DEVICE -enable_if_t< - !(std::is_pointer<a_it>::value && std::is_pointer<b_it>::value), - array_t<type_t, vt> -> load_two_streams_reg(a_it a, int a_count, b_it b, int b_count, int tid) { - b -= a_count; - array_t<type_t, vt> x; - strided_iterate<nt, vt>([&](int i, int index) { - x[i] = (index < a_count) ? a[index] : b[index]; - }, tid, a_count + b_count); - return x; -} - -template<int nt, int vt, typename a_it, typename b_it, typename type_t, - int shared_size> -MGPU_DEVICE void load_two_streams_shared(a_it a, int a_count, - b_it b, int b_count, int tid, type_t (&shared)[shared_size], - bool sync = true) { - - // Load into register then make an unconditional strided store into memory. - array_t<type_t, vt> x = load_two_streams_reg<nt, vt, type_t>( - a, a_count, b, b_count, tid); - reg_to_shared_strided<nt>(x, tid, shared, sync); -} - -template<int nt, int vt, typename type_t> -MGPU_DEVICE array_t<type_t, vt> gather_two_streams_strided(const type_t* a, - int a_count, const type_t* b, int b_count, array_t<int, vt> indices, - int tid) { - - ptrdiff_t b_offset = b - a - a_count; - int count = a_count + b_count; - - array_t<type_t, vt> x; - strided_iterate<nt, vt>([&](int i, int j) { - ptrdiff_t gather = indices[i]; - if(gather >= a_count) gather += b_offset; - x[i] = a[gather]; - }, tid, count); - - return x; -} -template<int nt, int vt, typename type_t, typename a_it, typename b_it> -MGPU_DEVICE -enable_if_t< - !(std::is_pointer<a_it>::value && std::is_pointer<b_it>::value), - array_t<type_t, vt> -> gather_two_streams_strided(a_it a, - int a_count, b_it b, int b_count, array_t<int, vt> indices, int tid) { - - b -= a_count; - array_t<type_t, vt> x; - strided_iterate<nt, vt>([&](int i, int j) { - x[i] = (indices[i] < a_count) ? a[indices[i]] : b[indices[i]]; - }, tid, a_count + b_count); - - return x; -} - -template<int nt, int vt, typename a_it, typename b_it, typename c_it> -MGPU_DEVICE void transfer_two_streams_strided(a_it a, int a_count, b_it b, - int b_count, array_t<int, vt> indices, int tid, c_it c) { - - typedef typename std::iterator_traits<a_it>::value_type type_t; - array_t<type_t, vt> x = gather_two_streams_strided<nt, vt, type_t>(a, - a_count, b, b_count, indices, tid); - - reg_to_mem_strided<nt>(x, tid, a_count + b_count, c); -} - - -// This function must be able to dereference keys[a_begin] and keys[b_begin], -// no matter the indices for each. 
The caller should allocate at least -// nt * vt + 1 elements for -template<bounds_t bounds, int vt, typename type_t, typename comp_t> -MGPU_DEVICE merge_pair_t<type_t, vt> -serial_merge(const type_t* keys_shared, merge_range_t range, comp_t comp, - bool sync = true) { - - type_t a_key = keys_shared[range.a_begin]; - type_t b_key = keys_shared[range.b_begin]; - - merge_pair_t<type_t, vt> merge_pair; - iterate<vt>([&](int i) { - bool p = merge_predicate<bounds, true>(a_key, b_key, range, comp); - int index = p ? range.a_begin : range.b_begin; - - merge_pair.keys[i] = p ? a_key : b_key; - merge_pair.indices[i] = index; - - type_t c_key = keys_shared[++index]; - if(p) a_key = c_key, range.a_begin = index; - else b_key = c_key, range.b_begin = index; - }); - - if(sync) __syncthreads(); - return merge_pair; -} - -// Load arrays a and b from global memory and merge into register. -template<bounds_t bounds, int nt, int vt, typename a_it, typename b_it, - typename type_t, typename comp_t, int shared_size> -MGPU_DEVICE merge_pair_t<type_t, vt> -cta_merge_from_mem(a_it a, b_it b, merge_range_t range_mem, int tid, - comp_t comp, type_t (&keys_shared)[shared_size]) { - - static_assert(shared_size >= nt * vt + 1, - "cta_merge_from_mem requires temporary storage of at " - "least nt * vt + 1 items"); - - // Load the data into shared memory. - load_two_streams_shared<nt, vt>(a + range_mem.a_begin, range_mem.a_count(), - b + range_mem.b_begin, range_mem.b_count(), tid, keys_shared, true); - - // Run a merge path to find the start of the serial merge for each thread. - merge_range_t range_local = range_mem.to_local(); - int diag = vt * tid; - int mp = merge_path<bounds>(keys_shared, range_local, diag, comp); - - // Compute the ranges of the sources in shared memory. The end iterators - // of the range are inaccurate, but still facilitate exact merging, because - // only vt elements will be merged. 
- merge_pair_t<type_t, vt> merged = serial_merge<bounds, vt>(keys_shared, - range_local.partition(mp, diag), comp); - - return merged; -}; - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/cta_mergesort.hxx b/src/util/cuda/moderngpu/cta_mergesort.hxx deleted file mode 100644 index 69a60fa8..00000000 --- a/src/util/cuda/moderngpu/cta_mergesort.hxx +++ /dev/null @@ -1,140 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "cta_merge.hxx" -#include "sort_networks.hxx" - -BEGIN_MGPU_NAMESPACE - -MGPU_HOST_DEVICE int out_of_range_flags(int first, int vt, int count) { - int out_of_range = min(vt, first + vt - count); - int head_flags = 0; - if(out_of_range > 0) { - const int mask = (1<< vt) - 1; - head_flags = mask & (~mask>> out_of_range); - } - return head_flags; -} - -MGPU_HOST_DEVICE merge_range_t compute_mergesort_frame(int partition, - int coop, int spacing) { - - int size = spacing * (coop / 2); - int start = ~(coop - 1) & partition; - int a_begin = spacing * start; - int b_begin = spacing * start + size; - - return merge_range_t { - a_begin, - a_begin + size, - b_begin, - b_begin + size - }; -} - -MGPU_HOST_DEVICE merge_range_t compute_mergesort_range(int count, - int partition, int coop, int spacing) { - - merge_range_t frame = compute_mergesort_frame(partition, coop, spacing); - - return merge_range_t { - frame.a_begin, - min(count, frame.a_end), - min(count, frame.b_begin), - min(count, frame.b_end) - }; -} - -MGPU_HOST_DEVICE merge_range_t compute_mergesort_range(int count, - int partition, int coop, int spacing, int mp0, int mp1) { - - merge_range_t range = compute_mergesort_range(count, partition, - coop, spacing); - - // Locate the diagonal from the start of the A sublist. - int diag = spacing * partition - range.a_begin; - - // The end partition of the last cta for each merge operation is computed - // and stored as the begin partition for the subsequent merge. i.e. it is - // the same partition but in the wrong coordinate system, so its 0 when it - // should be listSize. Correct that by checking if this is the last cta - // in this merge operation. - if(coop - 1 != ((coop - 1) & partition)) { - range.a_end = range.a_begin + mp1; - range.b_end = min(count, range.b_begin + diag + spacing - mp1); - } - - range.a_begin = range.a_begin + mp0; - range.b_begin = min(count, range.b_begin + diag - mp0); - - return range; -} - -template<int nt, int vt, typename key_t, typename val_t> -struct cta_sort_t { - enum { - has_values = !std::is_same<val_t, empty_t>::value, - num_passes = s_log2(nt) - }; - - union storage_t { - key_t keys[nt * vt + 1]; - val_t vals[nt * vt]; - }; - - static_assert(is_pow2(nt), "cta_sort_t requires pow2 number of threads"); - - template<typename comp_t> - MGPU_DEVICE kv_array_t<key_t, val_t, vt> - merge_pass(kv_array_t<key_t, val_t, vt> x, int tid, int count, - int pass, comp_t comp, storage_t& storage) const { - - // Divide the CTA's keys into lists. - int coop = 2<< pass; - merge_range_t range = compute_mergesort_range(count, tid, coop, vt); - int diag = vt * tid - range.a_begin; - - // Store the keys into shared memory for searching. - reg_to_shared_thread<nt, vt>(x.keys, tid, storage.keys); - - // Search for the merge path for this thread within its list. - int mp = merge_path<bounds_lower>(storage.keys, range, diag, comp); - - // Run a serial merge and return. 
- merge_pair_t<key_t, vt> merge = serial_merge<bounds_lower, vt>( - storage.keys, range.partition(mp, diag), comp); - x.keys = merge.keys; - - if(has_values) { - // Reorder values through shared memory. - reg_to_shared_thread<nt, vt>(x.vals, tid, storage.vals); - x.vals = shared_gather<nt, vt>(storage.vals, merge.indices); - } - - return x; - } - - template<typename comp_t> - MGPU_DEVICE kv_array_t<key_t, val_t, vt> - block_sort(kv_array_t<key_t, val_t, vt> x, int tid, int count, - comp_t comp, storage_t& storage) const { - - // Sort the inputs within each thread. If any threads have fewer than - // vt items, use the segmented sort network to prevent out-of-range - // elements from contaminating the sort. - if(count < nt * vt) { - int head_flags = out_of_range_flags(vt * tid, vt, count); - x = odd_even_sort(x, comp, head_flags); - } else - x = odd_even_sort(x, comp); - - // Merge threads starting with a pair until all values are merged. - for(int pass = 0; pass < num_passes; ++pass) - x = merge_pass(x, tid, count, pass, comp, storage); - - return x; - } -}; - - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/cta_reduce.hxx b/src/util/cuda/moderngpu/cta_reduce.hxx deleted file mode 100644 index 0b377c62..00000000 --- a/src/util/cuda/moderngpu/cta_reduce.hxx +++ /dev/null @@ -1,134 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once -#include "loadstore.hxx" -#include "intrinsics.hxx" - -BEGIN_MGPU_NAMESPACE - -// requires __CUDA_ARCH__ >= 300. -// warp_size can be any power-of-two <= warp_size. -// warp_reduce_t returns the reduction only in lane 0. -template<typename type_t, int group_size> -struct shfl_reduce_t { - - static_assert(group_size <= warp_size && is_pow2(group_size), - "shfl_reduce_t must operate on a pow2 number of threads <= warp_size (32)"); - enum { num_passes = s_log2(group_size) }; - - template<typename op_t = plus_t<type_t> > - MGPU_DEVICE type_t reduce(int lane, type_t x, int count, op_t op = op_t()) { - if(count == group_size) { - iterate<num_passes>([&](int pass) { - int offset = 1<< pass; - x = shfl_down_op(x, offset, op, group_size); - }); - } else { - iterate<num_passes>([&](int pass) { - int offset = 1<< pass; - type_t y = shfl_down(x, offset, group_size); - if(lane + offset < count) x = op(x, y); - }); - } - return x; - } -}; - -// cta_reduce_t returns the reduction of all inputs for thread 0, and returns -// type_t() for all other threads. This behavior saves a broadcast. - -template<int nt, typename type_t> -struct cta_reduce_t { - - enum { - group_size = min(nt, (int)warp_size), - num_passes = s_log2(group_size), - num_items = nt / group_size - }; - - static_assert(0 == nt % warp_size, - "cta_reduce_t requires num threads to be a multiple of warp_size (32)"); - - struct storage_t { - struct { type_t data[max(nt, 2 * group_size)]; }; - }; - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - - typedef shfl_reduce_t<type_t, group_size> group_reduce_t; - - template<typename op_t = plus_t<type_t> > - MGPU_DEVICE type_t reduce(int tid, type_t x, storage_t& storage, - int count = nt, op_t op = op_t(), bool all_return = true) const { - - // Store your data into shared memory. - storage.data[tid] = x; - __syncthreads(); - - if(tid < group_size) { - // Each thread scans within its lane. - strided_iterate<group_size, num_items>([&](int i, int j) { - if(i > 0) x = op(x, storage.data[j]); - }, tid, count); - - // Cooperative reduction. 
- x = group_reduce_t().reduce(tid, x, min(count, (int)group_size), op); - - if(all_return) storage.data[tid] = x; - } - __syncthreads(); - - if(all_return) { - x = storage.data[0]; - __syncthreads(); - } - return x; - } - -#else - - template<typename op_t = plus_t<type_t> > - MGPU_DEVICE type_t reduce(int tid, type_t x, storage_t& storage, - int count = nt, op_t op = op_t(), bool all_return = true) const { - - // Store your data into shared memory. - storage.data[tid] = x; - __syncthreads(); - - if(tid < group_size) { - // Each thread scans within its lane. - strided_iterate<group_size, num_items>([&](int i, int j) { - type_t y = storage.data[j]; - if(i > 0) x = op(x, y); - }, tid, count); - storage.data[tid] = x; - } - __syncthreads(); - - int count2 = min(count, int(group_size)); - int first = (1 & num_passes) ? group_size : 0; - if(tid < group_size) - storage.data[first + tid] = x; - __syncthreads(); - - iterate<num_passes>([&](int pass) { - if(tid < group_size) { - int offset = 1 << pass; - if(tid + offset < count2) - x = op(x, storage.data[first + offset + tid]); - first = group_size - first; - storage.data[first + tid] = x; - } - __syncthreads(); - }); - - if(all_return) { - x = storage.data[0]; - __syncthreads(); - } - return x; - } - -#endif -}; - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/cta_scan.hxx b/src/util/cuda/moderngpu/cta_scan.hxx deleted file mode 100644 index f690157e..00000000 --- a/src/util/cuda/moderngpu/cta_scan.hxx +++ /dev/null @@ -1,231 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once -#include "loadstore.hxx" -#include "intrinsics.hxx" - -BEGIN_MGPU_NAMESPACE - -enum scan_type_t { - scan_type_exc, - scan_type_inc -}; - -template<typename type_t, int vt = 0, bool is_array = (vt > 0)> -struct scan_result_t { - type_t scan; - type_t reduction; -}; - -template<typename type_t, int vt> -struct scan_result_t<type_t, vt, true> { - array_t<type_t, vt> scan; - type_t reduction; -}; - -//////////////////////////////////////////////////////////////////////////////// - -template<int nt, typename type_t> -struct cta_scan_t { - enum { num_warps = nt / warp_size, capacity = nt + num_warps }; - union storage_t { - type_t data[2 * nt]; - struct { type_t threads[nt], warps[num_warps]; }; - }; - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - - ////////////////////////////////////////////////////////////////////////////// - // Optimized CTA scan code that uses warp shfl intrinsics. - // Shfl is used for all data types, not just 4-byte built-in types, however - // those have accelerated plus, maximum and minimum operators. - - template<typename op_t = plus_t<type_t> > - MGPU_DEVICE scan_result_t<type_t> - scan(int tid, type_t x, storage_t& storage, int count = nt, op_t op = op_t(), - type_t init = type_t(), scan_type_t type = scan_type_exc) const { - - int warp = tid / warp_size; - - // Scan each warp using shfl_add. - type_t warp_scan = x; - iterate<s_log2(warp_size)>([&](int pass) { - warp_scan = shfl_up_op(warp_scan, 1<< pass, op, warp_size); - }); - - // Store the intra-warp scans. - storage.threads[tid] = warp_scan; - - // Store the reduction (last element) of each warp into storage. - if(min(warp_size * (warp + 1), count) - 1 == tid) - storage.warps[warp] = warp_scan; - __syncthreads(); - - // Scan the warp reductions. 
- if(tid < num_warps) { - type_t cta_scan = storage.warps[tid]; - iterate<s_log2(num_warps)>([&](int pass) { - cta_scan = shfl_up_op(cta_scan, 1<< pass, op, num_warps); - }); - storage.warps[tid] = cta_scan; - } - __syncthreads(); - - type_t scan = warp_scan; - if(scan_type_exc == type) { - scan = tid ? storage.threads[tid - 1] : init; - warp = (tid - 1) / warp_size; - } - if(warp > 0) scan = op(scan, storage.warps[warp - 1]); - - type_t reduction = storage.warps[div_up(count, warp_size) - 1]; - - scan_result_t<type_t> result { - tid < count ? scan : reduction, - reduction - }; - __syncthreads(); - - return result; - } - -#else - - ////////////////////////////////////////////////////////////////////////////// - // Standard CTA scan code that does not use shfl intrinsics. - - template<typename op_t = plus_t<type_t> > - MGPU_DEVICE scan_result_t<type_t> - scan(int tid, type_t x, storage_t& storage, int count = nt, op_t op = op_t(), - type_t init = type_t(), scan_type_t type = scan_type_exc) const { - - int first = 0; - storage.data[first + tid] = x; - __syncthreads(); - - iterate<s_log2(nt)>([&](int pass) { - int offset = 1<< pass; - if(tid >= offset) - x = op(storage.data[first + tid - offset], x); - first = nt - first; - storage.data[first + tid] = x; - __syncthreads(); - }); - - scan_result_t<type_t> result; - result.reduction = storage.data[first + count - 1]; - result.scan = (tid < count) ? - (scan_type_inc == type ? x : - (tid ? storage.data[first + tid - 1] : init)) : - result.reduction; - __syncthreads(); - - return result; - } - -#endif - - ////////////////////////////////////////////////////////////////////////////// - // CTA vectorized scan. Accepts multiple values per thread and adds in - // optional global carry-in. - - template<int vt, typename op_t = plus_t<type_t> > - MGPU_DEVICE scan_result_t<type_t, vt> - scan(int tid, array_t<type_t, vt> x, storage_t& storage, - type_t carry_in = type_t(), bool use_carry_in = false, - int count = nt, op_t op = op_t(), type_t init = type_t(), - scan_type_t type = scan_type_exc) const { - - // Start with an inclusive scan of the in-range elements. - if(count >= nt * vt) { - iterate<vt>([&](int i) { - x[i] = i ? op(x[i], x[i - 1]) : x[i]; - }); - } else { - iterate<vt>([&](int i) { - int index = vt * tid + i; - x[i] = i ? - ((index < count) ? op(x[i], x[i - 1]) : x[i - 1]) : - (x[i] = (index < count) ? x[i] : init); - }); - } - - // Scan the thread-local reductions for a carry-in for each thread. - scan_result_t<type_t> result = scan(tid, x[vt - 1], storage, - div_up(count, vt), op, init, scan_type_exc); - - // Perform the scan downsweep and add both the global carry-in and the - // thread carry-in to the values. - if(use_carry_in) { - result.reduction = op(carry_in, result.reduction); - result.scan = tid ? op(carry_in, result.scan) : carry_in; - } else - use_carry_in = tid > 0; - - array_t<type_t, vt> y; - iterate<vt>([&](int i) { - if(scan_type_exc == type) { - y[i] = i ? x[i - 1] : result.scan; - if(use_carry_in && i > 0) y[i] = op(result.scan, y[i]); - } else { - y[i] = use_carry_in ? op(x[i], result.scan) : x[i]; - } - }); - - return scan_result_t<type_t, vt> { y, result.reduction }; - } -}; - -//////////////////////////////////////////////////////////////////////////////// -// Overload for scan of bools. 
- -template<int nt> -struct cta_scan_t<nt, bool> { - enum { num_warps = nt / warp_size }; - struct storage_t { - int warps[num_warps]; - }; - - MGPU_DEVICE scan_result_t<int> scan(int tid, bool x, - storage_t& storage) const { - - // Store the bit totals for each warp. - int lane = (warp_size - 1) & tid; - int warp = tid / warp_size; - - int bits = ballot(x); - storage.warps[warp] = popc(bits); - __syncthreads(); - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - if(tid < num_warps) { - // Cooperative warp scan of partial reductions. - int scan = storage.warps[tid]; - iterate<s_log2(num_warps)>([&](int i) { - scan = shfl_up_op(scan, 1<< i, plus_t<int>(), num_warps); - }); - storage.warps[tid] = scan; - } - __syncthreads(); -#else - - if(0 == tid) { - // Inclusive scan of partial reductions.. - int scan = 0; - iterate<num_warps>([&](int i) { - storage.warps[i] = scan += storage.warps[i]; - }); - } - __syncthreads(); - -#endif - - int scan = ((warp > 0) ? storage.warps[warp - 1] : 0) + - popc(bfe(bits, 0, lane)); - int reduction = storage.warps[num_warps - 1]; - __syncthreads(); - - return scan_result_t<int> { scan, reduction }; - } -}; - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/cta_search.hxx b/src/util/cuda/moderngpu/cta_search.hxx deleted file mode 100644 index 8ff23f49..00000000 --- a/src/util/cuda/moderngpu/cta_search.hxx +++ /dev/null @@ -1,100 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "cta_merge.hxx" - -BEGIN_MGPU_NAMESPACE - -template<bounds_t bounds, typename keys_it, typename int_t, typename key_t, - typename comp_t> -MGPU_HOST_DEVICE int_t binary_search(keys_it keys, int_t count, key_t key, - comp_t comp) { - - int_t begin = 0; - int_t end = count; - while(begin < end) { - int_t mid = (begin + end) / 2; - key_t key2 = keys[mid]; - bool pred = (bounds_upper == bounds) ? - !comp(key, key2) : - comp(key2, key); - if(pred) begin = mid + 1; - else end = mid; - } - return begin; -} - -//////////////////////////////////////////////////////////////////////////////// -// TODO: Implement a moderngpu V1 style vectorized sorted search. - -template<typename type_t, int vt> -struct search_result_t { - array_t<type_t, vt> keys; - array_t<int, vt> indices; - int decisions; // Set a bit if this iteration has progressed A. - int matches_a; // A set flag for a match on each iteration. - int matches_b; -}; - -template<int vt, bounds_t bounds, bool range_check, typename type_t, - typename comp_t> -MGPU_DEVICE search_result_t<type_t, vt> -serial_search(const type_t* keys_shared, merge_range_t range, - int a_offset, int b_offset, comp_t comp, bool sync = true) { - - type_t a_key = keys_shared[range.a_begin]; - type_t b_key = keys_shared[range.b_begin]; - type_t a_prev = type_t(), b_prev = type_t(); - - int a_start = 0; - int b_start = range.a_end; // Assume the b_keys start right after the end - // of the a_keys. - if(range.a_begin > 0) a_prev = keys_shared[range.a_begin - 1]; - if(range.b_begin > b_start) b_prev = keys_shared[range.b_begin - 1]; - - search_result_t<type_t, vt> result = search_result_t<type_t, vt>(); - - iterate<vt>([&](int i) { - // This is almost the same body as serial_merge, except for the match - // criterion below. - bool p = merge_predicate<bounds, range_check>(a_key, b_key, range, comp); - - if(p) { - bool match = (bounds_upper == bounds) ? 
- (!range_check || range.b_begin > b_start) && - !comp(b_prev, a_key) : - (!range_check || range.b_valid()) && - !comp(a_key, b_key); - - result.decisions |= 1<< i; - result.matches_a |= (int)match<< i; - a_prev = a_key; - - } else { - bool match = (bounds_upper == bounds) ? - (!range_check || (range.a_valid() && range.b_valid())) && - !comp(b_key, a_key) : - (!range_check || (range.b_valid() && range.a_begin > a_start)) && - !comp(a_prev, b_key); - - result.matches_b |= (int)match<< i; - b_prev = b_key; - } - - // Same advancement behavior as serial_merge. - int index = p ? range.a_begin : range.b_begin; - - result.keys[i] = p ? a_key : b_key; - result.indices[i] = index + (p ? a_offset : b_offset); - - type_t c_key = keys_shared[++index]; - if(p) a_key = c_key, range.a_begin = index; - else b_key = c_key, range.b_begin = index; - }); - - if(sync) __syncthreads(); - - return result; -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/cta_segscan.hxx b/src/util/cuda/moderngpu/cta_segscan.hxx deleted file mode 100644 index e8738c5c..00000000 --- a/src/util/cuda/moderngpu/cta_segscan.hxx +++ /dev/null @@ -1,119 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "cta_scan.hxx" - -BEGIN_MGPU_NAMESPACE - -template<typename type_t> -struct segscan_result_t { - type_t scan; - type_t reduction; - bool has_carry_in; - int left_lane; -}; - -template<int nt, typename type_t> -struct cta_segscan_t { - enum { num_warps = nt / warp_size }; - - union storage_t { - int delta[num_warps + nt]; - struct { type_t values[2 * nt]; int packed[nt]; }; - }; - - MGPU_DEVICE int find_left_lane(int tid, bool has_head_flag, - storage_t& storage) const { - - int warp = tid / warp_size; - int lane = (warp_size - 1) & tid; - int warp_mask = 0xffffffff>> (31 - lane); // inclusive search. - int cta_mask = 0x7fffffff>> (31 - lane); // exclusive search. - - #ifdef __HIP__ - // Build a head flag bitfield and store it into shared memory. - long int warp_bits = ballot(has_head_flag); - storage.delta[warp] = (int)warp_bits; - #else - // Build a head flag bitfield and store it into shared memory. - int warp_bits = ballot(has_head_flag); - storage.delta[warp] = warp_bits; - #endif - - - __syncthreads(); - - if(tid < num_warps) { - #ifdef __HIP__ - int cta_bits = (int)ballot(0 != storage.delta[tid]); - #else - unsigned mask = __activemask(); - int cta_bits = ballot(0 != storage.delta[tid], mask); - #endif - int warp_segment = 31 - clz(cta_mask & cta_bits); - int start = (-1 != warp_segment) ? - (31 - clz(storage.delta[warp_segment]) + 32 * warp_segment) : 0; - storage.delta[num_warps + tid] = start; - - } - __syncthreads(); - - // Find the closest flag to the left of this thread within the warp. - // Include the flag for this thread. - int start = 31 - clz(warp_mask & warp_bits); - if(-1 != start) start += ~31 & tid; - else start = storage.delta[num_warps + warp]; - __syncthreads(); - - return start; - } - - template<typename op_t = plus_t<type_t> > - MGPU_DEVICE segscan_result_t<type_t> segscan(int tid, bool has_head_flag, - bool has_carry_out, type_t x, storage_t& storage, type_t init = type_t(), - op_t op = op_t()) const { - - if(!has_carry_out) x = init; - - int left_lane = find_left_lane(tid, has_head_flag, storage); - int tid_delta = tid - left_lane; - - // Store the has_carry_out flag. - storage.packed[tid] = (int)has_carry_out | (left_lane<< 1); - - // Run an inclusive scan. 
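Within a single warp, find_left_lane above boils down to a ballot of the head flags plus a leading-zero count over a lane-inclusive mask. A standalone sketch of just that warp-level step in plain CUDA (illustrative only; the deleted code additionally goes through shared memory to search across warps):

__device__ int nearest_head_lane(bool has_head_flag) {
  int lane = threadIdx.x & 31;
  unsigned bits = __ballot_sync(0xffffffffu, has_head_flag);
  unsigned at_or_before = bits & (0xffffffffu >> (31 - lane));  // flags in lanes 0..lane
  return at_or_before ? 31 - __clz(at_or_before) : -1;          // -1: no flag this low in the warp
}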
- int first = 0; - storage.values[first + tid] = x; - __syncthreads(); - - int packed = storage.packed[left_lane]; - left_lane = packed>> 1; - tid_delta = tid - left_lane; - if(0 == (1 & packed)) --tid_delta; - - iterate<s_log2(nt)>([&](int pass) { - int offset = 1<< pass; - if(tid_delta >= offset) - x = op(x, storage.values[first + tid - offset]); - first = nt - first; - storage.values[first + tid] = x; - __syncthreads(); - }); - - // Get the exclusive scan by fetching the preceding element. Also return - // the carry-out value as the total. - bool has_carry_in = tid ? (0 != (1 & storage.packed[tid - 1])) : false; - - segscan_result_t<type_t> result { - (has_carry_in && tid) ? storage.values[first + tid - 1] : init, - storage.values[first + nt - 1], - has_carry_in, - left_lane - }; - __syncthreads(); - - return result; - } -}; - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/cta_segsort.hxx b/src/util/cuda/moderngpu/cta_segsort.hxx deleted file mode 100644 index 3e75791b..00000000 --- a/src/util/cuda/moderngpu/cta_segsort.hxx +++ /dev/null @@ -1,226 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "cta_mergesort.hxx" - -BEGIN_MGPU_NAMESPACE - -template<typename keys_it, typename comp_t> -MGPU_HOST_DEVICE int segmented_merge_path(keys_it keys, merge_range_t range, - range_t active, int diag, comp_t comp) { - - // Consider a rectangle defined by range. - // Now consider a sub-rectangle at the top-right corner defined by - // active. We want to run the merge path only within this corner part. - - // If the cross-diagonal does not intersect our corner, return immediately. - if(range.a_begin + diag <= active.begin) return diag; - if(range.a_begin + diag >= active.end) return range.a_count(); - - // Call merge_path on the corner domain. - active.begin = max(active.begin, range.a_begin); - active.end = min(active.end, range.b_end); - - merge_range_t active_range = { - active.begin, range.a_end, - range.b_begin, active.end - }; - - int active_offset = active.begin - range.a_begin; - int p = merge_path<bounds_lower>(keys, active_range, - diag - active_offset, comp); - - return p + active_offset; -} - -template<int vt, typename type_t, typename comp_t> -MGPU_DEVICE merge_pair_t<type_t, vt> segmented_serial_merge( - const type_t* keys_shared, merge_range_t range, range_t active, - comp_t comp, bool sync = true) { - - range.b_end = min(active.end, range.b_end); - - type_t a_key = keys_shared[range.a_begin]; - type_t b_key = keys_shared[range.b_begin]; - - merge_pair_t<type_t, vt> merge_pair; - iterate<vt>([&](int i) { - bool p; - if(range.a_begin >= range.a_end) - // If A has run out of inputs, emit B. - p = false; - else if(range.b_begin >= range.b_end || range.a_begin < active.begin) - // B has hit the end of the middle segment. - // Emit A if A has inputs remaining in the middle segment. - p = true; - else - // Emit the smaller element in the middle segment. - p = !comp(b_key, a_key); - - int index = p ? range.a_begin : range.b_begin; - merge_pair.keys[i] = p ? a_key : b_key; - merge_pair.indices[i] = index; - - type_t c_key = keys_shared[++index]; - if(p) a_key = c_key, range.a_begin = index; - else b_key = c_key, range.b_begin = index; - }); - - if(sync) __syncthreads(); - return merge_pair; -} - -template<int nt, int vt> -struct cta_load_head_flags { - enum { - nv = nt * vt, - - // Store each flag in a byte; there are 4 bytes in a word, and threads - // cooperatively reset these. 
- words_per_thread = div_up(vt, 32 / 8) - }; - - union storage_t { - char flags[nv]; - int words[nt * words_per_thread]; - }; - - template<typename seg_it> - MGPU_DEVICE int load(seg_it segments, const int* partitions_global, - int tid, int cta, int count, storage_t& storage) { - - int mp0 = partitions_global[0]; - int mp1 = partitions_global[1]; - int gid = nv * cta; - count -= gid; - - // Set the head flags for out-of-range keys. - int head_flags = out_of_range_flags(vt * tid, vt, count); - - if(mp1 > mp0) { - // Clear the flag bytes, then loop through the indices and poke in - // flag bytes. - iterate<words_per_thread>([&](int i) { - storage.words[nt * i + tid] = 0; - }); - __syncthreads(); - - for(int index = mp0 + tid; index < mp1; index += nt) - storage.flags[segments[index] - gid] = 1; - __syncthreads(); - - // Combine all the head flags for this thread. - int first = vt * tid; - int offset = first / 4; - int prev = storage.words[offset]; - int mask = 0x3210 + 0x1111 * (3 & first); - iterate<words_per_thread>([&](int i) { - int next = storage.words[offset + 1 + i]; - int x = prmt(prev, next, mask); - prev = next; - - // Set the head flag bits. - if(0x00000001 & x) head_flags |= 1<< (4 * i + 0); - if(0x00000100 & x) head_flags |= 1<< (4 * i + 1); - if(0x00010000 & x) head_flags |= 1<< (4 * i + 2); - if(0x01000000 & x) head_flags |= 1<< (4 * i + 3); - }); - head_flags &= (1<< vt) - 1; - __syncthreads(); - } - - return head_flags; - } -}; - -template<int nt, int vt, typename key_t, typename val_t> -struct cta_segsort_t { - enum { - nv = nt * vt, - has_values = !std::is_same<val_t, empty_t>::value, - num_passes = s_log2(nt) - }; - - struct storage_t { - union { - key_t keys[nt * vt]; - val_t vals[nt * vt]; - }; - int ranges[nt]; - }; - - static_assert(is_pow2(nt), "cta_segsort_t requires pow2 number of threads"); - - template<typename comp_t> - MGPU_DEVICE kv_array_t<key_t, val_t, vt> - merge_pass(kv_array_t<key_t, val_t, vt> x, int tid, int count, - int pass, range_t& active, comp_t comp, storage_t& storage) const { - - int coop = 2<< pass; - merge_range_t range = compute_mergesort_range(count, tid, coop, vt); - - int list = tid>> pass; - - int list_parity = 1 & list; - int diag = vt * tid - range.a_begin; - - // Fetch the active range for the list this thread's list is merging with. - int sibling_range = storage.ranges[1 ^ list]; - range_t sibling { 0x0000ffff & sibling_range, sibling_range>> 16 }; - - // This pass does a segmented merge on ranges list and 1 ^ list. - // ~1 & list is the left list and 1 | list is the right list. - // We find the inner segments for merging, then update the active - // range to the outer segments for the next pass. - range_t left = list_parity ? sibling : active; - range_t right = list_parity ? active : sibling; - range_t inner = { left.end, right.begin }; - active.begin = min(left.begin, right.begin); - active.end = max(left.end, right.end); - - // Store the data from thread order into shared memory. - reg_to_shared_thread<nt, vt>(x.keys, tid, storage.keys); - - int mp = segmented_merge_path(storage.keys, range, inner, diag, comp); - - // Run a segmented serial merge. - merge_pair_t<key_t, vt> merge = segmented_serial_merge<vt>(storage.keys, - range.partition(mp, diag), inner, comp); - - // Pack and store the outer range to shared memory. - storage.ranges[list>> 1] = (int)bfi(active.end, active.begin, 16, 16); - if(!has_values) __syncthreads(); - - x.keys = merge.keys; - if(has_values) { - // Reorder values through shared memory. 
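A side note on the ranges encoding used in this struct: each per-list active range is packed with begin in the low 16 bits and end in the high 16 bits (the bfi(end, begin, 16, 16) call). Spelled out without the PTX helper, and assuming both fields fit in 16 bits, the packing is simply:

__host__ __device__ inline int pack_range16(int begin, int end) {
  return (0xffff & begin) | (end << 16);
}
__host__ __device__ inline int range16_begin(int packed) { return 0xffff & packed; }
__host__ __device__ inline int range16_end(int packed)   { return packed >> 16; }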
- reg_to_shared_thread<nt, vt>(x.vals, tid, storage.vals); - x.vals = shared_gather<nt, vt>(storage.vals, merge.indices); - } - - return x; - } - - template<typename comp_t> - MGPU_DEVICE kv_array_t<key_t, val_t, vt> - block_sort(kv_array_t<key_t, val_t, vt> x, int tid, int count, - int head_flags, range_t& active, comp_t comp, storage_t& storage) const { - - // Sort the inputs within each thread. - x = odd_even_sort(x, comp, head_flags); - - // Record the first and last occurrences of head flags in this segment. - active.begin = head_flags ? (vt * tid - 1 + ffs(head_flags)) : nv; - active.end = head_flags ? (vt * tid + 31 - clz(head_flags)) : -1; - storage.ranges[tid] = bfi(active.end, active.begin, 16, 16); - __syncthreads(); - - // Merge threads starting with a pair until all values are merged. - for(int pass = 0; pass < num_passes; ++pass) - x = merge_pass(x, tid, count, pass, active, comp, storage); - - return x; - } -}; - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/intrinsics.hxx b/src/util/cuda/moderngpu/intrinsics.hxx deleted file mode 100644 index 39af8b6e..00000000 --- a/src/util/cuda/moderngpu/intrinsics.hxx +++ /dev/null @@ -1,363 +0,0 @@ -#pragma once - -#include "operators.hxx" - -#if !defined(__CUDACC__) && !defined(__HIP__) -#error "You must compile this file with nvcc. You must." -#endif - -BEGIN_MGPU_NAMESPACE - -#ifndef MEMBERMASK - #define MEMBERMASK 0xffffffff -#endif - -#if (__CUDACC_VER_MAJOR__ >= 9 && defined(__CUDA_ARCH__) && \ - __CUDA_ARCH__ >= 300) && !defined(USE_SHFL_SYNC) - #define USE_SHFL_SYNC -#endif - -//////////////////////////////////////////////////////////////////////////////// -// ballot, brev, popc, clz, bfe, bfi, prmt - -// ballot - -MGPU_HOST_DEVICE unsigned ballot(int predicate, unsigned mask=MEMBERMASK) { - unsigned y = 0; -#ifdef USE_SHFL_SYNC - y = __ballot_sync(mask, predicate); -#else -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - y = __ballot(predicate); -#endif -#endif - return y; -} - -// Reverse the bits in an integer. -MGPU_HOST_DEVICE unsigned brev(unsigned x) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200 - unsigned y = __brev(x); -#else - unsigned y = 0; - for(int i = 0; i < 32; ++i) - y |= (1 & (x>> i))<< (31 - i); -#endif - return y; -} - -// Count number of bits in a register. -MGPU_HOST_DEVICE int popc(unsigned x) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200 - return __popc(x); -#else - int c; - for(c = 0; x; ++c) - x &= x - 1; - return c; -#endif -} - -// Count leading zeros - start from most significant bit. -MGPU_HOST_DEVICE int clz(int x) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200 - return __clz(x); -#else - for(int i = 31; i >= 0; --i) - if((1<< i) & x) return 31 - i; - return 32; -#endif -} - -// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0. 
-MGPU_HOST_DEVICE int ffs(int x) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200 - return __ffs(x); -#else - for(int i = 0; i < 32; ++i) - if((1<< i) & x) return i + 1; - return 0; -#endif -} - -MGPU_HOST_DEVICE unsigned bfe(unsigned x, unsigned bit, unsigned num_bits) { - unsigned result; -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200 - asm("bfe.u32 %0, %1, %2, %3;" : - "=r"(result) : "r"(x), "r"(bit), "r"(num_bits)); -#else - result = ((1<< num_bits) - 1) & (x>> bit); -#endif - return result; -} - -MGPU_HOST_DEVICE unsigned bfi(unsigned x, unsigned y, unsigned bit, - unsigned num_bits) { - unsigned result; -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200 - asm("bfi.b32 %0, %1, %2, %3, %4;" : - "=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(num_bits)); -#else - if(bit + num_bits > 32) num_bits = 32 - bit; - unsigned mask = ((1<< num_bits) - 1)<< bit; - result = y & ~mask; - result |= mask & (x<< bit); -#endif - return result; -} - -MGPU_HOST_DEVICE unsigned prmt(unsigned a, unsigned b, unsigned index) { - unsigned result; -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200 - asm("prmt.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(a), "r"(b), "r"(index)); -#else - result = 0; - for(int i = 0; i < 4; ++i) { - unsigned sel = 0xf & (index>> (4 * i)); - unsigned x = ((7 & sel) > 3) ? b : a; - x = 0xff & (x>> (8 * (3 & sel))); - if(8 & sel) x = (128 & x) ? 0xff : 0; - result |= x<< (8 * i); - } -#endif - return result; -} - -// Find log2(x) and optionally round up to the next integer logarithm. -MGPU_HOST_DEVICE int find_log2(int x, bool round_up = false) { - int a = 31 - clz(x); - if(round_up) a += !is_pow2(x); - return a; -} - -//////////////////////////////////////////////////////////////////////////////// -// Divide operators. - -MGPU_HOST_DEVICE int mulhi(int a, int b) { -#ifdef __CUDA_ARCH__ - return __mulhi(a, b); -#else - union { - int64_t x; - struct { int low, high; }; - } product; - product.x = (int64_t)a * b; - return product.high; -#endif -} - -MGPU_HOST_DEVICE unsigned umulhi(unsigned a, unsigned b) { -#ifdef __CUDA_ARCH__ - return __mulhi(a, b); -#else - union { - uint64_t x; - struct { unsigned low, high; }; - } product; - product.x = (uint64_t)a * b; - return product.high; -#endif -} - -//////////////////////////////////////////////////////////////////////////////// -// Wrappers around PTX shfl_up and shfl_down. 
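The shfl_up_op / shfl_down_op wrappers that follow fold an operator into the value pulled from a neighboring lane; for reductions this becomes the usual halving-offset pattern. A minimal sum-only sketch in plain CUDA (illustrative, not part of the deleted file):

__device__ int warp_reduce_sum(int x) {
  // Halving-offset reduction across one 32-lane warp; lane 0 ends up with the total.
  for (int offset = 16; offset > 0; offset >>= 1)
    x += __shfl_down_sync(0xffffffffu, x, offset);
  return x;
}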
- -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - -template<typename type_t> -MGPU_DEVICE type_t shfl_up(type_t x, int offset, int width = warp_size) { - enum { num_words = div_up(sizeof(type_t), sizeof(int)) }; - union { - int x[num_words]; - type_t t; - } u; - u.t = x; - - iterate<num_words>([&](int i) { - #ifdef USE_SHFL_SYNC - if (i < width) { - unsigned mask = __activemask(); - u.x[i] = __shfl_up_sync(mask, u.x[i], offset); - } - #else - u.x[i] = __shfl_up(u.x[i], offset, width); - #endif - }); - return u.t; -} - -template<typename type_t> -MGPU_DEVICE type_t shfl_down(type_t x, int offset, int width = warp_size) { - enum { num_words = div_up(sizeof(type_t), sizeof(int)) }; - union { - int x[num_words]; - type_t t; - } u; - u.t = x; - - iterate<num_words>([&](int i) { - #ifdef USE_SHFL_SYNC - if (i < width) { - unsigned mask = __activemask(); - u.x[i] = __shfl_down_sync(mask, u.x[i], offset); - } - #else - u.x[i] = __shfl_down(u.x[i], offset, width); - #endif - }); - return u.t; -} - -template<typename type_t, typename op_t> -MGPU_DEVICE type_t shfl_up_op(type_t x, int offset, op_t op, - int width = warp_size) { - - type_t y = shfl_up(x, offset, width); - int lane = (width - 1) & threadIdx.x; - if(lane >= offset) x = op(x, y); - return x; -} - -template<typename type_t, typename op_t> -MGPU_DEVICE type_t shfl_down_op(type_t x, int offset, op_t op, - int width = warp_size) { - - type_t y = shfl_down(x, offset, width); - int lane = (width - 1) & threadIdx.x; - if(lane < width - offset) x = op(x, y); - return x; -} - -#ifdef USE_SHFL_SYNC -#define SHFL_OP_MACRO(dir, is_up, ptx_type, r, c_type, ptx_op, c_op) \ -MGPU_DEVICE inline c_type shfl_##dir##_op(c_type x, int offset, \ - c_op<c_type> op, int width = warp_size) { \ - c_type result = x; \ - int mask = (warp_size - width)<< 8 | (is_up ? 0 : (width - 1)); \ - int lane = threadIdx.x & (warp_size - 1); \ - if (lane < width) { \ - unsigned threadmask = __activemask(); \ - asm( \ - "{.reg ."#ptx_type" r0;" \ - ".reg .pred p;" \ - "shfl.sync."#dir".b32 r0|p, %1, %2, %3, %4;" \ - "@p "#ptx_op"."#ptx_type" r0, r0, %5;" \ - "mov."#ptx_type" %0, r0; }" \ - : "="#r(result) : #r(x), "r"(offset), "r"(mask), "r"(threadmask), #r(x)); \ - } \ - return result; \ -} -#else -#define SHFL_OP_MACRO(dir, is_up, ptx_type, r, c_type, ptx_op, c_op) \ -MGPU_DEVICE inline c_type shfl_##dir##_op(c_type x, int offset, \ - c_op<c_type> op, int width = warp_size) { \ - c_type result = c_type(); \ - int mask = (warp_size - width)<< 8 | (is_up ? 
0 : (width - 1)); \ - asm( \ - "{.reg ."#ptx_type" r0;" \ - ".reg .pred p;" \ - "shfl."#dir".b32 r0|p, %1, %2, %3;" \ - "@p "#ptx_op"."#ptx_type" r0, r0, %4;" \ - "mov."#ptx_type" %0, r0; }" \ - : "="#r(result) : #r(x), "r"(offset), "r"(mask), #r(x)); \ - return result; \ -} -#endif - -SHFL_OP_MACRO(up, true, s32, r, int, add, plus_t) -SHFL_OP_MACRO(up, true, s32, r, int, max, maximum_t) -SHFL_OP_MACRO(up, true, s32, r, int, min, minimum_t) -SHFL_OP_MACRO(down, false, s32, r, int, add, plus_t) -SHFL_OP_MACRO(down, false, s32, r, int, max, maximum_t) -SHFL_OP_MACRO(down, false, s32, r, int, min, minimum_t) - -SHFL_OP_MACRO(up, true, u32, r, unsigned, add, plus_t) -SHFL_OP_MACRO(up, true, u32, r, unsigned, max, maximum_t) -SHFL_OP_MACRO(up, true, u32, r, unsigned, min, minimum_t) -SHFL_OP_MACRO(down, false, u32, r, unsigned, add, plus_t) -SHFL_OP_MACRO(down, false, u32, r, unsigned, max, maximum_t) -SHFL_OP_MACRO(down, false, u32, r, unsigned, min, minimum_t) - -SHFL_OP_MACRO(up, true, f32, f, float, add, plus_t) -SHFL_OP_MACRO(up, true, f32, f, float, max, maximum_t) -SHFL_OP_MACRO(up, true, f32, f, float, max, minimum_t) -SHFL_OP_MACRO(down, false, f32, f, float, add, plus_t) -SHFL_OP_MACRO(down, false, f32, f, float, max, maximum_t) -SHFL_OP_MACRO(down, false, f32, f, float, max, minimum_t) - -#undef SHFL_OP_MACRO - -#ifdef USE_SHFL_SYNC -#define SHFL_OP_64b_MACRO(dir, is_up, ptx_type, r, c_type, ptx_op, c_op) \ -MGPU_DEVICE inline c_type shfl_##dir##_op(c_type x, int offset, \ - c_op<c_type> op, int width = warp_size) { \ - c_type result = x; \ - int mask = (warp_size - width)<< 8 | (is_up ? 0 : (width - 1)); \ - int lane = threadIdx.x & (warp_size - 1); \ - if (lane < width) { \ - unsigned threadmask = __activemask(); \ - asm( \ - "{.reg ."#ptx_type" r0;" \ - ".reg .u32 lo;" \ - ".reg .u32 hi;" \ - ".reg .pred p;" \ - "mov.b64 {lo, hi}, %1;" \ - "shfl.sync."#dir".b32 lo|p, lo, %2, %3, %4;" \ - "shfl.sync."#dir".b32 hi , hi, %2, %3, %4;" \ - "mov.b64 r0, {lo, hi};" \ - "@p "#ptx_op"."#ptx_type" r0, r0, %5;" \ - "mov."#ptx_type" %0, r0; }" \ - : "="#r(result) : #r(x), "r"(offset), "r"(mask), "r"(threadmask), #r(x) \ - ); \ - } \ - return result; \ -} -#else -#define SHFL_OP_64b_MACRO(dir, is_up, ptx_type, r, c_type, ptx_op, c_op) \ -MGPU_DEVICE inline c_type shfl_##dir##_op(c_type x, int offset, \ - c_op<c_type> op, int width = warp_size) { \ - c_type result = c_type(); \ - int mask = (warp_size - width)<< 8 | (is_up ? 
0 : (width - 1)); \ - asm( \ - "{.reg ."#ptx_type" r0;" \ - ".reg .u32 lo;" \ - ".reg .u32 hi;" \ - ".reg .pred p;" \ - "mov.b64 {lo, hi}, %1;" \ - "shfl."#dir".b32 lo|p, lo, %2, %3;" \ - "shfl."#dir".b32 hi , hi, %2, %3;" \ - "mov.b64 r0, {lo, hi};" \ - "@p "#ptx_op"."#ptx_type" r0, r0, %4;" \ - "mov."#ptx_type" %0, r0; }" \ - : "="#r(result) : #r(x), "r"(offset), "r"(mask), #r(x) \ - ); \ - return result; \ -} -#endif - -SHFL_OP_64b_MACRO(up, true, s64, l, int64_t, add, plus_t) -SHFL_OP_64b_MACRO(up, true, s64, l, int64_t, max, maximum_t) -SHFL_OP_64b_MACRO(up, true, s64, l, int64_t, min, minimum_t) -SHFL_OP_64b_MACRO(down, false, s64, l, int64_t, add, plus_t) -SHFL_OP_64b_MACRO(down, false, s64, l, int64_t, max, maximum_t) -SHFL_OP_64b_MACRO(down, false, s64, l, int64_t, min, minimum_t) - -SHFL_OP_64b_MACRO(up, true, u64, l, uint64_t, add, plus_t) -SHFL_OP_64b_MACRO(up, true, u64, l, uint64_t, max, maximum_t) -SHFL_OP_64b_MACRO(up, true, u64, l, uint64_t, min, minimum_t) -SHFL_OP_64b_MACRO(down, false, u64, l, uint64_t, add, plus_t) -SHFL_OP_64b_MACRO(down, false, u64, l, uint64_t, max, maximum_t) -SHFL_OP_64b_MACRO(down, false, u64, l, uint64_t, min, minimum_t) - -SHFL_OP_64b_MACRO(up, true, f64, d, double, add, plus_t) -SHFL_OP_64b_MACRO(up, true, f64, d, double, max, maximum_t) -SHFL_OP_64b_MACRO(up, true, f64, d, double, min, minimum_t) -SHFL_OP_64b_MACRO(down, false, f64, d, double, add, plus_t) -SHFL_OP_64b_MACRO(down, false, f64, d, double, max, maximum_t) -SHFL_OP_64b_MACRO(down, false, f64, d, double, min, minimum_t) - -#undef SHFL_OP_64b_MACRO - -#endif - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/kernel_bulkinsert.hxx b/src/util/cuda/moderngpu/kernel_bulkinsert.hxx deleted file mode 100644 index c85e8221..00000000 --- a/src/util/cuda/moderngpu/kernel_bulkinsert.hxx +++ /dev/null @@ -1,18 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once -#include "kernel_merge.hxx" - -BEGIN_MGPU_NAMESPACE - -// Insert the values at a_keys before the values at b_keys identified by -// insert. -template<typename launch_t = empty_t, typename a_it, typename insert_it, - typename b_it, typename c_it> -void bulk_insert(a_it a, insert_it a_insert, int insert_size, b_it b, - int source_size, c_it c, context_t& context) { - - merge<launch_t>(a_insert, a, insert_size, counting_iterator_t<int>(0), b, - source_size, discard_iterator_t<int>(), c, mgpu::less_t<int>(), context); -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/kernel_bulkremove.hxx b/src/util/cuda/moderngpu/kernel_bulkremove.hxx deleted file mode 100644 index 497e9867..00000000 --- a/src/util/cuda/moderngpu/kernel_bulkremove.hxx +++ /dev/null @@ -1,91 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once -#include "search.hxx" - -BEGIN_MGPU_NAMESPACE - -template<typename launch_arg_t = empty_t, - typename input_it, typename indices_it, typename output_it> -void bulk_remove(input_it input, int count, indices_it indices, - int num_indices, output_it output, context_t& context) { - - typedef typename conditional_typedef_t<launch_arg_t, - launch_box_t< - arch_20_cta<128, 15>, - arch_35_cta<128, 11>, - arch_52_cta<128, 15> - > - >::type_t launch_t; - - typedef typename std::iterator_traits<input_it>::value_type type_t; - - // Map the removal indices into tiles. 
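bulk_remove streams input to output, dropping the positions listed in the sorted indices array. A host-side reference of that contract, for orientation only (assumes sorted, unique indices):

#include <vector>

std::vector<int> bulk_remove_reference(const std::vector<int>& input,
                                       const std::vector<int>& indices) {
  std::vector<int> out;
  size_t k = 0;
  for (int i = 0; i < (int)input.size(); ++i) {
    if (k < indices.size() && indices[k] == i) { ++k; continue; }  // position i is removed
    out.push_back(input[i]);
  }
  return out;
}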
- mem_t<int> partitions = binary_search_partitions<bounds_lower>(indices, - count, num_indices, launch_t::nv(context), context); - const int* p_data = partitions.data(); - - auto k = [=]MGPU_DEVICE(int tid, int cta) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; - - __shared__ union { - int indices[nv + 1]; - } shared; - - range_t tile = get_tile(cta, nv, count); - - // Search the begin and end iterators to load. - int begin = p_data[cta]; - int end = p_data[cta + 1]; - int b_count = end - begin; - - int* a_shared = shared.indices; - int* b_shared = shared.indices + tile.count() - b_count; - - // Store the indices to shared memory. - // TODO: MODIFY MEM_TO_SHARED TO UNCONDITIONALLY WRITE TO FULL SMEM. - mem_to_shared<nt, vt>(indices + begin, tid, b_count, b_shared, false); - - // Binary search into the remove array to prepare a range for the thread. - merge_range_t range = { - // a range - vt * tid, - tile.count(), - - // b range - binary_search<bounds_lower>(b_shared, b_count, - tile.begin + vt * tid, less_t<int>()), - b_count - }; - - // Emit all values that aren't removed. - iterate<vt>([&](int i) { - bool p = range.a_valid() && (!range.b_valid() || - tile.begin + range.a_begin < b_shared[range.b_begin]); - if(p) - a_shared[range.a_begin - range.b_begin] = tile.begin + range.a_begin; - else - ++range.b_begin; - ++range.a_begin; - }); - __syncthreads(); - - // Pull the gather indices out of shared memory in strided order. - array_t<int, vt> gather = shared_to_reg_strided<nt, vt>( - shared.indices, tid); - - // Gather the elements from input. - int num_move = tile.count() - b_count; - array_t<type_t, vt> values; - strided_iterate<nt, vt, 0>([&](int i, int j) { - values[i] = input[gather[i]]; - }, tid, num_move); - - // Stream to output. - reg_to_mem_strided<nt, vt>(values, tid, num_move, - output + tile.begin - begin); - }; - cta_transform<launch_t>(k, count, context); -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/kernel_compact.hxx b/src/util/cuda/moderngpu/kernel_compact.hxx deleted file mode 100644 index 774b69a2..00000000 --- a/src/util/cuda/moderngpu/kernel_compact.hxx +++ /dev/null @@ -1,139 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "kernel_scan.hxx" - -BEGIN_MGPU_NAMESPACE - -template<typename launch_arg_t> -struct stream_compact_t { - - typedef typename conditional_typedef_t<launch_arg_t, - launch_box_t< - arch_20_cta<128, 11>, - arch_35_cta<128, 7>, - arch_52_cta<128, 11> - > - >::type_t launch_t; - - cta_dim_t cta_dim; - int num_ctas; - int count; - context_t& context; - - mem_t<short> bits; - mem_t<int> cta_offsets; - -public: - stream_compact_t(int count_, context_t& context_) : context(context_) { - count = count_; - cta_dim = launch_t::cta_dim(context); - num_ctas = cta_dim.num_ctas(count); - - bits = mem_t<short>(num_ctas * cta_dim.nt, context); - cta_offsets = mem_t<int>(num_ctas, context); - } - - // upsweep of stream compaction. - // func_t implements bool operator(int index); - // The return value is flag for indicating that we want to *keep* the data - // in the compacted stream. 
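stream_compact_t splits compaction into an upsweep that counts survivors per tile, a scan of the per-CTA counts, and a downsweep that scatters to the compacted offsets. With CUB, which this patch migrates to, the common case is a single device-wide call; a hedged sketch of roughly equivalent usage (function and parameter names are illustrative, not the wrapper the patch actually installs):

#include <cub/cub.cuh>

// Keep the elements satisfying pred, writing them compacted to d_out;
// d_num_selected_out receives the survivor count. Standard CUB two-phase
// pattern: query the temp-storage size with a null pointer, then run.
template<typename T, typename Pred>
cudaError_t compact_with_cub(const T* d_in, T* d_out, int* d_num_selected_out,
                             int num_items, Pred pred, cudaStream_t stream = 0) {
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceSelect::If(d_temp, temp_bytes, d_in, d_out, d_num_selected_out,
                        num_items, pred, stream);
  cudaError_t err = cudaMalloc(&d_temp, temp_bytes);
  if (err != cudaSuccess) return err;
  err = cub::DeviceSelect::If(d_temp, temp_bytes, d_in, d_out, d_num_selected_out,
                              num_items, pred, stream);
  cudaFree(d_temp);
  return err;
}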
- template<typename func_t> - int upsweep(func_t f) { - short* bits_data = bits.data(); - int* cta_offsets_data = cta_offsets.data(); - int count = this->count; - - auto upsweep_k = [=]MGPU_DEVICE(int tid, int cta) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; - typedef cta_reduce_t<nt, int> reduce_t; - static_assert(vt <= 16, "mgpu::stream_compact_vt must be <= 16."); - - __shared__ union { - typename reduce_t::storage_t reduce; - } shared; - - range_t tile = get_tile(cta, nv, count); - short stream_bits = 0; - strided_iterate<nt, vt>([&](int i, int j) { - int index = tile.begin + j; - bool stream = f(index); - if(stream) stream_bits |= 1<< i; - }, tid, tile.count()); - - // Reduce the values and store to global memory. - int total_stream = reduce_t().reduce(tid, popc(stream_bits), - shared.reduce, nt, plus_t<int>(), false); - - bits_data[nt * cta + tid] = stream_bits; - if(!tid) cta_offsets_data[cta] = total_stream; - }; - cta_launch<launch_t>(upsweep_k, num_ctas, context); - - // Scan reductions. - mem_t<int> counts_host(1, context, memory_space_host); - scan_event(cta_offsets_data, num_ctas, cta_offsets_data, - plus_t<int>(), counts_host.data(), context, context.event()); - cudaEventSynchronize(context.event()); - - // Return the total number of elements to stream. - int stream_total = counts_host.data()[0]; - return stream_total; - } - - // downsweep of stream compaction. - // func_t implements void operator(int dest_index, int source_index). - template<typename func_t> - void downsweep(func_t f) { - const short* bits_data = bits.data(); - const int* cta_offsets_data = cta_offsets.data(); - - auto downsweep_k = [=]MGPU_DEVICE(int tid, int cta) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; - typedef cta_scan_t<nt, int> scan_t; - __shared__ union { - typename scan_t::storage_t scan; - short indices[nv]; - } shared; - - short stream_bits = bits_data[nt * cta + tid]; - int cta_offset = cta_offsets_data[cta]; - - // For each set stream_bits bit, set shared.indices to 1. - iterate<vt>([&](int i) { - shared.indices[nt * i + tid] = 0 != ((1<< i) & stream_bits); - }); - __syncthreads(); - - // Load out the values and scan. Compact into shared.indices the - // CTA-local indices of each streaming work-item. - array_t<short, vt> flags = shared_to_reg_thread<nt, vt>( - shared.indices, tid); - scan_result_t<int> scan = scan_t().scan(tid, reduce(flags), - shared.scan); - iterate<vt>([&](int i) { - if(flags[i]) shared.indices[scan.scan++] = (short)(vt * tid + i); - }); - __syncthreads(); - - // Call the user-supplied callback with destination and source indices. 
- for(int i = tid; i < scan.reduction; i += nt) { - int source_index = nv * cta + shared.indices[i]; - int dest_index = cta_offset + i; - f(dest_index, source_index); - } - __syncthreads(); - }; - cta_launch<launch_t>(downsweep_k, num_ctas, context); - } -}; - -template<typename launch_arg_t = empty_t> -stream_compact_t<launch_arg_t> -transform_compact(int count, context_t& context) { - return stream_compact_t<launch_arg_t>(count, context); -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/kernel_intervalmove.hxx b/src/util/cuda/moderngpu/kernel_intervalmove.hxx deleted file mode 100644 index 8a4cad20..00000000 --- a/src/util/cuda/moderngpu/kernel_intervalmove.hxx +++ /dev/null @@ -1,67 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once -#include "kernel_load_balance.hxx" - -BEGIN_MGPU_NAMESPACE - -template<typename launch_arg_t = empty_t, typename input_it, - typename segments_it, typename output_it> -void interval_expand(input_it input, int count, segments_it segments, - int num_segments, output_it output, context_t& context) { - - typedef typename std::iterator_traits<input_it>::value_type type_t; - transform_lbs<launch_arg_t>( - []MGPU_DEVICE(int index, int seg, int rank, tuple<type_t> desc, - output_it output) { - output[index] = get<0>(desc); - }, - count, segments, num_segments, make_tuple(input), context, output - ); -} - -template<typename launch_arg_t = empty_t, typename input_it, - typename segments_it, typename gather_it, typename output_it> -void interval_gather(input_it input, int count, segments_it segments, - int num_segments, gather_it gather, output_it output, context_t& context) { - - transform_lbs<launch_arg_t>( - []MGPU_DEVICE(int index, int seg, int rank, tuple<int> desc, - input_it input, output_it output) { - output[index] = input[get<0>(desc) + rank]; - }, - count, segments, num_segments, make_tuple(gather), context, input, output - ); -} - -template<typename launch_arg_t = empty_t, typename input_it, - typename segments_it, typename scatter_it, typename output_it> -void interval_scatter(input_it input, int count, segments_it segments, - int num_segments, scatter_it scatter, output_it output, context_t& context) { - - transform_lbs<launch_arg_t>( - []MGPU_DEVICE(int index, int seg, int rank, tuple<int> desc, - input_it input, output_it output) { - output[get<0>(desc) + rank] = input[index]; - }, - count, segments, num_segments, make_tuple(scatter), context, input, output - ); -} - -template<typename launch_arg_t = empty_t, - typename input_it, typename segments_it, typename scatter_it, - typename gather_it, typename output_it> -void interval_move(input_it input, int count, segments_it segments, - int num_segments, scatter_it scatter, gather_it gather, output_it output, - context_t& context) { - - transform_lbs<launch_arg_t>( - []MGPU_DEVICE(int index, int seg, int rank, tuple<int, int> desc, - input_it input, output_it output) { - output[get<0>(desc) + rank] = input[get<1>(desc) + rank]; - }, - count, segments, num_segments, make_tuple(scatter, gather), context, - input, output - ); -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/kernel_join.hxx b/src/util/cuda/moderngpu/kernel_join.hxx deleted file mode 100644 index 48ca685d..00000000 --- a/src/util/cuda/moderngpu/kernel_join.hxx +++ /dev/null @@ -1,50 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once -#include "kernel_sortedsearch.hxx" -#include "kernel_scan.hxx" -#include "kernel_load_balance.hxx" - 
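For orientation, the interval_* kernels above are thin wrappers over the load-balancing search; interval_expand, for instance, replicates input value i across segment i. A host-side reference of that semantics (illustrative names, not part of the deleted file):

// input    = {7, 8, 9}
// segments = {0, 2, 5}, count = 7
// output   = {7, 7, 8, 8, 8, 9, 9}
void interval_expand_reference(const int* input, int count, const int* segments,
                               int num_segments, int* output) {
  for (int seg = 0; seg < num_segments; ++seg) {
    int begin = segments[seg];
    int end = (seg + 1 < num_segments) ? segments[seg + 1] : count;
    for (int i = begin; i < end; ++i) output[i] = input[seg];
  }
}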
-BEGIN_MGPU_NAMESPACE - -template<typename launch_arg_t = empty_t, - typename a_it, typename b_it, typename comp_t> -mem_t<int2> inner_join(a_it a, int a_count, b_it b, int b_count, - comp_t comp, context_t& context) { - - // Compute lower and upper bounds of a into b. - mem_t<int> lower(a_count, context); - mem_t<int> upper(a_count, context); - sorted_search<bounds_lower, launch_arg_t>(a, a_count, b, b_count, - lower.data(), comp, context); - sorted_search<bounds_upper, launch_arg_t>(a, a_count, b, b_count, - upper.data(), comp, context); - - // Compute output ranges by scanning upper - lower. Retrieve the reduction - // of the scan, which specifies the size of the output array to allocate. - mem_t<int> scanned_sizes(a_count, context); - const int* lower_data = lower.data(); - const int* upper_data = upper.data(); - - mem_t<int> count(1, context); - transform_scan<int>([=]MGPU_DEVICE(int index) { - return upper_data[index] - lower_data[index]; - }, a_count, scanned_sizes.data(), plus_t<int>(), count.data(), context); - - // Allocate an int2 output array and use load-balancing search to compute - // the join. - int join_count = from_mem(count)[0]; - mem_t<int2> output(join_count, context); - int2* output_data = output.data(); - - // Use load-balancing search on the segmens. The output is a pair with - // a_index = seg and b_index = lower_data[seg] + rank. - auto k = [=]MGPU_DEVICE(int index, int seg, int rank, tuple<int> lower) { - output_data[index] = make_int2(seg, get<0>(lower) + rank); - }; - transform_lbs<launch_arg_t>(k, join_count, scanned_sizes.data(), a_count, - make_tuple(lower_data), context); - - return output; -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/kernel_load_balance.hxx b/src/util/cuda/moderngpu/kernel_load_balance.hxx deleted file mode 100644 index 7a0e8459..00000000 --- a/src/util/cuda/moderngpu/kernel_load_balance.hxx +++ /dev/null @@ -1,88 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once -#include "cta_load_balance.hxx" -#include "search.hxx" - -BEGIN_MGPU_NAMESPACE - -template<typename launch_arg_t = empty_t, typename func_t, - typename segments_it, typename pointers_t, typename... args_t> -void transform_lbs(func_t f, int count, segments_it segments, - int num_segments, pointers_t caching_iterators, context_t& context, - args_t... args) { - - typedef typename conditional_typedef_t<launch_arg_t, - launch_box_t< - arch_20_cta<128, 11, 9>, - arch_35_cta<128, 7, 5>, - arch_52_cta<128, 11, 9> - > - >::type_t launch_t; - - typedef typename std::iterator_traits<segments_it>::value_type int_t; - typedef tuple_iterator_value_t<pointers_t> value_t; - - mem_t<int_t> mp = load_balance_partitions(count, segments, num_segments, - launch_t::nv(context), context); - const int_t* mp_data = mp.data(); - - auto k = [=]MGPU_DEVICE(int tid, int cta, args_t... args) { - - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, vt0 = params_t::vt0 }; - typedef cta_load_balance_t<nt, vt> load_balance_t; - typedef detail::cached_segment_load_t<nt, pointers_t> cached_load_t; - - __shared__ union { - typename load_balance_t::storage_t lbs; - typename cached_load_t::storage_t cached; - } shared; - - // Compute the load-balancing search and materialize (index, seg, rank) - // arrays. - auto lbs = load_balance_t().load_balance(count, segments, num_segments, - tid, cta, mp_data, shared.lbs); - - // Load from the cached iterators. 
Use the placement range, not the - // merge-path range for situating the segments. - array_t<value_t, vt> cached_values = cached_load_t::template load<vt0>( - tid, lbs.merge_range.a_count(), lbs.placement.range.b_range(), - lbs.segments, shared.cached, caching_iterators); - - // Call the user-supplied functor f. - strided_iterate<nt, vt, vt0>([=](int i, int j) { - int index = lbs.merge_range.a_begin + j; - int seg = lbs.segments[i]; - int rank = lbs.ranks[i]; - - f(index, seg, rank, cached_values[i], args...); - }, tid, lbs.merge_range.a_count()); - }; - cta_transform<launch_t>(k, count + num_segments, context, args...); -} - -// load-balancing search without caching. -template<typename launch_arg_t = empty_t, typename func_t, - typename segments_it, typename... args_t> -void transform_lbs(func_t f, int count, segments_it segments, - int num_segments, context_t& context, args_t... args) { - - transform_lbs<launch_arg_t>( - [=]MGPU_DEVICE(int index, int seg, int rank, tuple<>, args_t... args) { - f(index, seg, rank, args...); // drop the cached values. - }, - count, segments, num_segments, tuple<>(), context, args... - ); -} - -template<typename launch_arg_t = empty_t, typename segments_it, - typename output_it> -void load_balance_search(int count, segments_it segments, - int num_segments, output_it output, context_t& context) { - - transform_lbs<launch_arg_t>([=]MGPU_DEVICE(int index, int seg, int rank) { - output[index] = seg; - }, count, segments, num_segments, context); -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/kernel_merge.hxx b/src/util/cuda/moderngpu/kernel_merge.hxx deleted file mode 100644 index 7e5cac27..00000000 --- a/src/util/cuda/moderngpu/kernel_merge.hxx +++ /dev/null @@ -1,92 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once -#include "cta_merge.hxx" -#include "search.hxx" - -BEGIN_MGPU_NAMESPACE - -// Key-value merge. -template<typename launch_arg_t = empty_t, - typename a_keys_it, typename a_vals_it, - typename b_keys_it, typename b_vals_it, - typename c_keys_it, typename c_vals_it, - typename comp_t> -void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count, - b_keys_it b_keys, b_vals_it b_vals, int b_count, - c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context) { - - typedef typename conditional_typedef_t<launch_arg_t, - launch_box_t< - arch_20_cta<128, 15>, - arch_35_cta<128, 11>, - arch_52_cta<128, 15> - > - >::type_t launch_t; - - typedef typename std::iterator_traits<a_keys_it>::value_type type_t; - typedef typename std::iterator_traits<a_vals_it>::value_type val_t; - enum { has_values = !std::is_same<val_t, empty_t>::value }; - - mem_t<int> partitions = merge_path_partitions<bounds_lower>(a_keys, a_count, - b_keys, b_count, launch_t::nv(context), comp, context); - int* mp_data = partitions.data(); - - auto k = [=] MGPU_DEVICE (int tid, int cta) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; - - __shared__ union { - type_t keys[nv + 1]; - int indices[nv]; - } shared; - - // Load the range for this CTA and merge the values into register. 
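load_balance_search above assigns to every work-item the segment that owns it, given sorted segment begin offsets (empty segments produce no items). A host-side reference of the same mapping, for orientation only:

// segments = {0, 3, 3, 7}, count = 9  ->  output = {0, 0, 0, 2, 2, 2, 2, 3, 3}
void load_balance_search_reference(int count, const int* segments,
                                   int num_segments, int* output) {
  int seg = 0;
  for (int index = 0; index < count; ++index) {
    while (seg + 1 < num_segments && segments[seg + 1] <= index) ++seg;
    output[index] = seg;
  }
}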
- int mp0 = mp_data[cta + 0]; - int mp1 = mp_data[cta + 1]; - merge_range_t range = compute_merge_range(a_count, b_count, cta, nv, - mp0, mp1); - -// Any attempt to debug the problem on clang failed (if you remove this will crash on clang) -#ifdef __clang__ - - if (range.b_end > b_count) - { - return; - } - -#endif - - merge_pair_t<type_t, vt> merge = cta_merge_from_mem<bounds_lower, nt, vt>( - a_keys, b_keys, range, tid, comp, shared.keys); - - int dest_offset = nv * cta; - reg_to_mem_thread<nt>(merge.keys, tid, range.total(), c_keys + dest_offset, - shared.keys); - - if(has_values) { - // Transpose the indices from thread order to strided order. - array_t<int, vt> indices = reg_thread_to_strided<nt>(merge.indices, tid, - shared.indices); - - // Gather the input values and merge into the output values. - transfer_two_streams_strided<nt>(a_vals + range.a_begin, range.a_count(), - b_vals + range.b_begin, range.b_count(), indices, tid, - c_vals + dest_offset); - } - }; - cta_transform<launch_t>(k, a_count + b_count, context); -} - -// Key-only merge. -template<typename launch_t = empty_t, - typename a_keys_it, typename b_keys_it, typename c_keys_it, - typename comp_t> -void merge(a_keys_it a_keys, int a_count, b_keys_it b_keys, int b_count, - c_keys_it c_keys, comp_t comp, context_t& context) { - - merge<launch_t>(a_keys, (const empty_t*)nullptr, a_count, b_keys, - (const empty_t*)nullptr, b_count, c_keys, (empty_t*)nullptr, comp, - context); -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/kernel_mergesort.hxx b/src/util/cuda/moderngpu/kernel_mergesort.hxx deleted file mode 100644 index 6d3f9459..00000000 --- a/src/util/cuda/moderngpu/kernel_mergesort.hxx +++ /dev/null @@ -1,150 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "transform.hxx" -#include "kernel_merge.hxx" -#include "cta_mergesort.hxx" -#include "intrinsics.hxx" - -BEGIN_MGPU_NAMESPACE - -template<typename keys_it, typename comp_t> -mem_t<int> merge_sort_partitions(keys_it keys, int count, int coop, - int spacing, comp_t comp, context_t& context) { - - int num_partitions = div_up(count, spacing) + 1; - auto k = [=]MGPU_DEVICE(int index) { - merge_range_t range = compute_mergesort_range(count, index, coop, spacing); - int diag = min(spacing * index, count) - range.a_begin; - return merge_path<bounds_lower>(keys + range.a_begin, range.a_count(), - keys + range.b_begin, range.b_count(), diag, comp); - }; - - return fill_function<int>(k, num_partitions, context); -} - -// Key-value mergesort. -template<typename launch_arg_t = empty_t, typename key_t, typename val_t, - typename comp_t> -void mergesort(key_t* keys_input, val_t* vals_input, int count, - comp_t comp, context_t& context) { - - enum { has_values = !std::is_same<val_t, empty_t>::value }; - - typedef typename conditional_typedef_t<launch_arg_t, - launch_box_t< - arch_20_cta<128, 17>, - arch_35_cta<128, 11>, - arch_52_cta<128, 15> - > - >::type_t launch_t; - - int nv = launch_t::nv(context); - int num_ctas = div_up(count, nv); - int num_passes = find_log2(num_ctas, true); - - mem_t<key_t> keys_temp(num_passes ? count : 0, context); - key_t* keys_output = keys_temp.data(); - - mem_t<val_t> vals_temp(has_values && num_passes ? count : 0, context); - val_t* vals_output = vals_temp.data(); - - key_t* keys_blocksort = (1 & num_passes) ? keys_output : keys_input; - val_t* vals_blocksort = (1 & num_passes) ? 
vals_output : vals_input; - - auto k = [=] MGPU_DEVICE(int tid, int cta) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; - typedef cta_sort_t<nt, vt, key_t, val_t> sort_t; - - __shared__ union { - typename sort_t::storage_t sort; - key_t keys[nv]; - val_t vals[nv]; - } shared; - - range_t tile = get_tile(cta, nv, count); - - // Load the keys and values. - kv_array_t<key_t, val_t, vt> unsorted; - unsorted.keys = mem_to_reg_thread<nt, vt>(keys_input + tile.begin, tid, - tile.count(), shared.keys); - if(has_values) - unsorted.vals = mem_to_reg_thread<nt, vt>(vals_input + tile.begin, tid, - tile.count(), shared.vals); - - // Blocksort. - kv_array_t<key_t, val_t, vt> sorted = sort_t().block_sort(unsorted, - tid, tile.count(), comp, shared.sort); - - // Store the keys and values. - reg_to_mem_thread<nt, vt>(sorted.keys, tid, tile.count(), - keys_blocksort + tile.begin, shared.keys); - if(has_values) - reg_to_mem_thread<nt, vt>(sorted.vals, tid, tile.count(), - vals_blocksort + tile.begin, shared.vals); - }; - - cta_transform<launch_t>(k, count, context); - - if(1 & num_passes) { - std::swap(keys_input, keys_output); - std::swap(vals_input, vals_output); - } - - for(int pass = 0; pass < num_passes; ++pass) { - int coop = 2<< pass; - mem_t<int> partitions = merge_sort_partitions(keys_input, count, coop, - nv, comp, context); - int* mp_data = partitions.data(); - - auto k = [=] MGPU_DEVICE(int tid, int cta) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; - - __shared__ union { - key_t keys[nv + 1]; - int indices[nv]; - } shared; - - range_t tile = get_tile(cta, nv, count); - - // Load the range for this CTA and merge the values into register. - merge_range_t range = compute_mergesort_range(count, cta, coop, nv, - mp_data[cta + 0], mp_data[cta + 1]); - - merge_pair_t<key_t, vt> merge = cta_merge_from_mem<bounds_lower, nt, vt>( - keys_input, keys_input, range, tid, comp, shared.keys); - - // Store merged values back out. - reg_to_mem_thread<nt>(merge.keys, tid, tile.count(), - keys_output + tile.begin, shared.keys); - - if(has_values) { - // Transpose the indices from thread order to strided order. - array_t<int, vt> indices = reg_thread_to_strided<nt>(merge.indices, - tid, shared.indices); - - // Gather the input values and merge into the output values. 
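mergesort above block-sorts each tile and then runs find_log2(num_ctas, true) pairwise merge passes. For the common case of numeric keys, the CUB side this patch moves to can use a single radix-sort call instead of a comparison sort; a hedged sketch (names illustrative, same size-query-then-run pattern as the compaction sketch earlier, not the wrapper the patch actually installs):

#include <cub/cub.cuh>

cudaError_t sort_pairs_with_cub(const unsigned* d_keys_in, unsigned* d_keys_out,
                                const int* d_vals_in, int* d_vals_out,
                                int num_items) {
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, d_keys_in, d_keys_out,
                                  d_vals_in, d_vals_out, num_items);
  cudaError_t err = cudaMalloc(&d_temp, temp_bytes);
  if (err != cudaSuccess) return err;
  err = cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, d_keys_in, d_keys_out,
                                        d_vals_in, d_vals_out, num_items);
  cudaFree(d_temp);
  return err;
}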
- transfer_two_streams_strided<nt>(vals_input + range.a_begin, - range.a_count(), vals_input + range.b_begin, range.b_count(), - indices, tid, vals_output + tile.begin); - } - }; - cta_transform<launch_t>(k, count, context); - - std::swap(keys_input, keys_output); - std::swap(vals_input, vals_output); - } -} - -// Key-only mergesort -template<typename launch_arg_t = empty_t, typename key_t, typename comp_t> -void mergesort(key_t* keys_input, int count, comp_t comp, - context_t& context) { - - mergesort<launch_arg_t>(keys_input, (empty_t*)nullptr, count, comp, - context); -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/kernel_reduce.hxx b/src/util/cuda/moderngpu/kernel_reduce.hxx deleted file mode 100644 index 28112218..00000000 --- a/src/util/cuda/moderngpu/kernel_reduce.hxx +++ /dev/null @@ -1,70 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "cta_reduce.hxx" -#include "memory.hxx" -#include "transform.hxx" -#include "operators.hxx" - -BEGIN_MGPU_NAMESPACE - -template<typename launch_arg_t = empty_t, typename input_it, - typename output_it, typename op_t> -void reduce(input_it input, int count, output_it reduction, op_t op, - context_t& context) { - - typedef typename conditional_typedef_t<launch_arg_t, - launch_params_t<128, 8> - >::type_t launch_t; - - typedef typename std::iterator_traits<input_it>::value_type type_t; - - int num_ctas = launch_t::cta_dim(context).num_ctas(count); - mem_t<type_t> partials(num_ctas, context); - type_t* partials_data = partials.data(); - - auto k = [=] MGPU_DEVICE(int tid, int cta) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; - typedef cta_reduce_t<nt, type_t> reduce_t; - __shared__ typename reduce_t::storage_t shared_reduce; - - // Load the data for the first tile for each cta. - range_t tile = get_tile(cta, nv, count); - array_t<type_t, vt> x = mem_to_reg_strided<nt, vt>(input + tile.begin, - tid, tile.count()); - - // Reduce the multiple values per thread into a scalar. - type_t scalar; - strided_iterate<nt, vt>([&](int i, int j) { - scalar = i ? op(scalar, x[i]) : x[0]; - }, tid, tile.count()); - - // Reduce to a scalar per CTA. - scalar = reduce_t().reduce(tid, scalar, shared_reduce, - min(tile.count(), (int)nt), op, false); - - if(!tid) { - if(1 == num_ctas) *reduction = scalar; - else partials_data[cta] = scalar; - } - }; - cta_launch<launch_t>(k, num_ctas, context); - - // Recursively call reduce until there's just one scalar. 
- if(num_ctas > 1) - reduce<launch_params_t<512, 4> >(partials_data, num_ctas, reduction, op, - context); -} - -template<typename launch_arg_t = empty_t, typename func_t, - typename output_it, typename op_t> -void transform_reduce(func_t f, int count, output_it reduction, op_t op, - context_t& context) { - - typedef typename std::iterator_traits<output_it>::value_type type_t; - reduce<launch_arg_t>(make_load_iterator<type_t>(f), count, reduction, op, - context); -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/kernel_scan.hxx b/src/util/cuda/moderngpu/kernel_scan.hxx deleted file mode 100644 index b5f30859..00000000 --- a/src/util/cuda/moderngpu/kernel_scan.hxx +++ /dev/null @@ -1,198 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "cta_reduce.hxx" -#include "cta_scan.hxx" -#include "memory.hxx" -#include "operators.hxx" - -BEGIN_MGPU_NAMESPACE - -template<scan_type_t scan_type = scan_type_exc, - typename launch_arg_t = empty_t, typename input_it, - typename output_it, typename op_t, typename reduction_it> -void scan_event(input_it input, int count, output_it output, op_t op, - reduction_it reduction, context_t& context, cudaEvent_t event) { - - typedef typename conditional_typedef_t<launch_arg_t, - launch_box_t< - arch_20_cta<128, 11>, - arch_35_cta<128, 7>, - arch_52_cta<128, 11> - > - >::type_t launch_t; - - typedef typename std::iterator_traits<input_it>::value_type type_t; - - int num_ctas = launch_t::cta_dim(context).num_ctas(count); - - if(num_ctas > 8) { - mem_t<type_t> partials(num_ctas, context); - type_t* partials_data = partials.data(); - - //////////////////////////////////////////////////////////////////////////// - // Upsweep phase. Reduce each tile to a scalar and store to partials. - - auto upsweep_k = [=] MGPU_DEVICE(int tid, int cta) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; - typedef cta_reduce_t<nt, type_t> reduce_t; - - __shared__ union { - typename reduce_t::storage_t reduce; - } shared; - - // Load the tile's data into register. - range_t tile = get_tile(cta, nv, count); - array_t<type_t, vt> x = mem_to_reg_strided<nt, vt>(input + tile.begin, - tid, tile.count()); - - // Reduce the thread's values into a scalar. - type_t scalar = type_t(); - strided_iterate<nt, vt>([&](int i, int j) { - scalar = i ? op(scalar, x[i]) : x[0]; - }, tid, tile.count()); - - // Reduce across all threads. - type_t all_reduce = reduce_t().reduce(tid, scalar, shared.reduce, - tile.count(), op); - - // Store the final reduction to the partials. - if(!tid) - partials_data[cta] = all_reduce; - }; - cta_transform<launch_t>(upsweep_k, count, context); - - //////////////////////////////////////////////////////////////////////////// - // Spine phase. Recursively call scan on the CTA partials. - - scan_event<scan_type_exc>(partials_data, num_ctas, partials_data, - op, reduction, context, event); - - // Record the event. This lets the caller wait on just the reduction - // part of the operation. It's useful when writing the reduction to - // host-side paged-locked memory; the caller can read out the value more - // quickly to allocate memory and launch the next kernel. - if(event) - cudaEventRecord(event, context.stream()); - - //////////////////////////////////////////////////////////////////////////// - // Downsweep phase. Perform an intra-tile scan and add the scan of the - // partials as carry-in. 
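scan_event above is the hand-written device-wide scan: an upsweep reduces each tile to a partial, a spine pass scans the partials, and a downsweep re-scans each tile with its partial as carry-in. The CUB primitive this patch adopts hides that structure behind one call; a hedged sketch of roughly equivalent usage (not the actual wrapper):

#include <cub/cub.cuh>

cudaError_t exclusive_sum_with_cub(const int* d_in, int* d_out, int num_items,
                                   cudaStream_t stream = 0) {
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, num_items, stream);
  cudaError_t err = cudaMalloc(&d_temp, temp_bytes);
  if (err != cudaSuccess) return err;
  err = cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, num_items, stream);
  cudaFree(d_temp);
  return err;
}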
- - auto downsweep_k = [=] MGPU_DEVICE(int tid, int cta) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; - typedef cta_scan_t<nt, type_t> scan_t; - - __shared__ union { - typename scan_t::storage_t scan; - type_t values[nv]; - } shared; - - // Load a tile to register in thread order. - range_t tile = get_tile(cta, nv, count); - array_t<type_t, vt> x = mem_to_reg_thread<nt, vt>(input + tile.begin, - tid, tile.count(), shared.values); - - // Scan the array with carry-in from the partials. - array_t<type_t, vt> y = scan_t().scan(tid, x, shared.scan, - partials_data[cta], cta > 0, tile.count(), op, type_t(), - scan_type).scan; - - // Store the scanned values to the output. - reg_to_mem_thread<nt, vt>(y, tid, tile.count(), output + tile.begin, - shared.values); - }; - cta_transform<launch_t>(downsweep_k, count, context); - - } else { - - //////////////////////////////////////////////////////////////////////////// - // Small input specialization. This is the non-recursive branch. - - typedef launch_params_t<512, 3> spine_params_t; - auto spine_k = [=] MGPU_DEVICE(int tid, int cta) { - - enum { nt = spine_params_t::nt, vt = spine_params_t::vt, nv = nt * vt }; - typedef cta_scan_t<nt, type_t> scan_t; - - __shared__ union { - typename scan_t::storage_t scan; - type_t values[nv]; - } shared; - - type_t carry_in = type_t(); - for(int cur = 0; cur < count; cur += nv) { - // Cooperatively load values into register. - int count2 = min<int>(count - cur, nv); - array_t<type_t, vt> x = mem_to_reg_thread<nt, vt>(input + cur, - tid, count2, shared.values); - - scan_result_t<type_t, vt> result = scan_t().scan(tid, x, shared.scan, - carry_in, cur > 0, count2, op, type_t(), scan_type); - - // Store the scanned values back to global memory. - reg_to_mem_thread<nt, vt>(result.scan, tid, count2, - output + cur, shared.values); - - // Roll the reduction into carry_in. - carry_in = result.reduction; - } - - // Store the carry-out to the reduction pointer. This may be a - // discard_iterator_t if no reduction is wanted. - if(!tid) - *reduction = carry_in; - }; - cta_launch<spine_params_t>(spine_k, 1, context); - - // Record the event. This lets the caller wait on just the reduction - // part of the operation. It's useful when writing the reduction to - // host-side paged-locked memory; the caller can read out the value more - // quickly to allocate memory and launch the next kernel. 
- if(event) - cudaEventRecord(event, context.stream()); - } -} - -template<scan_type_t scan_type = scan_type_exc, - typename launch_arg_t = empty_t, typename input_it, - typename output_it, typename op_t, typename reduction_it> -void scan(input_it input, int count, output_it output, op_t op, - reduction_it reduction, context_t& context) { - return scan_event<scan_type, launch_arg_t>(input, count, output, op, - reduction, context, 0); -} - -template<scan_type_t scan_type = scan_type_exc, - typename launch_arg_t = empty_t, - typename input_it, typename output_it> -void scan(input_it input, int count, output_it output, context_t& context) { - - typedef typename std::iterator_traits<input_it>::value_type type_t; - scan<scan_type, launch_arg_t>(input, count, output, plus_t<type_t>(), - discard_iterator_t<type_t>(), context); -} - -template<typename type_t, scan_type_t scan_type = scan_type_exc, - typename launch_arg_t = empty_t, typename func_t, typename output_it, - typename op_t, typename reduction_it> -void transform_scan_event(func_t f, int count, output_it output, op_t op, - reduction_it reduction, context_t& context, cudaEvent_t event) { - - scan_event<scan_type, launch_arg_t>(make_load_iterator<type_t>(f), - count, output, op, reduction, context, event); -} - -template<typename type_t, scan_type_t scan_type = scan_type_exc, - typename launch_arg_t = empty_t, typename func_t, typename output_it, - typename op_t, typename reduction_it> -void transform_scan(func_t f, int count, output_it output, op_t op, - reduction_it reduction, context_t& context) { - - transform_scan_event<type_t, scan_type, launch_arg_t>(f, count, output, op, - reduction, context, 0); -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/kernel_segreduce.hxx b/src/util/cuda/moderngpu/kernel_segreduce.hxx deleted file mode 100644 index 185c34ef..00000000 --- a/src/util/cuda/moderngpu/kernel_segreduce.hxx +++ /dev/null @@ -1,406 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once -#include "search.hxx" -#include "cta_load_balance.hxx" -#include "cta_segscan.hxx" -#include "transform.hxx" -#include "memory.hxx" - -BEGIN_MGPU_NAMESPACE - -namespace detail { - -//////////////////////////////////////////////////////////////////////////////// -// cta_segreduce_t is common intra-warp segmented reduction code for -// these kernels. Should clean up and move to cta_segreduce.hxx. - -template<int nt, int vt, typename type_t> -struct cta_segreduce_t { - typedef cta_segscan_t<nt, type_t> segscan_t; - - union storage_t { - typename segscan_t::storage_t segscan; - type_t values[nt * vt + 1]; - }; - - // Values must be stored in storage.values on entry. - template<typename op_t, typename output_it> - MGPU_DEVICE void segreduce(merge_range_t merge_range, - lbs_placement_t placement, array_t<bool, vt + 1> p, int tid, - int cta, type_t init, op_t op, output_it output, - type_t* carry_out_values, int* carry_out_codes, storage_t& storage) { - - int cur_item = placement.a_index; - int begin_segment = placement.b_index; - int cur_segment = begin_segment; - bool carry_in = false; - - const type_t* a_shared = storage.values - merge_range.a_begin; - type_t x[vt]; - int segments[vt + 1]; - iterate<vt>([&](int i) { - if(p[i]) { - // This is a data node, so accumulate and advance the data ID. - x[i] = a_shared[cur_item++]; - if(carry_in) x[i] = op(x[i - 1], x[i]); - carry_in = true; - } else { - // This is a segment node, so advance the segment ID. 
- x[i] = init; - ++cur_segment; - carry_in = false; - } - segments[i] = cur_segment; - }); - // Always flush at the end of the last thread. - bool overwrite = (nt - 1 == tid) && (!p[vt - 1] && p[vt]); - if(nt - 1 == tid) p[vt] = false; - if(!p[vt]) ++cur_segment; - segments[vt] = cur_segment; - overwrite = __syncthreads_or(overwrite); - - // Get the segment ID for the next item. This lets us find an end flag - // for the last value in this thread. - bool has_head_flag = begin_segment < segments[vt - 1]; - bool has_carry_out = p[vt - 1]; - - // Compute the carry-in for each thread. - segscan_result_t<type_t> result = segscan_t().segscan(tid, has_head_flag, - has_carry_out, x[vt - 1], storage.segscan, init, op); - - // Add the carry-in back into each value and recompute the reductions. - type_t* x_shared = storage.values - placement.range.b_begin; - carry_in = result.has_carry_in && p[0]; - iterate<vt>([&](int i) { - if(segments[i] < segments[i + 1]) { - // We've hit the end of this segment. Store the reduction to shared - // memory. - if(carry_in) x[i] = op(result.scan, x[i]); - x_shared[segments[i]] = x[i]; - carry_in = false; - } - }); - __syncthreads(); - - // Store the reductions for segments which begin in this tile. - for(int i = merge_range.b_begin + tid; i < merge_range.b_end; i += nt) - output[i] = x_shared[i]; - - // Store the partial reduction for the segment which begins in the - // preceding tile, if there is one. - if(!tid) { - if(segments[0] == merge_range.b_begin) segments[0] = -1; - int code = (segments[0]<< 1) | (int)overwrite; - carry_out_values[cta] = (segments[0] != -1) ? - x_shared[segments[0]] : - init; - carry_out_codes[cta] = code; - } - } -}; - -//////////////////////////////////////////////////////////////////////////////// -// Adds the carry-out for each segreduce CTA into the outputs. - -template<typename output_it, typename type_t, typename op_t> -void segreduce_fixup(output_it output, const type_t* values, - const int* codes, int count, op_t op, type_t init, - context_t& context) { - - enum { nt = 512 }; - int num_ctas = div_up(count, nt); - - mem_t<type_t> carry_out(num_ctas, context); - mem_t<int> codes_out(num_ctas, context); - type_t* carry_out_data = carry_out.data(); - int* codes_data = codes_out.data(); - - auto k_fixup = [=]MGPU_DEVICE(int tid, int cta) { - typedef cta_segscan_t<nt, type_t> segscan_t; - __shared__ struct { - bool head_flags[nt]; - typename segscan_t::storage_t segscan; - } shared; - - range_t tile = get_tile(cta, nt, count); - int gid = tile.begin + tid; - - //////////////////////////////////////////////////////////////////////////// - // As in the outer segmented reduce kernel, update the reductions for all - // segments that *start* in this CTA. That is, the first carry-out code - // for a segment must be mapped into this CTA to actually apply the - // accumulate. This CTA will return a partial reduction for the segment - // that overlaps this CTA but starts in a preceding CTA. - - // We don't need to worry about storing new overwrite bits as this kernel - // will always add carry-in values to empty segments. - - int code0 = (gid - 1 >= 0 && gid - 1 < count) ? codes[gid - 1] : -1; - int code1 = (gid < count) ? codes[gid] : -1; - int code2 = (gid + 1 < count) ? codes[gid + 1] : -1; - type_t value = (gid < count) ? 
values[gid] : init; - - int seg0 = code0>> 1; - int seg1 = code1>> 1; - int seg2 = code2>> 1; - bool has_head_flag = seg0 != seg1 || -1 == seg1; - bool has_carry_out = -1 != seg1 && seg1 == seg2; - bool has_end_flag = seg1 != seg2; - - // Put the head flag in shared memory, because the last thread - // participating in a reduction in the CTA needs to check the head flag - // for the first thread in the reduction. - shared.head_flags[tid] = has_head_flag; - - segscan_result_t<type_t> result = segscan_t().segscan(tid, has_head_flag, - has_carry_out, value, shared.segscan, init, op); - - bool carry_out_written = false; - if(-1 != seg1 && (has_end_flag || nt - 1 == tid)) { - // This is a valid reduction. - if(result.has_carry_in) - value = op(value, result.scan); - - if(0 == result.left_lane && !shared.head_flags[result.left_lane]) { - carry_out_data[cta] = value; - codes_data[cta] = seg1<< 1; - carry_out_written = true; - } else { - int left_code = codes[tile.begin + result.left_lane - 1]; - if(0 == (1 & left_code)) // Add in the value already stored. - value = op(value, output[seg1]); - output[seg1] = value; - } - } - - carry_out_written = __syncthreads_or(carry_out_written); - if(!carry_out_written && !tid) - codes_data[cta] = -1<< 1; - }; - cta_launch<nt>(k_fixup, num_ctas, context); - - if(num_ctas > 1) - segreduce_fixup(output, carry_out_data, codes_data, - num_ctas, op, init, context); -} - -} // namespace detail - -//////////////////////////////////////////////////////////////////////////////// -// Segmented reduction with loading from an input iterator. This does not -// require explicit materialization of the load-balancing search. - -template<typename launch_arg_t = empty_t, typename input_it, - typename segments_it, typename output_it, typename op_t, typename type_t> -void segreduce(input_it input, int count, segments_it segments, - int num_segments, output_it output, op_t op, type_t init, - context_t& context) { - - typedef typename conditional_typedef_t<launch_arg_t, - launch_box_t< - arch_20_cta<128, 11, 8>, - arch_35_cta<128, 7, 5>, - arch_52_cta<128, 11, 8> - > - >::type_t launch_t; - - cta_dim_t cta_dim = launch_t::cta_dim(context); - int num_ctas = cta_dim.num_ctas(count + num_segments); - - mem_t<type_t> carry_out(num_ctas, context); - mem_t<int> codes(num_ctas, context); - type_t* carry_out_data = carry_out.data(); - int* codes_data = codes.data(); - - mem_t<int> mp = load_balance_partitions(count, segments, num_segments, - cta_dim.nv(), context); - const int* mp_data = mp.data(); - - auto k_reduce = [=]MGPU_DEVICE(int tid, int cta) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, vt0 = params_t::vt0 }; - typedef detail::cta_segreduce_t<nt, vt, type_t> segreduce_t; - - __shared__ union { - typename segreduce_t::storage_t segreduce; - type_t values[nt * vt + 1]; - type_t indices[nt * vt + 2]; - } shared; - - merge_range_t merge_range = compute_merge_range(count, num_segments, - cta, nt * vt, mp_data[cta], mp_data[cta + 1]); - - // Cooperatively load values from input into shared. - mem_to_shared<nt, vt, vt0>(input + merge_range.a_begin, tid, - merge_range.a_count(), shared.segreduce.values); - - // Load segment data into the B region of shared. Search for the starting - // index of each thread for a merge. - int* b_shared = sizeof(type_t) > sizeof(int) ? 
- (int*)(shared.segreduce.values + merge_range.a_count()) : - ((int*)shared.segreduce.values + merge_range.a_count()); - lbs_placement_t placement = cta_load_balance_place<nt, vt>(tid, - merge_range, count, segments, num_segments, b_shared); - - // Adjust the pointer so that dereferencing at the segment ID returns the - // offset of that segment. - b_shared -= placement.range.b_begin; - int cur_item = placement.a_index; - int cur_segment = placement.b_index; - array_t<bool, vt + 1> merge_bits; - iterate<vt + 1>([&](int i) { - bool p = cur_item < b_shared[cur_segment + 1]; - if(p) ++cur_item; - else ++cur_segment; - merge_bits[i] = p; - }); - - // Compute the segmented reduction. - segreduce_t().segreduce(merge_range, placement, merge_bits, tid, cta, - init, op, output, carry_out_data, codes_data, shared.segreduce); - - }; - cta_launch<launch_t>(k_reduce, num_ctas, context); - - if(num_ctas > 1) - detail::segreduce_fixup(output, carry_out_data, codes_data, num_ctas, - op, init, context); -} - -//////////////////////////////////////////////////////////////////////////////// - -template<typename launch_arg_t = empty_t, typename func_t, - typename segments_it, typename output_it, typename op_t, typename type_t> -void transform_segreduce(func_t f, int count, segments_it segments, - int num_segments, output_it output, op_t op, type_t init, - context_t& context) { - - segreduce<launch_arg_t>(make_load_iterator<type_t>(f), count, segments, - num_segments, output, op, init, context); -} - -//////////////////////////////////////////////////////////////////////////////// -// spmv - sparse matrix * vector. - -template<typename launch_arg_t = empty_t, typename matrix_it, - typename columns_it, typename vector_it, typename segments_it, - typename output_it> -void spmv(matrix_it matrix, columns_it columns, vector_it vector, - int count, segments_it segments, int num_segments, output_it output, - context_t& context) { - - typedef typename std::iterator_traits<matrix_it>::value_type type_t; - - transform_segreduce<launch_arg_t>([=]MGPU_DEVICE(int index) { - return matrix[index] * ldg(vector + columns[index]); // sparse m * v. - }, count, segments, num_segments, output, plus_t<type_t>(), - (type_t)0, context); -} - -//////////////////////////////////////////////////////////////////////////////// -// lbs_segreduce - -template<typename launch_arg_t = empty_t, - typename func_t, typename segments_it, typename pointers_t, - typename output_it, typename op_t, typename type_t, typename... args_t> -void lbs_segreduce(func_t f, int count, segments_it segments, - int num_segments, pointers_t caching_iterators, output_it output, op_t op, - type_t init, context_t& context, args_t... args) { - - typedef typename conditional_typedef_t<launch_arg_t, - launch_box_t< - arch_20_cta<128, 11, 8>, - arch_35_cta<128, 7, 5>, - arch_52_cta<128, 11, 8> - > - >::type_t launch_t; - - typedef tuple_iterator_value_t<pointers_t> value_t; - - cta_dim_t cta_dim = launch_t::cta_dim(context); - int num_ctas = cta_dim.num_ctas(count + num_segments); - - mem_t<type_t> carry_out(num_ctas, context); - mem_t<int> codes(num_ctas, context); - type_t* carry_out_data = carry_out.data(); - int* codes_data = codes.data(); - - mem_t<int> mp = load_balance_partitions(count, segments, num_segments, - cta_dim.nv(), context); - const int* mp_data = mp.data(); - - auto k_reduce = [=]MGPU_DEVICE(int tid, int cta, args_t... 
args) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, vt0 = params_t::vt0 }; - typedef cta_load_balance_t<nt, vt> load_balance_t; - typedef detail::cached_segment_load_t<nt, pointers_t> cached_load_t; - typedef detail::cta_segreduce_t<nt, vt, type_t> segreduce_t; - - __shared__ union { - typename load_balance_t::storage_t lbs; - typename cached_load_t::storage_t cached; - typename segreduce_t::storage_t segreduce; - type_t values[nt * vt + 1]; - } shared; - - // Compute the load-balancing search and materialize (index, seg, rank) - // arrays. - auto lbs = load_balance_t().load_balance(count, segments, num_segments, - tid, cta, mp_data, shared.lbs); - - // Load from the cached iterators. Use the placement range, not the - // merge-path range for situating the segments. - array_t<value_t, vt> cached_values = cached_load_t::template load<vt0>( - tid, lbs.merge_range.a_count(), lbs.placement.range.b_range(), - lbs.segments, shared.cached, caching_iterators); - - // Call the user-supplied functor f. - array_t<type_t, vt> strided_values; - strided_iterate<nt, vt, vt0>([&](int i, int j) { - int index = lbs.merge_range.a_begin + j; - int seg = lbs.segments[i]; - int rank = lbs.ranks[i]; - - strided_values[i] = f(index, seg, rank, cached_values[i], args...); - }, tid, lbs.merge_range.a_count()); - - // Store the values back to shared memory for segmented reduction. - reg_to_shared_strided<nt, vt>(strided_values, tid, - shared.segreduce.values); - - // Split the flags. - array_t<bool, vt + 1> merge_bits; - iterate<vt + 1>([&](int i) { - merge_bits[i] = 0 != ((1<< i) & lbs.merge_flags); - }); - - // Compute the segmented reduction. - segreduce_t().segreduce(lbs.merge_range, lbs.placement, merge_bits, - tid, cta, init, op, output, carry_out_data, codes_data, - shared.segreduce); - }; - cta_launch<launch_t>(k_reduce, num_ctas, context, args...); - - if(num_ctas > 1) - detail::segreduce_fixup(output, carry_out_data, codes_data, num_ctas, - op, init, context); -} - -// lbs_segreduce with no caching iterators. -template<typename launch_arg_t = empty_t, - typename func_t, typename segments_it, typename output_it, typename op_t, - typename type_t, typename... args_t> -void lbs_segreduce(func_t f, int count, segments_it segments, - int num_segments, output_it output, op_t op, type_t init, - context_t& context, args_t... args) { - - lbs_segreduce<launch_arg_t>( - [=]MGPU_DEVICE(int index, int seg, int rank, tuple<>, args_t... args) { - return f(index, seg, rank, args...); - }, - count, segments, num_segments, tuple<>(), output, op, init, context, - args... 
- ); -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/kernel_segsort.hxx b/src/util/cuda/moderngpu/kernel_segsort.hxx deleted file mode 100644 index 217a45ad..00000000 --- a/src/util/cuda/moderngpu/kernel_segsort.hxx +++ /dev/null @@ -1,444 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "search.hxx" -#include "cta_segsort.hxx" -#include "cta_scan.hxx" - -BEGIN_MGPU_NAMESPACE - -namespace detail { - -template<typename launch_arg_t, typename key_t, typename val_t, - typename comp_t> -struct segsort_t { - enum { has_values = !std::is_same<val_t, empty_t>::value }; - typedef typename conditional_typedef_t<launch_arg_t, - launch_box_t< - arch_20_cta<128, 15>, - arch_35_cta<128, 11>, - arch_52_cta<128, 15> - > - >::type_t launch_t; - - context_t& context; - comp_t comp; - cta_dim_t cta_dim; - int count, nv, num_ctas, num_passes; - - mem_t<key_t> keys_temp; - mem_t<val_t> vals_temp; - - key_t* keys_source, *keys_dest, *keys_blocksort; - val_t* vals_source, *vals_dest, *vals_blocksort; - - mem_t<range_t> merge_ranges; - mem_t<merge_range_t> merge_list; - mem_t<int> compressed_ranges, copy_list, copy_status; - mem_t<int2> op_counters; - - segsort_t(key_t* keys, val_t* vals, int count_, comp_t comp_, - context_t& context_) : count(count_), comp(comp_), context(context_) { - - nv = launch_t::nv(context); - num_ctas = div_up(count, nv); - num_passes = find_log2(num_ctas, true); - - int capacity = num_ctas; // log(num_ctas) per pass. - for(int i = 0; i < num_passes; ++i) - capacity += div_up(num_ctas, 1<< i); - - if(num_passes ) keys_temp = mem_t<key_t>(count, context); - if(num_passes && has_values) vals_temp = mem_t<val_t>(count, context); - - keys_source = keys; - vals_source = vals; - keys_dest = keys_temp.data(); - vals_dest = vals_temp.data(); - - // The blocksort passes outputs to these arrays. - keys_blocksort = (1 & num_passes) ? keys_dest : keys_source; - vals_blocksort = (1 & num_passes) ? vals_dest : vals_source; - - // Allocate space for temporary variables. - merge_ranges = mem_t<range_t>(capacity, context); - merge_list = mem_t<merge_range_t>(num_ctas, context); - compressed_ranges = mem_t<int>(num_ctas, context); - copy_list = mem_t<int>(num_ctas, context); - copy_status = mem_t<int>(num_ctas, context); - op_counters = fill<int2>(int2(), num_passes, context); - } - - template<bool sort_indices = false, typename keys_it, typename vals_it, - typename segments_it> - void blocksort_segments(keys_it keys, vals_it vals, segments_it segments, - int num_segments) { - - // Distribute the segment descriptors to different CTAs. - mem_t<int> partitions = binary_search_partitions<bounds_lower>(segments, - count, num_segments, nv, context); - const int* mp_data = partitions.data(); - - //////////////////////////////////////////////////////////////////////////// - // Block sort the input. The position of the first and last segment - // descriptors are stored to merge_ranges. 
- - comp_t comp = this->comp; - int count = this->count; - key_t* keys_blocksort = this->keys_blocksort; - val_t* vals_blocksort = this->vals_blocksort; - int* compressed_ranges_data = compressed_ranges.data(); - - auto blocksort_k = [=] MGPU_DEVICE(int tid, int cta) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; - typedef cta_load_head_flags<nt, vt> load_head_flags_t; - typedef cta_segsort_t<nt, vt, key_t, val_t> sort_t; - - __shared__ union { - typename load_head_flags_t::storage_t load_head_flags; - typename sort_t::storage_t sort; - key_t keys[nv + 1]; - val_t vals[nv]; - } shared; - - // Load the partitions for the segment descriptors and extract head - // flags for each key. - int p[2] = { mp_data[cta], mp_data[cta + 1] }; - int head_flags = load_head_flags_t().load(segments, p, tid, cta, - count, shared.load_head_flags); - - // Load the keys and values. - range_t tile = get_tile(cta, nv, count); - - kv_array_t<key_t, val_t, vt> unsorted; - unsorted.keys = mem_to_reg_thread<nt, vt>(keys + tile.begin, tid, - tile.count(), shared.keys); - if(sort_indices) { - // If we're sorting indices, load from the counting_iterator_t directly - // without staging through shared memory. - iterate<vt>([&](int i) { - unsorted.vals[i] = vals[tile.begin + vt * tid + i]; - }); - } else if(has_values) { - // If we're storing actual values, stage through shared memory. - unsorted.vals = mem_to_reg_thread<nt, vt>(vals + tile.begin, tid, - tile.count(), shared.vals); - } - - // Blocksort. - range_t active { }; - kv_array_t<key_t, val_t, vt> sorted = sort_t().block_sort(unsorted, - tid, tile.count(), head_flags, active, comp, shared.sort); - - // Store the keys and values. - reg_to_mem_thread<nt, vt>(sorted.keys, tid, tile.count(), - keys_blocksort + tile.begin, shared.keys); - if(has_values) - reg_to_mem_thread<nt, vt>(sorted.vals, tid, tile.count(), - vals_blocksort + tile.begin, shared.vals); - - // Store the active range for the entire CTA. These are used by the - // segmented partitioning kernels. - if(!tid) - compressed_ranges_data[cta] = bfi(active.end, active.begin, 16, 16); - }; - cta_transform<launch_t>(blocksort_k, count, context); - - if(1 & num_passes) { - std::swap(this->keys_source, this->keys_dest); - std::swap(this->vals_source, this->vals_dest); - } - } - - void merge_passes() { - - //////////////////////////////////////////////////////////////////////////// - // Execute a partitioning and a merge for each mergesort pass. - - comp_t comp = this->comp; - int num_ranges = num_ctas; - int num_partitions = num_ctas + 1; - int count = this->count; - int nv = this->nv; - - key_t* keys_source = this->keys_source; - val_t* vals_source = this->vals_source; - key_t* keys_dest = this->keys_dest; - val_t* vals_dest = this->vals_dest; - - range_t* source_ranges = merge_ranges.data(); - range_t* dest_ranges = merge_ranges.data(); - - const int* compressed_ranges_data = compressed_ranges.data(); - int* copy_status_data = copy_status.data(); - int* copy_list_data = copy_list.data(); - merge_range_t* merge_list_data = merge_list.data(); - int2* op_counters_data = op_counters.data(); - - for(int pass = 0; pass < num_passes; ++pass) { - int coop = 2<< pass; - - ////////////////////////////////////////////////////////////////////////// - // Partition the data within its segmented mergesort list. 
- - enum { nt = 64 }; - int num_partition_ctas = div_up(num_partitions, nt - 1); - - auto partition_k = [=] MGPU_DEVICE(int tid, int cta) { - typedef cta_scan_t<nt, int> scan_t; - __shared__ union { - typename scan_t::storage_t scan; - int partitions[nt + 1]; - struct { int merge_offset, copy_offset; }; - } shared; - - int partition = (nt - 1) * cta + tid; - int first = nv * partition; - int count2 = min(nv, count - first); - - int mp0 = 0; - bool active = (tid < nt - 1) && (partition < num_partitions - 1); - int range_index = partition>> pass; - - if(partition < num_partitions) { - - merge_range_t range = compute_mergesort_range(count, partition, - coop, nv); - int diag = min(nv * partition - range.a_begin, range.total()); - - int indices[2] = { - min(num_ranges - 1, ~1 & range_index), - min(num_ranges - 1, 1 | range_index) - }; - range_t ranges[2]; - - if(pass > 0) { - ranges[0] = source_ranges[indices[0]]; - ranges[1] = source_ranges[indices[1]]; - } else { - iterate<2>([&](int i) { - int compressed = compressed_ranges_data[indices[i]]; - int first = nv * indices[i]; - - ranges[i] = range_t { 0x0000ffff & compressed, compressed>> 16 }; - if(nv != ranges[i].begin) ranges[i].begin += first; - else ranges[i].begin = count; - if(-1 != ranges[i].end) ranges[i].end += first; - }); - } - - range_t inner = { - ranges[0].end, - max(range.b_begin, ranges[1].begin) - }; - range_t outer = { - min(ranges[0].begin, ranges[1].begin), - max(ranges[0].end, ranges[1].end) - }; - - // Segmented merge path on inner. - mp0 = segmented_merge_path(keys_source, range, inner, diag, comp); - - // Store outer merge range. - if(active && 0 == diag) - dest_ranges[range_index / 2] = outer; - } - shared.partitions[tid] = mp0; - __syncthreads(); - - int mp1 = shared.partitions[tid + 1]; - __syncthreads(); - - // Update the merge range to include partitioning. - merge_range_t range = compute_mergesort_range(count, partition, coop, - nv, mp0, mp1); - - // Merge if the source interval does not exactly cover the destination - // interval. Otherwise copy or skip. - range_t interval = (1 & range_index) ? - range.b_range() : range.a_range(); - bool merge_op = false; - bool copy_op = false; - - // Create a segsort job. - if(active) { - merge_op = (first != interval.begin) || (interval.count() != count2); - copy_op = !merge_op && (!pass || !copy_status_data[partition]); - - // Use the b_end component to store the index of the destination tile. - // The actual b_end can be inferred from a_count and the length of - // the input array. - range.b_end = partition; - } - - // Scan the counts of merges and copies. - scan_result_t<int> merge_scan = scan_t().scan(tid, (int)merge_op, - shared.scan); - scan_result_t<int> copy_scan = scan_t().scan(tid, (int)copy_op, - shared.scan); - - // Increment the operation counters by the totals. - if(!tid) { - shared.merge_offset = atomicAdd(&op_counters_data[pass].x, - merge_scan.reduction); - shared.copy_offset = atomicAdd(&op_counters_data[pass].y, - copy_scan.reduction); - } - __syncthreads(); - - if(active) { - copy_status_data[partition] = !merge_op; - if(merge_op) - merge_list_data[shared.merge_offset + merge_scan.scan] = range; - if(copy_op) - copy_list_data[shared.copy_offset + copy_scan.scan] = partition; - } - }; - cta_launch<nt>(partition_k, num_partition_ctas, context); - - source_ranges = dest_ranges; - num_ranges = div_up(num_ranges, 2); - dest_ranges += num_ranges; - - ////////////////////////////////////////////////////////////////////////// - // Merge or copy unsorted tiles. 
- - auto merge_k = [=] MGPU_DEVICE(int tid, int cta) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; - - __shared__ union { - key_t keys[nv + 1]; - int indices[nv]; - } shared; - - merge_range_t range = merge_list_data[cta]; - - int tile = range.b_end; - int first = nv * tile; - int count2 = min((int)nv, count - first); - range.b_end = range.b_begin + (count2 - range.a_count()); - - int compressed_range = compressed_ranges_data[tile]; - range_t active = { - 0x0000ffff & compressed_range, - compressed_range>> 16 - }; - load_two_streams_shared<nt, vt>(keys_source + range.a_begin, - range.a_count(), keys_source + range.b_begin, range.b_count(), - tid, shared.keys); - - // Run a merge path search to find the starting point for each thread - // to merge. If the entire warp fits into the already-sorted segments, - // we can skip sorting it and leave its keys in shared memory. - int list_parity = 1 & (tile>> pass); - if(list_parity) active = range_t { 0, active.begin }; - else active = range_t { active.end, nv }; - - int warp_offset = vt * (~(warp_size - 1) & tid); - bool sort_warp = list_parity ? - (warp_offset < active.end) : - (warp_offset + vt * warp_size >= active.begin); - - merge_pair_t<key_t, vt> merge; - merge_range_t local_range = range.to_local(); - if(sort_warp) { - int diag = vt * tid; - int mp = segmented_merge_path(shared.keys, local_range, - active, diag, comp); - - merge_range_t partitioned = local_range.partition(mp, diag); - merge = segmented_serial_merge<vt>(shared.keys, - local_range.partition(mp, diag), active, comp, false); - } else { - iterate<vt>([&](int i) { - merge.indices[i] = vt * tid + i; - }); - } - __syncthreads(); - - // Store keys to global memory. - if(sort_warp) - reg_to_shared_thread<nt, vt>(merge.keys, tid, shared.keys, false); - __syncthreads(); - - shared_to_mem<nt, vt>(shared.keys, tid, count2, keys_dest + first); - - if(has_values) { - // Transpose the indices from thread order to strided order. - array_t<int, vt> indices = reg_thread_to_strided<nt>(merge.indices, - tid, shared.indices); - - // Gather the input values and merge into the output values. - transfer_two_streams_strided<nt>(vals_source + range.a_begin, - range.a_count(), vals_source + range.b_begin, range.b_count(), - indices, tid, vals_dest + first); - } - }; - cta_launch<launch_t>(merge_k, &op_counters_data[pass].x, context); - - auto copy_k = [=] MGPU_DEVICE(int tid, int cta) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; - - int tile = copy_list_data[cta]; - int first = nv * tile; - int count2 = min((int)nv, count - first); - - mem_to_mem<nt, vt>(keys_source + first, tid, count2, - keys_dest + first); - - if(has_values) - mem_to_mem<nt, vt>(vals_source + first, tid, count2, - vals_dest + first); - }; - cta_launch<launch_t>(copy_k, &op_counters_data[pass].y, context); - - std::swap(keys_source, keys_dest); - std::swap(vals_source, vals_dest); - } - } -}; - -} // namespace detail - -// Key-value mergesort. -template<typename launch_arg_t = empty_t, typename key_t, typename val_t, - typename seg_it, typename comp_t> -void segmented_sort(key_t* keys, val_t* vals, int count, seg_it segments, - int num_segments, comp_t comp, context_t& context) { - - detail::segsort_t<launch_arg_t, key_t, val_t, comp_t> - segsort(keys, vals, count, comp, context); - - segsort.blocksort_segments(keys, vals, segments, num_segments); - segsort.merge_passes(); -} - -// Key-value mergesort. 
Automatically generate indices to sort as values. -template<typename launch_arg_t = empty_t, typename key_t, typename seg_it, - typename comp_t> -void segmented_sort_indices(key_t* keys, int* indices, int count, - seg_it segments, int num_segments, comp_t comp, context_t& context) { - - detail::segsort_t<launch_arg_t, key_t, int, comp_t> - segsort(keys, indices, count, comp, context); - - segsort.template blocksort_segments<true>(keys, counting_iterator_t<int>(), - segments, num_segments); - segsort.merge_passes(); -} - -// Key-only segmented sort -template<typename launch_arg_t = empty_t, typename key_t, typename seg_it, - typename comp_t> -void segmented_sort(key_t* keys, int count, seg_it segments, - int num_segments, comp_t comp, context_t& context) { - - segmented_sort<launch_arg_t>(keys, (empty_t*)nullptr, count, - segments, num_segments, comp, context); -} - - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/kernel_sortedsearch.hxx b/src/util/cuda/moderngpu/kernel_sortedsearch.hxx deleted file mode 100644 index c614175f..00000000 --- a/src/util/cuda/moderngpu/kernel_sortedsearch.hxx +++ /dev/null @@ -1,64 +0,0 @@ -#pragma once -#include "cta_merge.hxx" -#include "search.hxx" - -BEGIN_MGPU_NAMESPACE - -template<bounds_t bounds, typename launch_arg_t = empty_t, - typename needles_it, typename haystack_it, typename indices_it, - typename comp_it> -void sorted_search(needles_it needles, int num_needles, haystack_it haystack, - int num_haystack, indices_it indices, comp_it comp, context_t& context) { - - typedef typename conditional_typedef_t<launch_arg_t, - launch_box_t< - arch_20_cta<128, 15>, - arch_35_cta<128, 11>, - arch_52_cta<128, 15> - > - >::type_t launch_t; - - typedef typename std::iterator_traits<needles_it>::value_type type_t; - - // Partition the needles and haystacks into tiles. - mem_t<int> partitions = merge_path_partitions<bounds>(needles, num_needles, - haystack, num_haystack, launch_t::nv(context), comp, context); - const int* mp_data = partitions.data(); - - auto k = [=]MGPU_DEVICE(int tid, int cta) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; - - __shared__ union { - type_t keys[nv + 1]; - int indices[nv]; - } shared; - - // Load the range for this CTA and merge the values into register. - int mp0 = mp_data[cta + 0]; - int mp1 = mp_data[cta + 1]; - merge_range_t range = compute_merge_range(num_needles, num_haystack, cta, - nv, mp0, mp1); - - // Merge the values needles and haystack. - merge_pair_t<type_t, vt> merge = cta_merge_from_mem<bounds, nt, vt>( - needles, haystack, range, tid, comp, shared.keys); - - // Store the needle indices to shared memory. 
- iterate<vt>([&](int i) { - if(merge.indices[i] < range.a_count()) { - int needle = merge.indices[i]; - int haystack = range.b_begin + vt * tid + i - needle; - shared.indices[needle] = haystack; - } - }); - __syncthreads(); - - shared_to_mem<nt, vt>(shared.indices, tid, range.a_count(), - indices + range.a_begin); - }; - - cta_transform<launch_t>(k, num_needles + num_haystack, context); -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/kernel_workcreate.hxx b/src/util/cuda/moderngpu/kernel_workcreate.hxx deleted file mode 100644 index 69494c90..00000000 --- a/src/util/cuda/moderngpu/kernel_workcreate.hxx +++ /dev/null @@ -1,272 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "search.hxx" -#include "cta_load_balance.hxx" -#include "kernel_scan.hxx" -#include "tuple.hxx" - -BEGIN_MGPU_NAMESPACE - -// experimental feature -namespace expt { - -template<typename launch_arg_t, typename segments_it> -struct workcreate_t { - typedef typename conditional_typedef_t<launch_arg_t, - launch_box_t< - arch_20_cta<128, 11, 8>, - arch_35_cta<128, 7, 5>, - arch_52_cta<128, 11, 8> - > - >::type_t launch_t; - - segments_it segments; - int num_segments; - int count; - context_t& context; - - cta_dim_t cta_dim; - int num_ctas; - - mem_t<int> mp; - mem_t<short> bits; - mem_t<int2> cta_offsets; - int2 cta_total; - - struct add_int2_t { - MGPU_HOST_DEVICE int2 operator()(int2 a, int2 b) const { - return make_int2(a.x + b.x, a.y + b.y); - } - }; - -public: - - struct count_t { - int count; - int num_segments; - }; - - workcreate_t(int count_, segments_it segments_, int num_segments_, - context_t& context_) : - count(count_), segments(segments_), num_segments(num_segments_), - context(context_) { - - // Compute the number of CTAs. - cta_dim = launch_t::cta_dim(context); - num_ctas = cta_dim.num_ctas(count + num_segments); - - mp = load_balance_partitions(count, segments, num_segments, cta_dim.nv(), - context); - - bits = mem_t<short>(num_ctas * cta_dim.nt, context); - - cta_offsets = mem_t<int2>(num_ctas, context); - } - - // f(int index, int seg, int rank, tuple<...> desc) returns the number - // of work-items to create. - template<typename func_t, typename pointers_t> - count_t upsweep(func_t f, pointers_t caching_iterators) { - - const int* mp_data = mp.data(); - short* bits_data = bits.data(); - int2* counts_data = cta_offsets.data(); - int count = this->count; - auto segments = this->segments; - int num_segments = this->num_segments; - - typedef tuple_iterator_value_t<pointers_t> value_t; - auto upsweep_k = [=]MGPU_DEVICE(int tid, int cta) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, vt0 = params_t::vt0 }; - typedef cta_reduce_t<nt, int2> reduce_t; - typedef cta_load_balance_t<nt, vt> load_balance_t; - typedef detail::cached_segment_load_t<nt, pointers_t> cached_load_t; - - static_assert(vt <= 16, "mgpu::workcreate_t vt must be <= 16."); - - __shared__ union { - typename reduce_t::storage_t reduce; - typename load_balance_t::storage_t lbs; - typename cached_load_t::storage_t cached; - } shared; - - // Compute the load-balancing search and materialize (index, seg, rank) - // arrays. - auto lbs = load_balance_t().load_balance(count, segments, num_segments, - tid, cta, mp_data, shared.lbs); - - // Call the user-supplied functor f. - short segment_bits = 0; - int work_items = 0; - - // Load from the cached iterators. 
Use the placement range, not the - // merge-path range for situating the segments. - array_t<value_t, vt> cached_values = cached_load_t::template load<vt0>( - tid, lbs.merge_range.a_count(), lbs.placement.range.b_range(), - lbs.segments, shared.cached, caching_iterators); - - strided_iterate<nt, vt, vt0>([&](int i, int j) { - int index = lbs.merge_range.a_begin + j; - int seg = lbs.segments[i]; - int rank = lbs.ranks[i]; - - int work_count = f(index, seg, rank, cached_values[i]); - - if(work_count > 0) segment_bits |= 1<< i; - work_items += work_count; - }, tid, lbs.merge_range.a_count()); - - // Store the worker bits for this thread. - bits_data[nt * cta + tid] = segment_bits; - - // Scan the segment and work-item counts. - int2 reduction = reduce_t().reduce(tid, - make_int2(popc(segment_bits), work_items), shared.reduce, - nt, add_int2_t(), false); - if(!tid) counts_data[cta] = reduction; - }; - cta_launch<launch_t>(upsweep_k, num_ctas, context); - - // Scan the partial reductions. - mem_t<int2> counts_host(1, context, memory_space_host); - scan_event(counts_data, num_ctas, counts_data, add_int2_t(), - counts_host.data(), context, context.event()); - cudaEventSynchronize(context.event()); - - cta_total = counts_host.data()[0]; - return count_t { cta_total.y, cta_total.x }; - } - - // upsweep without caching iterators. - template<typename func_t> - count_t upsweep(func_t f) { - return upsweep( - [=]MGPU_DEVICE(int index, int seg, int rank, tuple<>) { - return f(index, seg, rank); - }, - tuple<>() - ); - } - - // f(int dest_seg, int index, int source_seg, int rank, tuple<...> desc) - // returns the number of work-items to create. - template<typename func_t, typename pointers_t, typename... args_t> - mem_t<int> downsweep(func_t f, pointers_t caching_iterators, args_t... args) { - // Input - const int* mp_data = mp.data(); - const short* bits_data = bits.data(); - const int2* counts_data = cta_offsets.data(); - int count = this->count; - auto segments = this->segments; - int num_segments = this->num_segments; - - // Output. - int num_dest_segments = cta_total.x; - mem_t<int> segments_result(num_dest_segments, context); - int* segments_output = segments_result.data(); - - // typedef tuple_iterator_value_t<pointers_t> value_t; - // typedef tuple<int> value_t; - auto downsweep_k = [=]MGPU_DEVICE(int tid, int cta, args_t... args) { - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; - typedef cta_scan_t<nt, int> scan_t; - - // Note that this is a struct rather than the typical union. We want - // all three kinds of things to be valid during the callbacks into - // f. - __shared__ struct { - int indices[nv + 2]; - short targets[nv]; - typename scan_t::storage_t scan; - } shared; - - // Decode the bits signifying work creation and compact them. - int segment_bits = bits_data[nt * cta + tid]; - strided_iterate<nt, vt>([&](int i, int j) { - int work_create = 0 != ((1<< i) & segment_bits); - shared.indices[j] = work_create; - }, tid); - __syncthreads(); - - // Do a parallel scan of the work-create flags. Compact the indices - // of the work-creating items into shared.targets. - array_t<int, vt> flags = shared_to_reg_thread<nt, vt>( - shared.indices, tid); - scan_result_t<int> scan = scan_t().scan(tid, reduce(flags), shared.scan); - iterate<vt>([&](int i) { - if(flags[i]) shared.targets[scan.scan++] = (short)(vt * tid + i); - }); - - // Use load-balancing search to fill shared memory with the segment of - // each in-range work-item. 
- lbs_fill_t fill = cta_load_balance_fill<nt, vt>(count, segments, - num_segments, tid, cta, mp_data, shared.indices); - const int* a_shared = shared.indices; - const int* b_shared = shared.indices + fill.b_offset; - - int num_items = scan.reduction; - int segments_dest = counts_data[cta].x; - int work_item_dest = counts_data[cta].y; - - int num_rounds = div_up(num_items, nt); - for(int i = 0; i < num_rounds; ++i) { - int j = i * nt + tid; - int dest_seg = segments_dest + j; - int work_count = 0; - if(j < num_items) { - // Lookup the segment info. - int cta_index = shared.targets[j]; - int seg = a_shared[cta_index]; - int seg_begin = b_shared[seg]; - int index = fill.range.a_begin + cta_index; - int rank = index - seg_begin; - - // Invoke the callback and the get the work-item count. - tuple<int> cached = load(caching_iterators, seg); - work_count = f(dest_seg, index, seg, rank, cached, args...); - } - - // Scan the work-counts. - scan_result_t<int> work_scan = scan_t().scan(tid, work_count, - shared.scan); - - // Stream the segments-descriptor array. - if(j < num_items) - segments_output[dest_seg] = work_item_dest + work_scan.scan; - work_item_dest += work_scan.reduction; - } - }; - cta_launch<launch_t>(downsweep_k, num_ctas, context, args...); - - return segments_result; - } - - template<typename func_t, typename... args_t> - mem_t<int> downsweep(func_t f, args_t... args) { - return downsweep( - [=]MGPU_DEVICE(int dest_seg, int index, int seg, int rank, tuple<>, - args_t... args) { - return f(dest_seg, index, seg, rank, args...); - }, - tuple<>(), args... - ); - } -}; - -// Use lbs_workcreate to construct an expt::workcreate_t instance. Then call -// upsweep and downsweep, providing an appropriate lambda function. -template<typename launch_arg_t = empty_t, typename segments_it> -workcreate_t<launch_arg_t, segments_it> -lbs_workcreate(int count, segments_it segments, int num_segments, - context_t& context) { - return workcreate_t<launch_arg_t, segments_it> { - count, segments, num_segments, context - }; -} - -} // namespace expt - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/launch_box.hxx b/src/util/cuda/moderngpu/launch_box.hxx deleted file mode 100644 index ad449c64..00000000 --- a/src/util/cuda/moderngpu/launch_box.hxx +++ /dev/null @@ -1,93 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "context.hxx" - -BEGIN_MGPU_NAMESPACE - -// Specializable launch parameters. -struct launch_box_default_t { - typedef launch_cta_t<0, 0, 0> sm_00; - typedef empty_t sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, - sm_60, sm_61, sm_62, sm_70, sm_75; - - template<typename new_base_t> - using rebind = launch_box_default_t; -}; - -template<typename... 
params_v> -struct launch_box_t : inherit_t<params_v..., launch_box_default_t> { - typedef inherit_t<params_v..., launch_box_default_t> base_t; - - typedef typename conditional_typedef_t< - typename base_t::sm_20, typename base_t::sm_00 - >::type_t sm_20; - -#define INHERIT_LAUNCH_PARAMS(new_ver, old_ver) \ - typedef typename conditional_typedef_t< \ - typename base_t::sm_##new_ver, sm_##old_ver \ - >::type_t sm_##new_ver; - - INHERIT_LAUNCH_PARAMS(21, 20) - INHERIT_LAUNCH_PARAMS(30, 21) - INHERIT_LAUNCH_PARAMS(32, 30) - INHERIT_LAUNCH_PARAMS(35, 30) - INHERIT_LAUNCH_PARAMS(37, 35) - INHERIT_LAUNCH_PARAMS(50, 35) - INHERIT_LAUNCH_PARAMS(52, 50) - INHERIT_LAUNCH_PARAMS(53, 50) - INHERIT_LAUNCH_PARAMS(60, 53) - INHERIT_LAUNCH_PARAMS(61, 60) - INHERIT_LAUNCH_PARAMS(62, 60) - INHERIT_LAUNCH_PARAMS(70, 62) - INHERIT_LAUNCH_PARAMS(75, 70) - - // Overwrite the params defined for sm_00 so that the host-side compiler - // has all expected symbols available to it. - typedef sm_75 sm_00; - typedef MGPU_LAUNCH_PARAMS(launch_box_t) sm_ptx; - - static cta_dim_t cta_dim(int ptx_version) { - // Ptx version from cudaFuncGetAttributes. - if (ptx_version == 75) return cta_dim_t { sm_75::nt, sm_75::vt }; - else if(ptx_version >= 70) return cta_dim_t { sm_70::nt, sm_70::vt }; - else if(ptx_version == 62) return cta_dim_t { sm_62::nt, sm_62::vt }; - else if(ptx_version >= 61) return cta_dim_t { sm_61::nt, sm_61::vt }; - else if(ptx_version >= 60) return cta_dim_t { sm_60::nt, sm_60::vt }; - else if(ptx_version == 53) return cta_dim_t { sm_53::nt, sm_53::vt }; - else if(ptx_version >= 52) return cta_dim_t { sm_52::nt, sm_52::vt }; - else if(ptx_version >= 50) return cta_dim_t { sm_50::nt, sm_50::vt }; - else if(ptx_version == 37) return cta_dim_t { sm_37::nt, sm_37::vt }; - else if(ptx_version >= 35) return cta_dim_t { sm_35::nt, sm_35::vt }; - else if(ptx_version == 32) return cta_dim_t { sm_32::nt, sm_32::vt }; - else if(ptx_version >= 30) return cta_dim_t { sm_30::nt, sm_30::vt }; - else if(ptx_version >= 21) return cta_dim_t { sm_21::nt, sm_21::vt }; - else if(ptx_version >= 20) return cta_dim_t { sm_20::nt, sm_20::vt }; - else return cta_dim_t { -1, 0 }; - } - - static cta_dim_t cta_dim(const context_t& context) { - return cta_dim(context.ptx_version()); - } - - static int nv(const context_t& context) { - return cta_dim(context.ptx_version()).nv(); - } -}; - - -template<typename launch_box, typename func_t, typename... args_t> -int occupancy(func_t f, const context_t& context, args_t... 
args) { - int num_blocks; - int nt = launch_box::cta_dim(context).nt; - cudaError_t result = cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &num_blocks, - &launch_box_cta_k<launch_box, func_t, args_t...>, - nt, - (size_t)0 - ); - if(cudaSuccess != result) throw cuda_exception_t(result); - return context.props().multiProcessorCount * num_blocks; -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/launch_params.hxx b/src/util/cuda/moderngpu/launch_params.hxx deleted file mode 100644 index 9dc32b1d..00000000 --- a/src/util/cuda/moderngpu/launch_params.hxx +++ /dev/null @@ -1,152 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "meta.hxx" -#include "tuple.hxx" - -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ == 750 - #define MGPU_SM_TAG sm_75 -#elif __CUDA_ARCH__ >= 700 - #define MGPU_SM_TAG sm_70 -#elif __CUDA_ARCH__ == 620 - #define MGPU_SM_TAG sm_62 -#elif __CUDA_ARCH__ >= 610 - #define MGPU_SM_TAG sm_61 -#elif __CUDA_ARCH__ >= 600 - #define MGPU_SM_TAG sm_60 -#elif __CUDA_ARCH__ == 530 - #define MGPU_SM_TAG sm_53 -#elif __CUDA_ARCH__ >= 520 - #define MGPU_SM_TAG sm_52 -#elif __CUDA_ARCH__ >= 500 - #define MGPU_SM_TAG sm_50 -#elif __CUDA_ARCH__ == 370 - #define MGPU_SM_TAG sm_37 -#elif __CUDA_ARCH__ >= 350 - #define MGPU_SM_TAG sm_35 -#elif __CUDA_ARCH__ == 320 - #define MGPU_SM_TAG sm_32 -#elif __CUDA_ARCH__ >= 300 - #define MGPU_SM_TAG sm_30 -#elif __CUDA_ARCH__ >= 210 - #define MGPU_SM_TAG sm_21 -#elif __CUDA_ARCH__ >= 200 - #define MGPU_SM_TAG sm_20 -#else - #error "Modern GPU v3 does not support builds for sm_1.x" -#endif -#else // __CUDA_ARCH__ - #define MGPU_SM_TAG sm_00 -#endif - -#define MGPU_LAUNCH_PARAMS(launch_box) \ - typename launch_box::MGPU_SM_TAG -#define MGPU_LAUNCH_BOUNDS(launch_box) \ - __launch_bounds__(launch_box::sm_ptx::nt, launch_box::sm_ptx::occ) - -BEGIN_MGPU_NAMESPACE - -struct MGPU_ALIGN(8) cta_dim_t { - int nt, vt; - int nv() const { return nt * vt; } - int num_ctas(int count) const { - return div_up(count, nv()); - } -}; - -namespace detail { - -// Due to a bug in the compiler we need to expand make_restrict() before -// branching on cta < num_ctas. -template<typename func_t, typename... args_t> -MGPU_DEVICE void restrict_forward(func_t f, int tid, int cta, int num_ctas, - args_t... args) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 - if(cta < num_ctas) -#endif - f(tid, cta, args...); -} - -} - -// Generic thread cta kernel. -template<typename launch_box, typename func_t, typename... args_t> -__global__ MGPU_LAUNCH_BOUNDS(launch_box) -void launch_box_cta_k(func_t f, int num_ctas, args_t... args) { - // Masking threadIdx.x by (nt - 1) may help strength reduction because the - // compiler now knows the range of tid: (0, nt). - typedef typename launch_box::sm_ptx params_t; - int tid = (int)(threadIdx.x % (unsigned)params_t::nt); - int cta = blockIdx.x; - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 - cta += gridDim.x * blockIdx.y; -#endif - - detail::restrict_forward(f, tid, cta, num_ctas, make_restrict(args)...); -} - -// Dummy kernel for retrieving PTX version. 
-template<int dummy_arg> -__global__ void dummy_k() { } - -template<int nt_, int vt_ = 1, int vt0_ = vt_, int occ_= 0> -struct launch_cta_t { - enum { nt = nt_, vt = vt_, vt0 = vt0_, occ = occ_ }; -}; - -#define DEF_ARCH_STRUCT(ver) \ - template<typename params_t, typename base_t = empty_t> \ - struct arch_##ver : base_t { \ - typedef params_t sm_##ver; \ - \ - template<typename new_base_t> \ - using rebind = arch_##ver<params_t, new_base_t>; \ - }; \ - \ - template<int nt, int vt = 1, int vt0 = vt, int occ = 0> \ - using arch_##ver##_cta = arch_##ver<launch_cta_t<nt, vt, vt0, occ> >; - -DEF_ARCH_STRUCT(20) -DEF_ARCH_STRUCT(21) -DEF_ARCH_STRUCT(30) -DEF_ARCH_STRUCT(32) -DEF_ARCH_STRUCT(35) -DEF_ARCH_STRUCT(37) -DEF_ARCH_STRUCT(50) -DEF_ARCH_STRUCT(52) -DEF_ARCH_STRUCT(53) -DEF_ARCH_STRUCT(60) -DEF_ARCH_STRUCT(61) -DEF_ARCH_STRUCT(62) -DEF_ARCH_STRUCT(70) -DEF_ARCH_STRUCT(75) - -#undef DEF_ARCH_STRUCT - -struct context_t; - -// Non-specializable launch parameters. -template<int nt, int vt, int vt0 = vt, int occ = 0> -struct launch_params_t : launch_cta_t<nt, vt, vt0, occ> { - typedef launch_params_t sm_ptx; - - static cta_dim_t cta_dim() { - return cta_dim_t { nt, vt }; - } - - static cta_dim_t cta_dim(int) { - return cta_dim(); - } - - static cta_dim_t cta_dim(const context_t& context) { - return cta_dim(); - } - - static int nv(const context_t& context) { - return cta_dim().nv(); - } -}; - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/loadstore.hxx b/src/util/cuda/moderngpu/loadstore.hxx deleted file mode 100644 index 836c2a34..00000000 --- a/src/util/cuda/moderngpu/loadstore.hxx +++ /dev/null @@ -1,188 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "types.hxx" -#include "intrinsics.hxx" - -BEGIN_MGPU_NAMESPACE - -//////////////////////////////////////////////////////////////////////////////// -// reg<->shared - -template<int nt, int vt, typename type_t, int shared_size> -MGPU_DEVICE void reg_to_shared_thread(array_t<type_t, vt> x, int tid, - type_t (&shared)[shared_size], bool sync = true) { - - static_assert(shared_size >= nt * vt, - "reg_to_shared_thread must have at least nt * vt storage"); - - thread_iterate<vt>([&](int i, int j) { - shared[j] = x[i]; - }, tid); - if(sync) __syncthreads(); -} - -template<int nt, int vt, typename type_t, int shared_size> -MGPU_DEVICE array_t<type_t, vt> shared_to_reg_thread( - const type_t (&shared)[shared_size], int tid, bool sync = true) { - - static_assert(shared_size >= nt * vt, - "reg_to_shared_thread must have at least nt * vt storage"); - - array_t<type_t, vt> x; - thread_iterate<vt>([&](int i, int j) { - x[i] = shared[j]; - }, tid); - if(sync) __syncthreads(); - return x; -} - -template<int nt, int vt, typename type_t, int shared_size> -MGPU_DEVICE void reg_to_shared_strided(array_t<type_t, vt> x, int tid, - type_t (&shared)[shared_size], bool sync = true) { - - static_assert(shared_size >= nt * vt, - "reg_to_shared_strided must have at least nt * vt storage"); - - strided_iterate<nt, vt>([&](int i, int j) { shared[j] = x[i]; }, tid); - if(sync) __syncthreads(); -} - -template<int nt, int vt, typename type_t, int shared_size> -MGPU_DEVICE array_t<type_t, vt> shared_to_reg_strided( - const type_t (&shared)[shared_size], int tid, bool sync = true) { - - static_assert(shared_size >= nt * vt, - "shared_to_reg_strided must have at least nt * vt storage"); - - array_t<type_t, vt> x; - strided_iterate<nt, vt>([&](int i, int j) { x[i] = shared[j]; }, tid); - if(sync) __syncthreads(); 
- return x; -} - -template<int nt, int vt, typename type_t, int shared_size> -MGPU_DEVICE array_t<type_t, vt> shared_gather(const type_t(&data)[shared_size], - array_t<int, vt> indices, bool sync = true) { - - static_assert(shared_size >= nt * vt, - "shared_gather must have at least nt * vt storage"); - - array_t<type_t, vt> x; - iterate<vt>([&](int i) { x[i] = data[indices[i]]; }); - if(sync) __syncthreads(); - return x; -} - -template<int nt, int vt, typename type_t, int shared_size> -MGPU_DEVICE array_t<type_t, vt> thread_to_strided(array_t<type_t, vt> x, - int tid, type_t (&shared)[shared_size]) { - - reg_to_shared_thread<nt, vt>(x, tid, shared); - return shared_to_reg_strided<nt, vt>(shared, tid); -} - - - -//////////////////////////////////////////////////////////////////////////////// -// reg<->memory - -template<int nt, int vt, int vt0 = vt, typename type_t, typename it_t> -MGPU_DEVICE void reg_to_mem_strided(array_t<type_t, vt> x, int tid, - int count, it_t mem) { - - strided_iterate<nt, vt, vt0>([=](int i, int j) { - mem[j] = x[i]; - }, tid, count); -} - -template<int nt, int vt, int vt0 = vt, typename it_t> -MGPU_DEVICE array_t<typename std::iterator_traits<it_t>::value_type, vt> -mem_to_reg_strided(it_t mem, int tid, int count) { - typedef typename std::iterator_traits<it_t>::value_type type_t; - array_t<type_t, vt> x; - strided_iterate<nt, vt, vt0>([&](int i, int j) { - x[i] = mem[j]; - }, tid, count); - return x; -} - -template<int nt, int vt, int vt0 = vt, typename type_t, typename it_t, - int shared_size> -MGPU_DEVICE void reg_to_mem_thread(array_t<type_t, vt> x, int tid, - int count, it_t mem, type_t (&shared)[shared_size]) { - - reg_to_shared_thread<nt>(x, tid, shared); - array_t<type_t, vt> y = shared_to_reg_strided<nt, vt>(shared, tid); - reg_to_mem_strided<nt, vt, vt0>(y, tid, count, mem); -} - -template<int nt, int vt, int vt0 = vt, typename type_t, typename it_t, - int shared_size> -MGPU_DEVICE array_t<type_t, vt> mem_to_reg_thread(it_t mem, int tid, - int count, type_t (&shared)[shared_size]) { - - array_t<type_t, vt> x = mem_to_reg_strided<nt, vt, vt0>(mem, tid, count); - reg_to_shared_strided<nt, vt>(x, tid, shared); - array_t<type_t, vt> y = shared_to_reg_thread<nt, vt>(shared, tid); - return y; -} - -template<int nt, int vt, int vt0 = vt, typename input_it, typename output_it> -MGPU_DEVICE void mem_to_mem(input_it input, int tid, int count, - output_it output) { - typedef typename std::iterator_traits<input_it>::value_type type_t; - type_t x[vt]; - - strided_iterate<nt, vt, vt0>([&](int i, int j) { - x[i] = input[j]; - }, tid, count); - strided_iterate<nt, vt, vt0>([&](int i, int j) { - output[j] = x[i]; - }, tid, count); -} - -//////////////////////////////////////////////////////////////////////////////// -// memory<->memory - -template<int nt, int vt, int vt0 = vt, typename type_t, typename it_t> -MGPU_DEVICE void mem_to_shared(it_t mem, int tid, int count, type_t* shared, - bool sync = true) { - - array_t<type_t, vt> x = mem_to_reg_strided<nt, vt, vt0>(mem, tid, count); - strided_iterate<nt, vt, vt0>([&](int i, int j) { - shared[j] = x[i]; - }, tid, count); - if(sync) __syncthreads(); -} - -template<int nt, int vt, typename type_t, typename it_t> -MGPU_DEVICE void shared_to_mem(const type_t* shared, int tid, int count, - it_t mem, bool sync = true) { - - strided_iterate<nt, vt>([&](int i, int j) { - mem[j] = shared[j]; - }, tid, count); - if(sync) __syncthreads(); -} - -//////////////////////////////////////////////////////////////////////////////// -// 
reg<->reg - -template<int nt, int vt, typename type_t, int shared_size> -MGPU_DEVICE array_t<type_t, vt> reg_thread_to_strided(array_t<type_t, vt> x, - int tid, type_t (&shared)[shared_size]) { - - reg_to_shared_thread<nt>(x, tid, shared); - return shared_to_reg_strided<nt, vt>(shared, tid); -} - -template<int nt, int vt, typename type_t, int shared_size> -MGPU_DEVICE array_t<type_t, vt> reg_strided_to_thread(array_t<type_t, vt> x, - int tid, type_t (&shared)[shared_size]) { - - reg_to_shared_strided<nt>(x, tid, shared); - return shared_to_reg_thread<nt, vt>(shared, tid); -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/memory.hxx b/src/util/cuda/moderngpu/memory.hxx deleted file mode 100644 index ecde69e9..00000000 --- a/src/util/cuda/moderngpu/memory.hxx +++ /dev/null @@ -1,131 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "transform.hxx" -#include "context.hxx" - -BEGIN_MGPU_NAMESPACE - -//////////////////////////////////////////////////////////////////////////////// -// Memory functions on raw pointers. - -template<typename type_t> -cudaError_t htoh(type_t* dest, const type_t* source, size_t count) { - if(count) - memcpy(dest, source, sizeof(type_t) * count); - return cudaSuccess; -} - -template<typename type_t> -cudaError_t dtoh(type_t* dest, const type_t* source, size_t count) { - cudaError_t result = count ? - cudaMemcpy(dest, source, sizeof(type_t) * count, - cudaMemcpyDeviceToHost) : - cudaSuccess; - return result; -} - -template<typename type_t> -cudaError_t htod(type_t* dest, const type_t* source, size_t count) { - cudaError_t result = count ? - cudaMemcpy(dest, source, sizeof(type_t) * count, - cudaMemcpyHostToDevice) : - cudaSuccess; - return result; -} - -template<typename type_t> -cudaError_t dtod(type_t* dest, const type_t* source, size_t count) { - cudaError_t result = count ? - cudaMemcpy(dest, source, sizeof(type_t) * count, - cudaMemcpyDeviceToDevice) : - cudaSuccess; - return result; -} - -template<typename type_t> -cudaError_t dtoh(std::vector<type_t>& dest, const type_t* source, - size_t count) { - dest.resize(count); - return dtoh(dest.data(), source, count); -} - -template<typename type_t> -cudaError_t htod(type_t* dest, const std::vector<type_t>& source) { - return htod(dest, source.data(), source.size()); -} - -//////////////////////////////////////////////////////////////////////////////// -// Memory functions on mem_t. - -template<typename type_t> -mem_t<type_t> to_mem(const std::vector<type_t>& data, context_t& context) { - mem_t<type_t> mem(data.size(), context); - cudaError_t result = htod(mem.data(), data); - if(cudaSuccess != result) throw cuda_exception_t(result); - return mem; -} - -template<typename type_t> -std::vector<type_t> from_mem(const mem_t<type_t>& mem) { - std::vector<type_t> host; - cudaError_t result = dtoh(host, mem.data(), mem.size()); - if(cudaSuccess != result) throw cuda_exception_t(result); - return host; -} - -template<typename type_t, typename func_t> -mem_t<type_t> fill_function(func_t f, size_t count, context_t& context) { - mem_t<type_t> mem(count, context); - type_t* p = mem.data(); - transform([=]MGPU_DEVICE(int index) { - p[index] = f(index); - }, count, context); - return mem; -} - -template<typename type_t> -mem_t<type_t> fill(type_t value, size_t count, context_t& context) { - // We'd prefer to call fill_function and pass a lambda that returns value, - // but that can create tokens that are too long for VS2013. 
- mem_t<type_t> mem(count, context); - type_t* p = mem.data(); - transform([=]MGPU_DEVICE(int index) { - p[index] = value; - }, count, context); - return mem; -} - -template<typename it_t> -auto copy_to_mem(it_t input, size_t count, context_t& context) -> - mem_t<typename std::iterator_traits<it_t>::value_type> { - - typedef typename std::iterator_traits<it_t>::value_type type_t; - mem_t<type_t> mem(count, context); - type_t* p = mem.data(); - transform([=]MGPU_DEVICE(int index) { - p[index] = input[index]; - }, count, context); - return mem; -} - -inline std::mt19937& get_mt19937() { - static std::mt19937 mt19937; - return mt19937; -} - -mem_t<int> inline fill_random(int a, int b, size_t count, bool sorted, - context_t& context) { - - std::uniform_int_distribution<int> d(a, b); - std::vector<int> data(count); - - for(int& i : data) - i = d(get_mt19937()); - if(sorted) - std::sort(data.begin(), data.end()); - - return to_mem(data, context); -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/meta.hxx b/src/util/cuda/moderngpu/meta.hxx deleted file mode 100644 index 369c303e..00000000 --- a/src/util/cuda/moderngpu/meta.hxx +++ /dev/null @@ -1,249 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include <typeinfo> -#include <type_traits> -#include <iterator> -#include <cassert> -#include <cfloat> -#include <cstdint> - -#ifdef __CUDACC__ - -#ifndef MGPU_HOST_DEVICE - #define MGPU_HOST_DEVICE __forceinline__ __device__ __host__ -#endif - -#ifndef MGPU_DEVICE - #define MGPU_DEVICE __device__ -#endif - -// Currently NVCC does not support __device__ __host__ tags on lambdas that -// are captured on the host and executed on the device. There is no good reason -// for this, as you can __device__ __host__ tag functor operators and use -// them in the same way. So for now, tag your functors with MGPU_LAMBDA. This -// means they are only supported in device code, but when a future version of -// CUDA lists this restriction MGPU_LAMBDA will be redefined to __device__ -// __host__. -#ifndef MGPU_LAMBDA - #define MGPU_LAMBDA __device__ -#endif - -#else // #ifndef __CUDACC__ - -#define MGPU_HOST_DEVICE - -#endif // #ifdef __CUDACC__ - -#ifndef PRAGMA_UNROLL -#if defined(__CUDA_ARCH__) && !defined(__clang__) - #define PRAGMA_UNROLL #pragma PRAGMA_UNROLL -#else - #define PRAGMA_UNROLL -#endif -#endif - -#define BEGIN_MGPU_NAMESPACE namespace mgpu { -#define END_MGPU_NAMESPACE } - -BEGIN_MGPU_NAMESPACE - -template< bool B, class T = void > -using enable_if_t = typename std::enable_if<B,T>::type; - -enum { warp_size = 32 }; - -#if defined(_MSC_VER) && _MSC_VER <= 1800 // VS 2013 is terrible. - -#define is_pow2(x) (0 == ((x) & ((x) - 1))) -#define div_up(x, y) (((x) + (y) - 1) / (y)) - -namespace details { -template<int i, bool recurse = (i > 1)> -struct s_log2_t { - enum { value = s_log2_t<i / 2>::value + 1 }; -}; -template<int i> struct s_log2_t<i, false> { - enum { value = 0 }; -}; -} // namespace details - -#define s_log2(x) details::s_log2_t<x>::value - -#else - -MGPU_HOST_DEVICE constexpr bool is_pow2(int x) { - return 0 == (x & (x - 1)); -} -MGPU_HOST_DEVICE constexpr int div_up(int x, int y) { - return (x + y - 1) / y; -} -MGPU_HOST_DEVICE constexpr int64_t div_up(int64_t x, int64_t y) { - return (x + y - 1) / y; -} -MGPU_HOST_DEVICE constexpr size_t div_up(size_t x, size_t y) { - return (x + y - 1) / y; -} -MGPU_HOST_DEVICE constexpr int s_log2(int x, int p = 0) { - return x > 1 ? 
s_log2(x / 2) + 1 : p; -} -MGPU_HOST_DEVICE constexpr size_t s_log2(size_t x, size_t p = 0) { - return x > 1 ? s_log2(x / 2) + 1 : p; -} - -#endif - -#ifdef _MSC_VER - #define MGPU_ALIGN(x) __declspec(align(x)) -#else - #define MGPU_ALIGN(x) __attribute__((aligned(x))) -#endif - -// Apparently not defined by CUDA. -template<typename real_t> -MGPU_HOST_DEVICE constexpr real_t min(real_t a, real_t b) { - return (b < a) ? b : a; -} -template<typename real_t> -MGPU_HOST_DEVICE constexpr real_t max(real_t a, real_t b) { - return (a < b) ? b : a; -} - -struct empty_t { }; - -template<typename... args_t> -MGPU_HOST_DEVICE void swallow(args_t...) { } - -template<typename... base_v> -struct inherit_t; - -template<typename base_t, typename... base_v> -struct inherit_t<base_t, base_v...> : - base_t::template rebind<inherit_t<base_v...> > { }; - -template<typename base_t> -struct inherit_t<base_t> : base_t { }; - -//////////////////////////////////////////////////////////////////////////////// -// Conditional typedefs. - -// Typedef type_a if type_a is not empty_t. -// Otherwise typedef type_b. -template<typename type_a, typename type_b> -struct conditional_typedef_t { - typedef typename std::conditional< - !std::is_same<type_a, empty_t>::value, - type_a, - type_b - >::type type_t; -}; - -//////////////////////////////////////////////////////////////////////////////// -// Code to treat __restrict__ as a CV qualifier. - -template<typename arg_t> -struct is_restrict { - enum { value = false }; -}; -template<typename arg_t> -struct is_restrict<arg_t __restrict__> { - enum { value = true }; -}; - -// Add __restrict__ only to pointers. -template<typename arg_t> -struct add_restrict { - typedef arg_t type; -}; -template<typename arg_t> -struct add_restrict<arg_t*> { - typedef arg_t* __restrict__ type; -}; - -template<typename arg_t> -struct remove_restrict { - typedef arg_t type; -}; -template<typename arg_t> -struct remove_restrict<arg_t __restrict__> { - typedef arg_t type; -}; - -template<typename arg_t> -MGPU_HOST_DEVICE typename add_restrict<arg_t>::type make_restrict(arg_t x) { - typename add_restrict<arg_t>::type y = x; - return y; -} - -//////////////////////////////////////////////////////////////////////////////// -// Template unrolled looping construct. - -template<int i, int count, bool valid = (i < count)> -struct iterate_t { - #pragma nv_exec_check_disable - template<typename func_t> - MGPU_HOST_DEVICE static void eval(func_t f) { - f(i); - iterate_t<i + 1, count>::eval(f); - } -}; -template<int i, int count> -struct iterate_t<i, count, false> { - template<typename func_t> - MGPU_HOST_DEVICE static void eval(func_t f) { } -}; -template<int begin, int end, typename func_t> -MGPU_HOST_DEVICE void iterate(func_t f) { - iterate_t<begin, end>::eval(f); -} -template<int count, typename func_t> -MGPU_HOST_DEVICE void iterate(func_t f) { - iterate<0, count>(f); -} - -template<int count, typename type_t> -MGPU_HOST_DEVICE type_t reduce(const type_t(&x)[count]) { - type_t y; - iterate<count>([&](int i) { y = i ? x[i] + y : x[i]; }); - return y; -} - -template<int count, typename type_t> -MGPU_HOST_DEVICE void fill(type_t(&x)[count], type_t val) { - iterate<count>([&](int i) { x[i] = val; }); -} - -#ifdef __CUDACC__ - -// Invoke unconditionally. -template<int nt, int vt, typename func_t> -MGPU_DEVICE void strided_iterate(func_t f, int tid) { - iterate<vt>([=](int i) { f(i, nt * i + tid); }); -} - -// Check range. 
-template<int nt, int vt, int vt0 = vt, typename func_t> -MGPU_DEVICE void strided_iterate(func_t f, int tid, int count) { - // Unroll the first vt0 elements of each thread. - if(vt0 > 1 && count >= nt * vt0) { - strided_iterate<nt, vt0>(f, tid); // No checking - } else { - iterate<vt0>([=](int i) { - int j = nt * i + tid; - if(j < count) f(i, j); - }); - } - - iterate<vt0, vt>([=](int i) { - int j = nt * i + tid; - if(j < count) f(i, j); - }); -} -template<int vt, typename func_t> -MGPU_DEVICE void thread_iterate(func_t f, int tid) { - iterate<vt>([=](int i) { f(i, vt * tid + i); }); -} - -#endif // ifdef __CUDACC__ - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/operators.hxx b/src/util/cuda/moderngpu/operators.hxx deleted file mode 100644 index 2178ce75..00000000 --- a/src/util/cuda/moderngpu/operators.hxx +++ /dev/null @@ -1,347 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once -#include "meta.hxx" - -BEGIN_MGPU_NAMESPACE - -namespace detail { - -template<typename it_t, - typename type_t = typename std::iterator_traits<it_t>::value_type, - bool use_ldg = - std::is_pointer<it_t>::value && - std::is_arithmetic<type_t>::value -> -struct ldg_load_t { - MGPU_HOST_DEVICE static type_t load(it_t it) { - return *it; - } -}; - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 - -template<typename it_t, typename type_t> -struct ldg_load_t<it_t, type_t, true> { - MGPU_HOST_DEVICE static type_t load(it_t it) { - return __ldg(it); - } -}; - -#endif - -} // namespace detail - -template<typename it_t> -MGPU_HOST_DEVICE typename std::iterator_traits<it_t>::value_type -ldg(it_t it) { - return detail::ldg_load_t<it_t>::load(it); -} - -template<typename real_t> -MGPU_HOST_DEVICE real_t sq(real_t x) { return x * x; } - -template<typename type_t> -MGPU_HOST_DEVICE void swap(type_t& a, type_t& b) { - type_t c = a; a = b; b = c; -} - -//////////////////////////////////////////////////////////////////////////////// -// Device-side comparison operators. - -template<typename type_t> -struct less_t : public std::binary_function<type_t, type_t, bool> { - MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const { - return a < b; - } -}; -template<typename type_t> -struct less_equal_t : public std::binary_function<type_t, type_t, bool> { - MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const { - return a <= b; - } -}; -template<typename type_t> -struct greater_t : public std::binary_function<type_t, type_t, bool> { - MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const { - return a > b; - } -}; -template<typename type_t> -struct greater_equal_t : public std::binary_function<type_t, type_t, bool> { - MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const { - return a >= b; - } -}; -template<typename type_t> -struct equal_to_t : public std::binary_function<type_t, type_t, bool> { - MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const { - return a == b; - } -}; -template<typename type_t> -struct not_equal_to_t : public std::binary_function<type_t, type_t, bool> { - MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const { - return a != b; - } -}; - -//////////////////////////////////////////////////////////////////////////////// -// Device-side arithmetic operators. 
- -template<typename type_t> -struct plus_t : public std::binary_function<type_t, type_t, type_t> { - MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const { - return a + b; - } -}; - -template<typename type_t> -struct minus_t : public std::binary_function<type_t, type_t, type_t> { - MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const { - return a - b; - } -}; - -template<typename type_t> -struct multiplies_t : public std::binary_function<type_t, type_t, type_t> { - MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const { - return a * b; - } -}; - -template<typename type_t> -struct maximum_t : public std::binary_function<type_t, type_t, type_t> { - MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const { - return max(a, b); - } -}; - -template<typename type_t> -struct minimum_t : public std::binary_function<type_t, type_t, type_t> { - MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const { - return min(a, b); - } -}; - -//////////////////////////////////////////////////////////////////////////////// -// iterator_t and const_iterator_t are base classes for customized iterators. - -template<typename outer_t, typename int_t, typename value_type> -struct iterator_t : public std::iterator_traits<const value_type*> { - - iterator_t() = default; - MGPU_HOST_DEVICE iterator_t(int_t i) : index(i) { } - - MGPU_HOST_DEVICE outer_t operator+(int_t diff) const { - outer_t next = *static_cast<const outer_t*>(this); - next += diff; - return next; - } - MGPU_HOST_DEVICE outer_t operator-(int_t diff) const { - outer_t next = *static_cast<const outer_t*>(this); - next -= diff; - return next; - } - MGPU_HOST_DEVICE outer_t& operator+=(int_t diff) { - index += diff; - return *static_cast<outer_t*>(this); - } - MGPU_HOST_DEVICE outer_t& operator-=(int_t diff) { - index -= diff; - return *static_cast<outer_t*>(this); - } - - int_t index; -}; - -template<typename outer_t, typename int_t, typename value_type> -struct const_iterator_t : public iterator_t<outer_t, int_t, value_type> { - typedef iterator_t<outer_t, int_t, value_type> base_t; - - const_iterator_t() = default; - MGPU_HOST_DEVICE const_iterator_t(int_t i) : base_t(i) { } - - // operator[] and operator* are tagged as DEVICE-ONLY. This is to ensure - // compatibility with lambda capture in CUDA 7.5, which does not support - // marking a lambda as __host__ __device__. - // We hope to relax this when a future CUDA fixes this problem. - MGPU_HOST_DEVICE value_type operator[](int_t diff) const { - return static_cast<const outer_t&>(*this)(base_t::index + diff); - } - MGPU_HOST_DEVICE value_type operator*() const { - return (*this)[0]; - } -}; - -//////////////////////////////////////////////////////////////////////////////// -// discard_iterator_t is a store iterator that discards its input. - -template<typename value_type> -struct discard_iterator_t : - iterator_t<discard_iterator_t<value_type>, int, value_type> { - - struct assign_t { - MGPU_HOST_DEVICE value_type operator=(value_type v) { - return value_type(); - } - }; - - MGPU_HOST_DEVICE assign_t operator[](int index) const { - return assign_t(); - } - MGPU_HOST_DEVICE assign_t operator*() const { return assign_t(); } -}; - -//////////////////////////////////////////////////////////////////////////////// -// counting_iterator_t returns index. 
- -template<typename type_t, typename int_t = int> -struct counting_iterator_t : - const_iterator_t<counting_iterator_t<type_t>, int_t, type_t> { - - counting_iterator_t() = default; - MGPU_HOST_DEVICE counting_iterator_t(type_t i) : - const_iterator_t<counting_iterator_t, int_t, type_t>(i) { } - - MGPU_HOST_DEVICE type_t operator()(int_t index) const { - return (type_t)index; - } -}; - -//////////////////////////////////////////////////////////////////////////////// -// strided_iterator_t returns offset + index * stride. - -template<typename type_t, typename int_t = int> -struct strided_iterator_t : - const_iterator_t<strided_iterator_t<type_t>, int_t, int> { - - strided_iterator_t() = default; - MGPU_HOST_DEVICE strided_iterator_t(type_t offset_, type_t stride_) : - const_iterator_t<strided_iterator_t, int_t, type_t>(0), - offset(offset_), stride(stride_) { } - - MGPU_HOST_DEVICE type_t operator()(int_t index) const { - return offset + index * stride; - } - - type_t offset, stride; -}; - -//////////////////////////////////////////////////////////////////////////////// -// constant_iterator_t returns the value it was initialized with. - -template<typename type_t> -struct constant_iterator_t : - const_iterator_t<constant_iterator_t<type_t>, int, type_t> { - - type_t value; - - MGPU_HOST_DEVICE constant_iterator_t(type_t value_) : value(value_) { } - - MGPU_HOST_DEVICE type_t operator()(int index) const { - return value; - } -}; - -// These types only supported with nvcc until CUDA 8.0 allows host-device -// lambdas and MGPU_LAMBDA is redefined to MGPU_HOST_DEVICE - -#ifdef __CUDACC__ - -//////////////////////////////////////////////////////////////////////////////// -// lambda_iterator_t - -template<typename load_t, typename store_t, typename value_type, typename int_t> -struct lambda_iterator_t : std::iterator_traits<const value_type*> { - - load_t load; - store_t store; - int_t base; - - lambda_iterator_t(load_t load_, store_t store_, int_t base_) : - load(load_), store(store_), base(base_) { } - - struct assign_t { - load_t load; - store_t store; - int_t index; - - MGPU_LAMBDA assign_t& operator=(value_type rhs) { - static_assert(!std::is_same<store_t, empty_t>::value, - "load_iterator is being stored to."); - store(rhs, index); - return *this; - } - MGPU_LAMBDA operator value_type() const { - static_assert(!std::is_same<load_t, empty_t>::value, - "store_iterator is being loaded from."); - return load(index); - } - }; - - MGPU_LAMBDA assign_t operator[](int_t index) const { - return assign_t { load, store, base + index }; - } - MGPU_LAMBDA assign_t operator*() const { - return assign_t { load, store, base }; - } - - MGPU_HOST_DEVICE lambda_iterator_t operator+(int_t offset) const { - lambda_iterator_t cp = *this; - cp += offset; - return cp; - } - - MGPU_HOST_DEVICE lambda_iterator_t& operator+=(int_t offset) { - base += offset; - return *this; - } - - MGPU_HOST_DEVICE lambda_iterator_t operator-(int_t offset) const { - lambda_iterator_t cp = *this; - cp -= offset; - return cp; - } - - MGPU_HOST_DEVICE lambda_iterator_t& operator-=(int_t offset) { - base -= offset; - return *this; - } -}; - -template<typename value_type> -struct trivial_load_functor { - template<typename int_t> - MGPU_HOST_DEVICE value_type operator()(int_t index) const { - return value_type(); - } -}; - -template<typename value_type> -struct trivial_store_functor { - template<typename int_t> - MGPU_HOST_DEVICE void operator()(value_type v, int_t index) const { } -}; - -template<typename value_type, typename int_t = 
int, typename load_t, - typename store_t> -lambda_iterator_t<load_t, store_t, value_type, int_t> - make_load_store_iterator(load_t load, store_t store, int_t base = 0) { - return lambda_iterator_t<load_t, store_t, value_type, int_t>(load, store, base); -} - -template<typename value_type, typename int_t = int, typename load_t> -lambda_iterator_t<load_t, empty_t, value_type, int_t> -make_load_iterator(load_t load, int_t base = 0) { - return make_load_store_iterator<value_type>(load, empty_t(), base); -} - -template<typename value_type, typename int_t = int, typename store_t> -lambda_iterator_t<empty_t, store_t, value_type, int_t> -make_store_iterator(store_t store, int_t base = 0) { - return make_load_store_iterator<value_type>(empty_t(), store, base); -} - -#endif // #ifdef __CUDACC__ - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/search.hxx b/src/util/cuda/moderngpu/search.hxx deleted file mode 100644 index 2d8b6b40..00000000 --- a/src/util/cuda/moderngpu/search.hxx +++ /dev/null @@ -1,53 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "loadstore.hxx" -#include "operators.hxx" -#include "cta_search.hxx" -#include "memory.hxx" -#include "context.hxx" - -BEGIN_MGPU_NAMESPACE - -template<bounds_t bounds, typename a_keys_it, typename b_keys_it, - typename comp_t> -mem_t<int> merge_path_partitions(a_keys_it a, int64_t a_count, b_keys_it b, - int64_t b_count, int64_t spacing, comp_t comp, context_t& context) { - - typedef int int_t; - int num_partitions = (int)div_up(a_count + b_count, spacing) + 1; - mem_t<int_t> mem(num_partitions, context); - int_t* p = mem.data(); - transform([=]MGPU_DEVICE(int index) { - int_t diag = (int_t)min(spacing * index, a_count + b_count); - p[index] = merge_path<bounds>(a, (int_t)a_count, b, (int_t)b_count, - diag, comp); - }, num_partitions, context); - return mem; -} - -template<typename segments_it> -auto load_balance_partitions(int64_t dest_count, segments_it segments, - int num_segments, int spacing, context_t& context) -> - mem_t<typename std::iterator_traits<segments_it>::value_type> { - - typedef typename std::iterator_traits<segments_it>::value_type int_t; - return merge_path_partitions<bounds_upper>(counting_iterator_t<int_t>(0), - dest_count, segments, num_segments, spacing, less_t<int_t>(), context); -} - -template<bounds_t bounds, typename keys_it> -mem_t<int> binary_search_partitions(keys_it keys, int count, int num_items, - int spacing, context_t& context) { - - int num_partitions = div_up(count, spacing) + 1; - mem_t<int> mem(num_partitions, context); - int* p = mem.data(); - transform([=]MGPU_DEVICE(int index) { - int key = min(spacing * index, count); - p[index] = binary_search<bounds>(keys, num_items, key, less_t<int>()); - }, num_partitions, context); - return mem; -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/sort_networks.hxx b/src/util/cuda/moderngpu/sort_networks.hxx deleted file mode 100644 index 38686edf..00000000 --- a/src/util/cuda/moderngpu/sort_networks.hxx +++ /dev/null @@ -1,57 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once -#include "operators.hxx" - -BEGIN_MGPU_NAMESPACE - -//////////////////////////////////////////////////////////////////////////////// -// Odd-even transposition sorting network. Sorts keys and values in-place in -// register. 
-// http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort - -template<typename type_t, int vt, typename comp_t> -MGPU_HOST_DEVICE array_t<type_t, vt> -odd_even_sort(array_t<type_t, vt> x, comp_t comp, int flags = 0) { - iterate<vt>([&](int I) { - PRAGMA_UNROLL - for(int i = 1 & I; i < vt - 1; i += 2) { - if((0 == ((2<< i) & flags)) && comp(x[i + 1], x[i])) - swap(x[i], x[i + 1]); - } - }); - return x; -} - -template<typename key_t, typename val_t, int vt, typename comp_t> -MGPU_HOST_DEVICE kv_array_t<key_t, val_t, vt> -odd_even_sort(kv_array_t<key_t, val_t, vt> x, comp_t comp, int flags = 0) { - iterate<vt>([&](int I) { - PRAGMA_UNROLL - for(int i = 1 & I; i < vt - 1; i += 2) { - if((0 == ((2<< i) & flags)) && comp(x.keys[i + 1], x.keys[i])) { - swap(x.keys[i], x.keys[i + 1]); - swap(x.vals[i], x.vals[i + 1]); - } - } - }); - return x; -} - -//////////////////////////////////////////////////////////////////////////////// -// TODO: Batcher Odd-Even Mergesort network -// Unstable but executes much faster than the transposition sort. -// http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort -#if 0 -template<int width, int low, int count> -struct odd_even_mergesort_t { - -}; - -template<typename key_t, typename val_t, int vt, typename comp_t> -MGPU_HOST_DEVICE kv_array_t<key_t, val_t, vt> -odd_even_mergesort(kv_array_t<key_t, val_t, vt> x, int flags = 0) { - return kv_array_t<key_t, val_t, vt>(); -} -#endif - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/transform.hxx b/src/util/cuda/moderngpu/transform.hxx deleted file mode 100644 index 99295a81..00000000 --- a/src/util/cuda/moderngpu/transform.hxx +++ /dev/null @@ -1,107 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - - -#include <random> -#include <algorithm> -#include <cuda.h> -#include "launch_box.hxx" - -BEGIN_MGPU_NAMESPACE - -//////////////////////////////////////////////////////////////////////////////// -// Launch a grid given a number of CTAs. - -template<typename launch_box, typename func_t, typename... args_t> -void cta_launch(func_t f, int num_ctas, context_t& context, args_t... args) { - cta_dim_t cta = launch_box::cta_dim(context.ptx_version()); - dim3 grid_dim(num_ctas); - if(context.ptx_version() < 30 && num_ctas > 65535) - grid_dim = dim3(256, div_up(num_ctas, 256)); - - if(num_ctas) - { - launch_box_cta_k<launch_box, func_t> - <<<grid_dim, cta.nt,0,context.stream()>>>(f, num_ctas, args...); - } -} - -template<int nt, int vt = 1, typename func_t, typename... args_t> -void cta_launch(func_t f, int num_ctas, context_t& context, args_t... args) { - cta_launch<launch_params_t<nt, vt> >(f, num_ctas, context, args...); -} - -//////////////////////////////////////////////////////////////////////////////// -// Launch a grid given a number of work-items. - -template<typename launch_box, typename func_t, typename... args_t> -void cta_transform(func_t f, int count, context_t& context, args_t... args) { - cta_dim_t cta = launch_box::cta_dim(context.ptx_version()); - int num_ctas = div_up(count, cta.nv()); - cta_launch<launch_box>(f, num_ctas, context, args...); -} - -template<int nt, int vt = 1, typename func_t, typename... args_t> -void cta_transform(func_t f, int count, context_t& context, args_t... args) { - cta_transform<launch_params_t<nt, vt> >(f, count, context, args...); -} - -//////////////////////////////////////////////////////////////////////////////// -// Launch persistent CTAs and loop through num_ctas values. 
- -template<typename launch_box, typename func_t, typename... args_t> -void cta_launch(func_t f, const int* num_tiles, context_t& context, - args_t... args) { - - // Over-subscribe the device by a factor of 8. - // This reduces the penalty if we can't schedule all the CTAs to run - // concurrently. - int num_ctas = 8 * occupancy<launch_box>(f, context); - - auto k = [=] MGPU_DEVICE(int tid, int cta, args_t... args) { - int count = *num_tiles; - while(cta < count) { - f(tid, cta, args...); - cta += num_ctas; - } - }; - cta_launch<launch_box>(k, num_ctas, context, args...); -} - -//////////////////////////////////////////////////////////////////////////////// -// Ordinary transform launch. This uses the standard launch box mechanism -// so we can query its occupancy and other things. - -namespace detail { - -template<typename launch_t> -struct transform_f { - template<typename func_t, typename... args_t> - MGPU_DEVICE void operator()(int tid, int cta, func_t f, - size_t count, args_t... args) { - - typedef typename launch_t::sm_ptx params_t; - enum { nt = params_t::nt, vt = params_t::vt, vt0 = params_t::vt0 }; - - range_t range = get_tile(cta, nt * vt, count); - - strided_iterate<nt, vt, vt0>([=](int i, int j) { - f(range.begin + j, args...); - }, tid, range.count()); - } -}; - -} - -template<typename launch_t, typename func_t, typename... args_t> -void transform(func_t f, size_t count, context_t& context, args_t... args) { - cta_transform<launch_t>(detail::transform_f<launch_t>(), count, - context, f, count, args...); -} - -template<size_t nt = 128, int vt = 1, typename func_t, typename... args_t> -void transform(func_t f, size_t count, context_t& context, args_t... args) { - transform<launch_params_t<nt, vt> >(f, count, context, args...); -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/tuple.hxx b/src/util/cuda/moderngpu/tuple.hxx deleted file mode 100644 index 2e381f52..00000000 --- a/src/util/cuda/moderngpu/tuple.hxx +++ /dev/null @@ -1,393 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "meta.hxx" - -BEGIN_MGPU_NAMESPACE - -template<typename type_t> -using decay_t = typename std::decay<type_t>::type; - -///////////////// -// index_sequence - -// Improved linear index_sequence from -// http://talesofcpp.fusionfenix.com/post-22/true-story-efficient-packing -template<size_t... int_s> -struct index_sequence { - enum { size = sizeof...(int_s) }; -}; - -namespace detail { -template<typename seq_t> -struct _next; - -template<size_t... seq_i> -struct _next<index_sequence<seq_i...> > { - // grow the sequence by one element. - typedef index_sequence<seq_i..., sizeof...(seq_i)> type; -}; - -template<size_t count> -struct _make_index_sequence : - _next<typename _make_index_sequence<count - 1>::type> { }; - -template<> struct _make_index_sequence<0> { - typedef index_sequence<> type; -}; -} // namespace detail - -template<size_t count> -using make_index_sequence = - typename detail::_make_index_sequence<count>::type; - -////////// -// var_and - -template<bool... args_b> -struct var_and; - -template<bool arg_a, bool... args_b> -struct var_and<arg_a, args_b...> { - enum { value = arg_a && var_and<args_b...>::value }; -}; -template<bool arg_a> -struct var_and<arg_a> { - enum { value = arg_a }; -}; -template<> -struct var_and<> { - enum { value = true }; -}; - -////////// -// var_or - -template<bool... args_b> -struct var_or; - -template<bool arg_a, bool... 
args_b> -struct var_or<arg_a, args_b...> { - enum { value = arg_a || var_or<args_b...>::value }; -}; -template<bool arg_a> -struct var_or<arg_a> { - enum { value = arg_a }; -}; -template<> -struct var_or<> { - enum { value = false }; -}; - - - -// Forward declare the tuple. -template<typename... args_t> -struct tuple; - -//////////////// -// tuple_element - -template<size_t i, typename tpl_t> -struct tuple_element; - -template<size_t i, typename arg_t, typename... args_t> -struct tuple_element<i, tuple<arg_t, args_t...> > : - tuple_element<i - 1, tuple<args_t...> > { }; - -template<typename arg_t, typename... args_t> -struct tuple_element<0, tuple<arg_t, args_t...> > { - typedef arg_t type; -}; - -template<size_t i, typename tpl_t> -using tuple_element_t = typename tuple_element<i, tpl_t>::type; - -///////////// -// tuple_size - -template<typename tpl_t> -struct tuple_size; - -template<typename... args_t> -struct tuple_size<tuple<args_t...> > { - enum { value = sizeof...(args_t) }; -}; - - -namespace detail { - -template<size_t i, typename arg_t, bool is_empty = std::is_empty<arg_t>::value> -struct tuple_leaf { - arg_t x; - - MGPU_HOST_DEVICE arg_t& get() { return x; } - MGPU_HOST_DEVICE const arg_t& get() const { return x; } - - tuple_leaf() = default; - tuple_leaf(const tuple_leaf&) = default; - - template<typename arg2_t, - typename = typename std::enable_if< - std::is_constructible<arg_t, arg2_t&&>::value - >::type - > MGPU_HOST_DEVICE - tuple_leaf(arg2_t&& arg) : x(std::forward<arg2_t>(arg)) { } - - template<typename arg2_t, - typename = typename std::enable_if< - std::is_constructible<arg_t, const arg2_t&>::value - >::type - > MGPU_HOST_DEVICE - tuple_leaf(const arg2_t& arg) : x(arg) { } -}; - -template<size_t i, typename arg_t> -struct tuple_leaf<i, arg_t, true> : arg_t { - arg_t& get() { return *this; } - const arg_t& get() const { return *this; } - - template<typename arg2_t, - typename = typename std::enable_if< - std::is_constructible<arg_t, const arg2_t&>::value - >::type - > MGPU_HOST_DEVICE - tuple_leaf(const arg2_t& arg) : arg_t(arg) { } -}; - -template<size_t i, typename... args_t> -struct tuple_impl; - -template<size_t i> -struct tuple_impl<i> { }; - -template<size_t i, typename arg_t, typename... args_t> -struct tuple_impl<i, arg_t, args_t...> : - tuple_leaf<i, arg_t>, - tuple_impl<i + 1, args_t...> { - - typedef tuple_leaf<i, arg_t> head_t; - typedef tuple_impl<i + 1, args_t...> tail_t; - - MGPU_HOST_DEVICE arg_t& head() { return head_t::get(); } - MGPU_HOST_DEVICE const arg_t& head() const { return head_t::get(); } - - MGPU_HOST_DEVICE tail_t& tail() { return *this; } - MGPU_HOST_DEVICE const tail_t& tail() const { return *this; } - - // Constructors. - tuple_impl() = default; - explicit tuple_impl(const tuple_impl&) = default; - - template<typename... args2_t> MGPU_HOST_DEVICE - explicit tuple_impl(const tuple_impl<i, args2_t...>& rhs) : - head_t(rhs.head()), tail_t(rhs.tail()) { } - - template<typename... args2_t> MGPU_HOST_DEVICE - explicit tuple_impl(tuple_impl<i, args2_t...>&& rhs) : - head_t(std::move(rhs.head())), - tail_t(std::move(rhs.tail())) { } - - template<typename arg2_t, typename... args2_t, - typename = typename std::enable_if< - sizeof...(args_t) == sizeof...(args2_t) && - std::is_constructible<arg_t, arg2_t&&>::value && - var_and<std::is_constructible<args_t, args2_t&&>::value...>::value - >::type - > MGPU_HOST_DEVICE - tuple_impl(arg2_t&& arg, args2_t&&... args) : - head_t(std::forward<arg2_t>(arg)), - tail_t(std::forward<args2_t>(args)...) 
{ } - - template<typename arg2_t, typename... args2_t, - typename = typename std::enable_if< - std::is_constructible<arg_t, const arg2_t&>::value && - var_and<std::is_constructible<args_t, const args2_t&>::value...>::value - >::type - > MGPU_HOST_DEVICE - tuple_impl(const arg2_t& arg, const args2_t&... args) : - head_t(arg), tail_t(args...) { } - - // Assignment -}; - -template<size_t i, typename arg_t> MGPU_HOST_DEVICE -tuple_leaf<i, arg_t>& get_leaf(tuple_leaf<i, arg_t>& leaf) { - return leaf; -} - -template<size_t i, typename arg_t> MGPU_HOST_DEVICE -const tuple_leaf<i, arg_t>& get_leaf(const tuple_leaf<i, arg_t>& leaf) { - return leaf; -} - -} // namespace detail - -template<typename... args_t> -struct tuple : detail::tuple_impl<0, args_t...> { - typedef detail::tuple_impl<0, args_t...> impl_t; - - tuple() = default; - tuple(const tuple&) = default; - - template<typename... args2_t, - typename = typename std::enable_if< - sizeof...(args2_t) == sizeof...(args_t) && - var_and<std::is_constructible<args_t, const args2_t&>::value...>::value - >::type - > MGPU_HOST_DEVICE - tuple(const tuple<args2_t...>& rhs) : impl_t(rhs) { } - - template<typename... args2_t, - typename = typename std::enable_if< - sizeof...(args2_t) == sizeof...(args_t) && - var_and<std::is_constructible<args_t, args2_t&&>::value...>::value - >::type - > MGPU_HOST_DEVICE - tuple(args2_t&&... args) : impl_t(std::forward<args2_t>(args)...) { } - - template<typename... args2_t, - typename = typename std::enable_if< - sizeof...(args2_t) == sizeof...(args_t) && - var_and<std::is_constructible<args_t, const args2_t&>::value...>::value - >::type - > MGPU_HOST_DEVICE - tuple(const args2_t&... args) : impl_t(args...) { } -} __attribute__((aligned)); - -namespace detail { - -template<size_t i, typename arg_t> MGPU_HOST_DEVICE -arg_t& _get(tuple_leaf<i, arg_t>& leaf) { - return leaf.get(); -} - -template<size_t i, typename arg_t> MGPU_HOST_DEVICE -const arg_t& _get(const tuple_leaf<i, arg_t>& leaf) { - return leaf.get(); -} - -} - -template<size_t i, typename... args_t> MGPU_HOST_DEVICE -tuple_element_t<i, tuple<args_t...> >& -get(tuple<args_t...>& tpl) { - return detail::_get<i>(tpl); -} - -template<size_t i, typename... args_t> MGPU_HOST_DEVICE -const tuple_element_t<i, tuple<args_t...> >& -get(const tuple<args_t...>& tpl) { - return detail::_get<i>(tpl); -} - -template<size_t i, typename... args_t> MGPU_HOST_DEVICE -typename std::add_rvalue_reference< - tuple_element_t<i, tuple<args_t...> > ->::type -get(tuple<args_t...>&& tpl) { - return std::forward<tuple_element_t<i, tuple<args_t...> >&&>(get<i>(tpl)); -} - -template<typename... args_t> MGPU_HOST_DEVICE -tuple<decay_t<args_t>...> make_tuple(args_t&&... args) { - return tuple<decay_t<args_t>...>(std::forward<args_t>(args)...); -} - -template<typename... args_t> MGPU_HOST_DEVICE -tuple<args_t&&...> forward_as_tuple(args_t&&... args) { - return tuple<args_t&&...>(std::forward<args_t>(args)...); -} - -//////////// -// tuple_cat - -namespace detail { - -template<typename tuple_t> -struct _make_tuple { - typedef typename std::remove_cv< - typename std::remove_reference<tuple_t>::type - >::type type; -}; - -template<typename... tuples_t> -struct _combine_type; - -template<typename... args_t> -struct _combine_type<tuple<args_t...> > { - typedef tuple<args_t...> type; -}; - -template<typename... args1_t, typename... args2_t, typename... 
tuples_t> -struct _combine_type<tuple<args1_t...>, tuple<args2_t...>, tuples_t...> { - typedef typename _combine_type< - tuple<args1_t..., args2_t...>, - tuples_t... - >::type type; -}; - -template<typename... tpls_t> -struct _tuple_cat_ret { - typedef typename _combine_type< - typename _make_tuple<tpls_t>::type... - >::type type; -}; - -template<typename tpl1_t, typename seq1_t, typename tpl2_t, typename seq2_t> -struct _tuple_cat; - -template<typename tpl1_t, size_t... seq1_i, typename tpl2_t, size_t... seq2_i> -struct _tuple_cat<tpl1_t, index_sequence<seq1_i...>, - tpl2_t, index_sequence<seq2_i...> > { - - typedef typename _tuple_cat_ret<tpl1_t, tpl2_t>::type ret_t; - - MGPU_HOST_DEVICE static ret_t cat(tpl1_t&& tpl1, tpl2_t&& tpl2) { - return make_tuple( - get<seq1_i>(std::forward<tpl1_t>(tpl1))..., - get<seq2_i>(std::forward<tpl2_t>(tpl2))... - ); - } -}; - -} // namespace detail - -template<typename tpl1_t> MGPU_HOST_DEVICE -typename detail::_tuple_cat_ret<tpl1_t>::type -tuple_cat(tpl1_t&& tpl1) { - return std::forward<tpl1_t>(tpl1); -} - -template<typename tpl1_t, typename tpl2_t, typename... tpls_t> MGPU_HOST_DEVICE -typename detail::_tuple_cat_ret<tpl1_t, tpl2_t, tpls_t...>::type -tuple_cat(tpl1_t&& tpl1, tpl2_t&& tpl2, tpls_t&&... tpls) { - typedef typename detail::_make_tuple<tpl1_t>::type tpl1_stripped; - typedef typename detail::_make_tuple<tpl2_t>::type tpl2_stripped; - - enum { - size1 = tuple_size<tpl1_stripped>::value, - size2 = tuple_size<tpl2_stripped>::value - }; - - return tuple_cat( - detail::_tuple_cat< - tpl1_t, make_index_sequence<size1>, - tpl2_t, make_index_sequence<size2> - >::cat( - std::forward<tpl1_t>(tpl1), - std::forward<tpl2_t>(tpl2) - ), - std::forward<tpls_t>(tpls)... - ); -} - -/////// -// tie - -template<typename... args_t> -MGPU_HOST_DEVICE tuple<args_t&...> tie(args_t&... args) { - return tuple<args_t&...>(args...); -} - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/types.hxx b/src/util/cuda/moderngpu/types.hxx deleted file mode 100644 index 5fa78592..00000000 --- a/src/util/cuda/moderngpu/types.hxx +++ /dev/null @@ -1,147 +0,0 @@ -// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com -#pragma once - -#include "meta.hxx" -#include "operators.hxx" - -BEGIN_MGPU_NAMESPACE - -struct cuda_exception_t : std::exception { - cudaError_t result; - - cuda_exception_t(cudaError_t result_) : result(result_) { } - virtual const char* what() const noexcept { - return cudaGetErrorString(result); - } -}; - - -template<typename type_t, int size> -struct array_t { - type_t data[size]; - - MGPU_HOST_DEVICE type_t operator[](int i) const { return data[i]; } - MGPU_HOST_DEVICE type_t& operator[](int i) { return data[i]; } - - array_t() = default; - array_t(const array_t&) = default; - array_t& operator=(const array_t&) = default; - - // Fill the array with x. - MGPU_HOST_DEVICE array_t(type_t x) { - iterate<size>([&](int i) { data[i] = x; }); - } -}; - -template<typename type_t> -struct array_t<type_t, 0> { - MGPU_HOST_DEVICE type_t operator[](int i) const { return type_t(); } - MGPU_HOST_DEVICE type_t& operator[](int i) { return *(type_t*)nullptr; } -}; - -// Reduce on components of array_t. -template<typename type_t, int size, typename op_t = plus_t<type_t> > -MGPU_HOST_DEVICE type_t reduce(array_t<type_t, size> x, op_t op = op_t()) { - type_t a; - iterate<size>([&](int i) { - a = i ? op(a, x[i]) : x[i]; - }); - return a; -} - -// Call the operator component-wise on all components. 
-template<typename type_t, int size, typename op_t> -MGPU_HOST_DEVICE array_t<type_t, size> combine(array_t<type_t, size> x, - array_t<type_t, size> y, op_t op) { - - array_t<type_t, size> z; - iterate<size>([&](int i) { z[i] = op(x[i], y[i]); }); - return z; -} - -template<typename type_t, int size> -MGPU_HOST_DEVICE array_t<type_t, size> operator+( - array_t<type_t, size> a, array_t<type_t, size> b) { - return combine(a, b, plus_t<type_t>()); -} - -template<typename type_t, int size> -MGPU_HOST_DEVICE array_t<type_t, size> operator-( - array_t<type_t, size> a, array_t<type_t, size> b) { - return combine(a, b, minus_t<type_t>()); -} - - -template<typename key_t, typename val_t, int size> -struct kv_array_t { - array_t<key_t, size> keys; - array_t<val_t, size> vals; -}; - -enum bounds_t { - bounds_lower, - bounds_upper -}; - -struct MGPU_ALIGN(8) range_t { - int begin, end; - MGPU_HOST_DEVICE int size() const { return end - begin; } - MGPU_HOST_DEVICE int count() const { return size(); } - MGPU_HOST_DEVICE bool valid() const { return end > begin; } -}; - -MGPU_HOST_DEVICE range_t get_tile(int cta, int nv, int count) { - return range_t { nv * cta, min(count, nv * (cta + 1)) }; -} - - -struct MGPU_ALIGN(16) merge_range_t { - int a_begin, a_end, b_begin, b_end; - - MGPU_HOST_DEVICE int a_count() const { return a_end - a_begin; } - MGPU_HOST_DEVICE int b_count() const { return b_end - b_begin; } - MGPU_HOST_DEVICE int total() const { return a_count() + b_count(); } - - MGPU_HOST_DEVICE range_t a_range() const { - return range_t { a_begin, a_end }; - } - MGPU_HOST_DEVICE range_t b_range() const { - return range_t { b_begin, b_end }; - } - - MGPU_HOST_DEVICE merge_range_t to_local() const { - return merge_range_t { 0, a_count(), a_count(), total() }; - } - - // Partition from mp to the end. - MGPU_HOST_DEVICE merge_range_t partition(int mp0, int diag) const { - return merge_range_t { a_begin + mp0, a_end, b_begin + diag - mp0, b_end }; - } - - // Partition from mp0 to mp1. - MGPU_HOST_DEVICE merge_range_t partition(int mp0, int diag0, - int mp1, int diag1) const { - return merge_range_t { - a_begin + mp0, - a_begin + mp1, - b_begin + diag0 - mp0, - b_begin + diag1 - mp1 - }; - } - - MGPU_HOST_DEVICE bool a_valid() const { - return a_begin < a_end; - } - MGPU_HOST_DEVICE bool b_valid() const { - return b_begin < b_end; - } -}; - -template<typename type_t, int size> -struct merge_pair_t { - array_t<type_t, size> keys; - array_t<int, size> indices; -}; - - -END_MGPU_NAMESPACE diff --git a/src/util/cuda/moderngpu/util.hxx b/src/util/cuda/moderngpu/util.hxx deleted file mode 100644 index 3675d7b9..00000000 --- a/src/util/cuda/moderngpu/util.hxx +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once -#include "types.hxx" -#include <cstdarg> -#include <string> - -BEGIN_MGPU_NAMESPACE - -namespace detail { - -inline std::string stringprintf(const char* format, ...) { - va_list args; - va_start(args, format); - int len = vsnprintf(0, 0, format, args); - va_end(args); - - // allocate space. 
- std::string text; - text.resize(len); - - va_start(args, format); - vsnprintf(&text[0], len + 1, format, args); - va_end(args); - - return text; -} - -} // namespace detail - -END_MGPU_NAMESPACE - diff --git a/src/util/cuda/ofp_context.hxx b/src/util/cuda/ofp_context.hxx deleted file mode 100644 index 70c4ed9e..00000000 --- a/src/util/cuda/ofp_context.hxx +++ /dev/null @@ -1,322 +0,0 @@ -/* - * ofp_context.hxx - * - * Created on: Nov 15, 2018 - * Author: i-bird - */ - -#ifndef OFP_CONTEXT_HXX_ -#define OFP_CONTEXT_HXX_ - -#include <iostream> - -#ifdef CUDA_ON_CPU - -namespace mgpu -{ - enum gpu_context_opt - { - no_print_props,//!< no_print_props - print_props, //!< print_props - dummy //!< dummy - }; - - struct context_t {}; - - class ofp_context_t : public context_t - { - protected: - - std::string _props; - - openfpm::vector<aggregate<unsigned char>> tmem; - - // Making this a template argument means we won't generate an instance - // of dummy_k for each translation unit. - template<int dummy_arg = 0> - void init(int dev_num, gpu_context_opt opt) - {} - - public: - - /*! \brief gpu context constructor - * - * \param opt options for this gpu context - * - */ - ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props , int dev_num = 0, int stream_ = 0) - {} - - ~ofp_context_t() - {} - - virtual const std::string& props() const - { - return _props; - } - - virtual int ptx_version() const - { - return 0; - } - - virtual int stream() - { - std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; - return 0; - } - - // Alloc GPU memory. - virtual void* alloc(size_t size, int space) - { - std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; - return NULL; - } - - virtual void free(void* p, int space) - { - std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; - } - - virtual void synchronize() - { - std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; - } - - virtual int event() - { - std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; - return 0; - } - - virtual void timer_begin() - { - std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; - } - - virtual double timer_end() - { - std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; - return 0.0; - } - - virtual int getDevice() - { - std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl; - return 0; - } - }; - -} - -#else - - #ifdef CUDA_GPU - - #ifdef __NVCC__ - #include "util/cuda/moderngpu/context.hxx" - #else - #include "util/cuda/moderngpu/context_reduced.hxx" - #endif - - namespace mgpu - { - enum gpu_context_opt - { - no_print_props,//!< no_print_props - print_props, //!< print_props - dummy //!< dummy - }; - - - //////////////////////////////////////////////////////////////////////////////// - // standard_context_t is a trivial implementation of context_t. Users can - // derive this type to provide a custom allocator. - - class ofp_context_t : public context_t - { - protected: - cudaDeviceProp _props; - int _ptx_version; - cudaStream_t _stream; - - cudaEvent_t _timer[2]; - cudaEvent_t _event; - - openfpm::vector_gpu<aggregate<unsigned char>> tmem; - openfpm::vector_gpu<aggregate<unsigned char>> tmem2; - openfpm::vector_gpu<aggregate<unsigned char>> tmem3; - - // Making this a template argument means we won't generate an instance - // of dummy_k for each translation unit. 
- template<int dummy_arg = 0> - void init(int dev_num, gpu_context_opt opt) - { - cudaFuncAttributes attr; - #ifdef __NVCC__ - cudaError_t result = cudaFuncGetAttributes(&attr, (void *)dummy_k<0>); - if(cudaSuccess != result) throw cuda_exception_t(result); - _ptx_version = attr.ptxVersion; - #else - _ptx_version = 60; - //std::cout << __FILE__ << ":" << __LINE__ << " Warning initialization of GPU context has been done from a standard Cpp file, rather than a CUDA or HIP file" << std::endl; - #endif - - int num_dev; - cudaGetDeviceCount(&num_dev); - - if (num_dev == 0) {return;} - - if (opt != gpu_context_opt::dummy) - { - cudaSetDevice(dev_num % num_dev); - } - - int ord; - cudaGetDevice(&ord); - cudaGetDeviceProperties(&_props, ord); - - cudaEventCreate(&_timer[0]); - cudaEventCreate(&_timer[1]); - cudaEventCreate(&_event); - } - - public: - - - /*! \brief gpu context constructor - * - * \param opt options for this gpu context - * - */ - ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props , int dev_num = 0, cudaStream_t stream_ = 0) - :context_t(), _stream(stream_) - { - init(dev_num,opt); - if(opt == gpu_context_opt::print_props) - { - printf("%s\n", device_prop_string(_props).c_str()); - } - } - - ~ofp_context_t() - { - cudaEventDestroy(_timer[0]); - cudaEventDestroy(_timer[1]); - cudaEventDestroy(_event); - } - - virtual const cudaDeviceProp& props() const { return _props; } - virtual int ptx_version() const { return _ptx_version; } - virtual cudaStream_t stream() { return _stream; } - - // Alloc GPU memory. - virtual void* alloc(size_t size, memory_space_t space) - { - void* p = nullptr; - if(size) - { - cudaError_t result = (memory_space_device == space) ?cudaMalloc(&p, size) : cudaMallocHost(&p, size); - if(cudaSuccess != result) throw cuda_exception_t(result); - } - return p; - } - - virtual void free(void* p, memory_space_t space) - { - if(p) - { - cudaError_t result = (memory_space_device == space) ? cudaFree(p) : cudaFreeHost(p); - if(cudaSuccess != result) throw cuda_exception_t(result); - } - } - - virtual void synchronize() - { - cudaError_t result = _stream ? 
- cudaStreamSynchronize(_stream) : - cudaDeviceSynchronize(); - if(cudaSuccess != result) throw cuda_exception_t(result); - } - - virtual cudaEvent_t event() - { - return _event; - } - - virtual void timer_begin() - { - cudaEventRecord(_timer[0], _stream); - } - - virtual double timer_end() - { - cudaEventRecord(_timer[1], _stream); - cudaEventSynchronize(_timer[1]); - float ms; - cudaEventElapsedTime(&ms, _timer[0], _timer[1]); - return ms / 1.0e3; - } - - virtual int getDevice() - { - int dev = 0; - - cudaGetDevice(&dev); - - return dev; - } - - virtual int getNDevice() - { - int num_dev; - cudaGetDeviceCount(&num_dev); - - return num_dev; - } - - openfpm::vector_gpu<aggregate<unsigned char>> & getTemporalCUB() - { - return tmem; - } - - openfpm::vector_gpu<aggregate<unsigned char>> & getTemporalCUB2() - { - return tmem2; - } - - openfpm::vector_gpu<aggregate<unsigned char>> & getTemporalCUB3() - { - return tmem3; - } - }; - - } - - #else - - namespace mgpu - { - - enum gpu_context_opt - { - no_print_props,//!< no_print_props - print_props, //!< print_props - dummy //!< dummy - }; - - // Stub class for modern gpu - - struct ofp_context_t - { - ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props , int dev_num = 0) - {} - }; - } - - #endif - -#endif - - -#endif /* OFP_CONTEXT_HXX_ */ diff --git a/src/util/cuda/reduce_ofp.cuh b/src/util/cuda/reduce_ofp.cuh index 6a4fb65f..53c1c741 100644 --- a/src/util/cuda/reduce_ofp.cuh +++ b/src/util/cuda/reduce_ofp.cuh @@ -11,32 +11,26 @@ #ifdef __NVCC__ #include "util/cuda_launch.hpp" +#include "util/ofp_context.hpp" #if CUDART_VERSION >= 11000 - #ifndef CUDA_ON_CPU // Here we have for sure CUDA >= 11 - #ifdef __HIP__ - #include "hipcub/hipcub.hpp" - #else - #include "cub/cub.cuh" - #endif - #ifndef REDUCE_WITH_CUB - #define REDUCE_WITH_CUB - #endif + #ifndef CUDA_ON_CPU + #ifdef __HIP__ + #include "hipcub/hipcub.hpp" + #else + #include "cub/cub.cuh" + #endif #endif #else - // Here we have old CUDA #include "cub_old/cub.cuh" - //#include "util/cuda/moderngpu/kernel_reduce.hxx" - #define REDUCE_WITH_CUB #endif -#include "util/cuda/ofp_context.hxx" namespace openfpm { template<typename input_it, typename output_it, typename reduce_op> - void reduce(input_it input, int count, output_it output, reduce_op op, mgpu::ofp_context_t& context) + void reduce(input_it input, int count, output_it output, reduce_op op, gpu::ofp_context_t& context) { #ifdef CUDA_ON_CPU @@ -47,51 +41,30 @@ namespace openfpm } #else - #ifdef REDUCE_WITH_CUB - #ifdef __HIP__ + #ifdef __HIP__ - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - hipcub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,input, - output, - count, - op, - false); - - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); - - // Run - hipcub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, - output, - count, - op, - false); - #else + size_t temp_storage_bytes = 0; + hipcub::DeviceReduce::Reduce(NULL, + temp_storage_bytes,input, output, count, op, false); - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,input, - output, - count, - op, - false); + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); + hipcub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(), + temp_storage_bytes,input, output, count, op, 
false); + #else - // Run - cub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, - output, - count, - op, - false); + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Reduce(NULL, + temp_storage_bytes, input, output, count, op, false); - #endif + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + cub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(), + temp_storage_bytes, input, output, count, op, false); - #else - mgpu::reduce(input,count,output,op,context); #endif #endif } diff --git a/src/util/cuda/scan_ofp.cuh b/src/util/cuda/scan_ofp.cuh index f91d64a1..97648164 100644 --- a/src/util/cuda/scan_ofp.cuh +++ b/src/util/cuda/scan_ofp.cuh @@ -11,32 +11,26 @@ #ifdef __NVCC__ #include "util/cuda_launch.hpp" +#include "util/ofp_context.hpp" #if CUDART_VERSION >= 11000 - #ifndef CUDA_ON_CPU // Here we have for sure CUDA >= 11 - #ifdef __HIP__ - #include "hipcub/hipcub.hpp" - #else - #include "cub/cub.cuh" - #endif - #ifndef SCAN_WITH_CUB - #define SCAN_WITH_CUB - #endif + #ifndef CUDA_ON_CPU + #ifdef __HIP__ + #include "hipcub/hipcub.hpp" + #else + #include "cub/cub.cuh" + #endif #endif #else - // Here we have old CUDA #include "cub_old/cub.cuh" - //#include "util/cuda/moderngpu/kernel_scan.hxx" - #define SCAN_WITH_CUB #endif -#include "util/cuda/ofp_context.hxx" namespace openfpm { template<typename input_it, typename output_it> - void scan(input_it input, int count, output_it output, mgpu::ofp_context_t& context) + void scan(input_it input, int count, output_it output, gpu::ofp_context_t& context) { #ifdef CUDA_ON_CPU @@ -52,46 +46,32 @@ namespace openfpm } #else - #ifdef SCAN_WITH_CUB - - #ifdef __HIP__ + if (count == 0) return; - if (count == 0) {return;} - - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input, - output, - count); + #ifdef __HIP__ - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); + size_t temp_storage_bytes = 0; + hipcub::DeviceScan::ExclusiveSum(NULL, + temp_storage_bytes,input, output, count); - // Run - hipcub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, - output, - count); + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); - #else + hipcub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), + temp_storage_bytes, input, output, count); - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input, - output, - count); + #else - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); + size_t temp_storage_bytes = 0; + cub::DeviceScan::ExclusiveSum(NULL, + temp_storage_bytes, input, output, count); - // Run - cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input, - output, - count); + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); - #endif + cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), + temp_storage_bytes,input, output, count); - #else - mgpu::scan(input,count,output,context); #endif #endif } diff --git a/src/util/cuda/scan_sort_cuda_unit_tests.cu b/src/util/cuda/scan_sort_cuda_unit_tests.cu index 7314e874..991f7b65 100644 --- a/src/util/cuda/scan_sort_cuda_unit_tests.cu +++ b/src/util/cuda/scan_sort_cuda_unit_tests.cu @@ -8,8 +8,6 @@ #include "util/cuda_util.hpp" #include 
"Vector/map_vector.hpp" -#define SORT_WITH_CUB - #include "sort_ofp.cuh" #include "scan_ofp.cuh" #include "segreduce_ofp.cuh" @@ -37,7 +35,7 @@ BOOST_AUTO_TEST_CASE( test_scan_cub_wrapper ) input.template hostToDevice<0>(); - mgpu::ofp_context_t context; + gpu::ofp_context_t context; openfpm::scan((unsigned int *)input.template getDeviceBuffer<0>(),input.size(),(unsigned int *)output.template getDeviceBuffer<0>(),context); output.template deviceToHost<0>(); @@ -78,11 +76,11 @@ BOOST_AUTO_TEST_CASE( test_sort_cub_wrapper ) input.template hostToDevice<0>(); input_id.template hostToDevice<0>(); - mgpu::ofp_context_t context; + gpu::ofp_context_t context; openfpm::sort((unsigned int *)input.template getDeviceBuffer<0>(), (unsigned int *)input_id.template getDeviceBuffer<0>(), - input.size(),mgpu::template less_t<unsigned int>(),context); + input.size(),gpu::template less_t<unsigned int>(),context); input.template deviceToHost<0>(); input_id.template deviceToHost<0>(); @@ -94,7 +92,7 @@ BOOST_AUTO_TEST_CASE( test_sort_cub_wrapper ) openfpm::sort((unsigned int *)input.template getDeviceBuffer<0>(), (unsigned int *)input_id.template getDeviceBuffer<0>(), - input.size(),mgpu::template greater_t<unsigned int>(),context); + input.size(),gpu::template greater_t<unsigned int>(),context); input.template deviceToHost<0>(); input_id.template deviceToHost<0>(); @@ -113,7 +111,7 @@ BOOST_AUTO_TEST_CASE( test_seg_reduce_wrapper ) { std::cout << "Test gpu segmented reduce" << "\n"; - mgpu::ofp_context_t context; + gpu::ofp_context_t context; int count = 130; @@ -144,23 +142,25 @@ BOOST_AUTO_TEST_CASE( test_seg_reduce_wrapper ) base += c; } + segment_offset.add(); + segment_offset.template get<0>(segment_offset.size() - 1) = vgpu.size(); vgpu.hostToDevice<0>(); segment_offset.hostToDevice<0>(); - output.resize(segment_offset.size()); + output.resize(segment_offset.size()-1); openfpm::segreduce((int *)vgpu.template getDeviceBuffer<0>(), vgpu.size(), - (int *)segment_offset.template getDeviceBuffer<0>(), segment_offset.size(), + (int *)segment_offset.template getDeviceBuffer<0>(), segment_offset.size()-1, (int *)output.template getDeviceBuffer<0>(), - mgpu::plus_t<int>(), init, context); + gpu::plus_t<int>(), init, context); output.template deviceToHost<0>(); bool match = true; size_t i = 0; - for ( ; i < segment_offset.size()-1 ; i++) + for ( ; i < segment_offset.size()-2 ; i++) { size_t red = 0; for (size_t j = 0 ; j < segment_offset.template get<0>(i+1) - segment_offset.template get<0>(i) ; j++) @@ -173,7 +173,7 @@ BOOST_AUTO_TEST_CASE( test_seg_reduce_wrapper ) BOOST_REQUIRE_EQUAL(match,true); size_t red2 = 0; - for (size_t j = 0 ; j < vgpu.size() - segment_offset.template get<0>(i) ; j++) + for (size_t j = 0 ; j < segment_offset.template get<0>(i+1) - segment_offset.template get<0>(i) ; j++) { red2 += vgpu.template get<0>(segment_offset.template get<0>(i) + j); } diff --git a/src/util/cuda/segreduce_ofp.cuh b/src/util/cuda/segreduce_ofp.cuh index 9a0c3764..57eae7cd 100644 --- a/src/util/cuda/segreduce_ofp.cuh +++ b/src/util/cuda/segreduce_ofp.cuh @@ -10,64 +10,22 @@ #ifdef __NVCC__ - #include "Vector/map_vector.hpp" #include "util/cuda_launch.hpp" - #include "util/cuda/segreduce_ofp.cuh" + #include "util/ofp_context.hpp" #if CUDART_VERSION >= 11000 - #ifndef CUDA_ON_CPU - // Here we have for sure CUDA >= 11 - #ifdef __HIP__ - #undef __CUDACC__ - #undef __CUDA__ - #include <thrust/reduce.h> - #define __CUDACC__ - #define __CUDA__ - #else - #include "util/cuda/moderngpu/kernel_segreduce.hxx" - #endif - 
#endif - #else - #include "util/cuda/moderngpu/kernel_segreduce.hxx" - #endif - #include "util/cuda/ofp_context.hxx" + // Here we have for sure CUDA >= 11 + #ifndef CUDA_ON_CPU + #ifdef __HIP__ + #include "hipcub/hipcub.hpp" + #else + #include "cub/cub.cuh" + #endif + #endif +#else + #include "cub_old/cub.cuh" +#endif -template<typename segments_it, typename keys_type, typename output_it, typename seg_type, typename type_t> -__global__ void seg_to_keys(segments_it segs, keys_type keys, seg_type seg_out ,output_it output, int n_count, int num_segments,type_t init) -{ - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - if (tid >= num_segments) {return;} - - int s = segs[tid]; - int s_p1 = (tid == num_segments -1)?n_count:segs[tid+1]; - - int n_ele = s_p1 - s; - - seg_out.template get<1>(tid) = (s != s_p1); - output[tid] = init; - - for (int j = 0 ; j < n_ele ; j++) - { - keys.template get<0>(s + j) = tid; - } -} - -template<typename output_it, typename out_tmp_type ,typename segs_type> -__global__ void realign_output(output_it out, out_tmp_type out_tmp, segs_type segs, int num_segments) -{ - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - if (tid >= num_segments) {return;} - - int t = segs.template get<2>(tid); - int to_copy = segs.template get<1>(tid); - - auto op = out_tmp.template get<0>(t); - - if (to_copy == 1) - {out[tid] = op;} -} namespace openfpm { @@ -75,7 +33,7 @@ __global__ void realign_output(output_it out, out_tmp_type out_tmp, segs_type se typename segments_it, typename output_it, typename op_t, typename type_t> void segreduce(input_it input, int count, segments_it segments, int num_segments, output_it output, op_t op, type_t init, - mgpu::ofp_context_t & context) + gpu::ofp_context_t & context) { #ifdef CUDA_ON_CPU @@ -106,47 +64,33 @@ __global__ void realign_output(output_it out, out_tmp_type out_tmp, segs_type se } #else - #ifdef __HIP__ - typedef typename std::remove_pointer<segments_it>::type index_type; - typedef typename std::remove_pointer<output_it>::type out_type; - - openfpm::vector_gpu<aggregate<index_type>> keys; - keys.resize(count); - - openfpm::vector_gpu<aggregate<index_type,index_type,index_type>> segs_out; - segs_out.resize(num_segments); + size_t temp_storage_bytes = 0; - openfpm::vector_gpu<aggregate<out_type>> out_tmp; - out_tmp.resize(num_segments); + hipcub::DeviceSegmentedReduce::Reduce(NULL, temp_storage_bytes, input, output, + num_segments, segments, segments + 1, op, init); - grid_sm<1,void> g(num_segments); + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); - auto it = g.getGPUIterator(); + hipcub::DeviceSegmentedReduce::Reduce(temporal.getDeviceBuffer<0>(), temp_storage_bytes, input, output, + num_segments, segments, segments + 1, op, init); - CUDA_LAUNCH(seg_to_keys,it,segments,keys.toKernel(),segs_out.toKernel(),output,count,num_segments,init); - - openfpm::scan((index_type *)segs_out.template getDeviceBuffer<1>(),num_segments,(index_type *)segs_out.template getDeviceBuffer<2>(),context); + #else - thrust::pair<index_type *,out_type *> new_end; - new_end = thrust::reduce_by_key(thrust::device, (segments_it)keys.template getDeviceBuffer<0>(),((segments_it)keys.template getDeviceBuffer<0>()) + count, - input, - (segments_it)segs_out.template getDeviceBuffer<0>(), - (output_it)out_tmp.template getDeviceBuffer<0>(), - thrust::equal_to<int>(), - op); + size_t temp_storage_bytes = 0; - // .. 
Not so easy to emulate a segmented reduce we have to track the zeros segments and realign the output + cub::DeviceSegmentedReduce::Reduce(NULL, temp_storage_bytes, input, output, + num_segments, segments, segments + 1, op, init); - CUDA_LAUNCH(realign_output,it,output,out_tmp.toKernel(),segs_out.toKernel(),num_segments); + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); - #else - - mgpu::segreduce(input,count,segments,num_segments,output,op,init,context); + cub::DeviceSegmentedReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes, input, output, + num_segments, segments, segments + 1, op, init); #endif - #endif } } diff --git a/src/util/cuda/sort_ofp.cuh b/src/util/cuda/sort_ofp.cuh index 52deb9d6..d689668b 100644 --- a/src/util/cuda/sort_ofp.cuh +++ b/src/util/cuda/sort_ofp.cuh @@ -12,27 +12,22 @@ #ifdef __NVCC__ #include "util/cuda_launch.hpp" +#include "util/ofp_context.hpp" #if CUDART_VERSION >= 11000 - #ifndef CUDA_ON_CPU // Here we have for sure CUDA >= 11 - #ifdef __HIP__ - #include "hipcub/hipcub.hpp" - #else - #include "cub/cub.cuh" - #endif - #ifndef SORT_WITH_CUB - #define SORT_WITH_CUB - #endif + #ifndef CUDA_ON_CPU + #ifdef __HIP__ + #include "hipcub/hipcub.hpp" + #else + #include "cub/cub.cuh" + #endif #endif #else // Here we have old CUDA #include "cub_old/cub.cuh" - //#include "util/cuda/moderngpu/kernel_mergesort.hxx" - #define SORT_WITH_CUB #endif -#include "util/cuda/ofp_context.hxx" template<typename key_t, typename val_t> struct key_val_ref; @@ -266,7 +261,7 @@ namespace openfpm template<typename key_t, typename val_t, typename comp_t> void sort(key_t* keys_input, val_t* vals_input, int count, - comp_t comp, mgpu::ofp_context_t& context) + comp_t comp, gpu::ofp_context_t& context) { #ifdef CUDA_ON_CPU @@ -275,142 +270,133 @@ namespace openfpm std::sort(kv,kv+count,comp); #else + #ifdef __HIP__ - #ifdef SORT_WITH_CUB - - #ifdef __HIP__ - - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - - auto & temporal2 = context.getTemporalCUB2(); - temporal2.resize(sizeof(key_t)*count); - - auto & temporal3 = context.getTemporalCUB3(); - temporal3.resize(sizeof(val_t)*count); - - if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true) - { - hipcub::DeviceRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); - - d_temp_storage = temporal.template getDeviceBuffer<0>(); - - // Run - hipcub::DeviceRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - } - else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true) - { - hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); - - d_temp_storage = temporal.template getDeviceBuffer<0>(); - - // Run - hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - 
count); - } - - cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); - cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); - - - #else - - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - - auto & temporal2 = context.getTemporalCUB2(); - temporal2.resize(sizeof(key_t)*count); - - auto & temporal3 = context.getTemporalCUB3(); - temporal3.resize(sizeof(val_t)*count); - - if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true) - { - cub::DeviceRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); - - d_temp_storage = temporal.template getDeviceBuffer<0>(); - - // Run - cub::DeviceRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - } - else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true) - { - cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - - auto & temporal = context.getTemporalCUB(); - temporal.resize(temp_storage_bytes); - - d_temp_storage = temporal.template getDeviceBuffer<0>(); - - // Run - cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, - temp_storage_bytes, - keys_input, - (key_t *)temporal2.template getDeviceBuffer<0>(), - vals_input, - (val_t *)temporal3.template getDeviceBuffer<0>(), - count); - } - - cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); - cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); - - #endif + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + auto & temporal2 = context.getTemporalCUB2(); + temporal2.resize(sizeof(key_t)*count); + + auto & temporal3 = context.getTemporalCUB3(); + temporal3.resize(sizeof(val_t)*count); + + if (std::is_same<gpu::template less_t<key_t>,comp_t>::value == true) + { + hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + hipcub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + else if (std::is_same<gpu::template greater_t<key_t>,comp_t>::value == true) + { + hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + 
(key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + + cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); + cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); #else - mgpu::mergesort(keys_input,vals_input,count,comp,context); - #endif + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + auto & temporal2 = context.getTemporalCUB2(); + temporal2.resize(sizeof(key_t)*count); + + auto & temporal3 = context.getTemporalCUB3(); + temporal3.resize(sizeof(val_t)*count); + + if (std::is_same<gpu::template less_t<key_t>,comp_t>::value == true) + { + cub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + cub::DeviceRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + else if (std::is_same<gpu::template greater_t<key_t>,comp_t>::value == true) + { + cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + + auto & temporal = context.getTemporalCUB(); + temporal.resize(temp_storage_bytes); + + d_temp_storage = temporal.template getDeviceBuffer<0>(); + + // Run + cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + keys_input, + (key_t *)temporal2.template getDeviceBuffer<0>(), + vals_input, + (val_t *)temporal3.template getDeviceBuffer<0>(), + count); + } + + cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice); + cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice); + + #endif #endif } } diff --git a/src/util/cuda/test/segreduce_block_cuda_tests.cu b/src/util/cuda/test/segreduce_block_cuda_tests.cu index 573f7f91..5d67eefa 100644 --- a/src/util/cuda/test/segreduce_block_cuda_tests.cu +++ b/src/util/cuda/test/segreduce_block_cuda_tests.cu @@ -165,8 +165,8 @@ BOOST_AUTO_TEST_SUITE(segreduce_block_cuda_tests) // template<unsigned int chunksPerBlock, typename op, typename SegType, typename DataType, typename MaskType> // segreduce(DataType *data, SegType *segments, MaskType *masks, DataType *output, MaskType *outputMasks) -// segreduce<2, mgpu::maximum_t<ScalarT>> <<< outputData.size(), 2*BlockT::size >>> ( - CUDA_LAUNCH_DIM3((segreduce_block<2, mgpu::plus_t<ScalarT>>), outputData.size(), 2*BlockT::size, +// segreduce<2, gpu::maximum_t<ScalarT>> <<< outputData.size(), 2*BlockT::size >>> ( + CUDA_LAUNCH_DIM3((segreduce_block<2, gpu::plus_t<ScalarT>>), outputData.size(), 2*BlockT::size, (BlockT *) data.template getDeviceBuffer<BLOCK>(), (int *) segments.template getDeviceBuffer<0>(), (MaskBlockT *) data.template getDeviceBuffer<BITMASK>(), @@ -174,7 +174,7 @@ BOOST_AUTO_TEST_SUITE(segreduce_block_cuda_tests) ); // Segreduce on mask - CUDA_LAUNCH_DIM3((segreduce_block<2, mgpu::maximum_t<unsigned char>>), outputData.size(), 2*BlockT::size, + 
CUDA_LAUNCH_DIM3((segreduce_block<2, gpu::maximum_t<unsigned char>>), outputData.size(), 2*BlockT::size, (MaskBlockT *) data.template getDeviceBuffer<BITMASK>(), (int *) segments.template getDeviceBuffer<0>(), (MaskBlockT *) data.template getDeviceBuffer<BITMASK>(), -- GitLab
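Note on the new wrapper API (editorial addition, not part of the patch): after this change every parallel-primitive wrapper takes a gpu::ofp_context_t instead of mgpu::ofp_context_t, the SCAN_WITH_CUB / SORT_WITH_CUB switches are gone, and the functors (less_t, greater_t, plus_t, maximum_t) come from the gpu namespace. The sketch below is a minimal, hypothetical usage example assembled from the updated unit tests in this patch; the container declarations (openfpm::vector_gpu / aggregate), the fill values, and the init value 0 are illustrative assumptions, while the wrapper signatures and the trailing "end" offset required by cub::DeviceSegmentedReduce are taken from the diff itself.

    #include "Vector/map_vector.hpp"
    #include "util/cuda/scan_ofp.cuh"
    #include "util/cuda/sort_ofp.cuh"
    #include "util/cuda/segreduce_ofp.cuh"

    void wrappers_usage_sketch()
    {
        gpu::ofp_context_t context;   // owns the temporal CUB buffers the wrappers resize on demand

        // --- exclusive scan (routed to cub::DeviceScan::ExclusiveSum) ---
        openfpm::vector_gpu<aggregate<unsigned int>> in, out;
        in.resize(100);  out.resize(100);
        for (size_t i = 0; i < in.size(); i++)  {in.template get<0>(i) = 1;}
        in.template hostToDevice<0>();

        openfpm::scan((unsigned int *)in.template getDeviceBuffer<0>(), in.size(),
                      (unsigned int *)out.template getDeviceBuffer<0>(), context);

        // --- key/value sort: gpu::less_t selects SortPairs, gpu::greater_t SortPairsDescending ---
        openfpm::vector_gpu<aggregate<unsigned int>> keys, vals;
        keys.resize(100); vals.resize(100);
        for (size_t i = 0; i < keys.size(); i++)
        {keys.template get<0>(i) = keys.size() - i; vals.template get<0>(i) = i;}
        keys.template hostToDevice<0>(); vals.template hostToDevice<0>();

        openfpm::sort((unsigned int *)keys.template getDeviceBuffer<0>(),
                      (unsigned int *)vals.template getDeviceBuffer<0>(),
                      keys.size(), gpu::template less_t<unsigned int>(), context);

        // --- segmented reduce: cub::DeviceSegmentedReduce expects num_segments+1 offsets,
        //     hence the trailing "end" offset added to the updated unit test ---
        openfpm::vector_gpu<aggregate<int>> data, offsets, red;
        data.resize(130);
        for (size_t i = 0; i < data.size(); i++)  {data.template get<0>(i) = 1;}

        offsets.add(); offsets.template get<0>(0) = 0;
        offsets.add(); offsets.template get<0>(1) = 50;
        offsets.add(); offsets.template get<0>(2) = data.size();   // sentinel end offset

        red.resize(offsets.size() - 1);
        data.template hostToDevice<0>(); offsets.template hostToDevice<0>();

        openfpm::segreduce((int *)data.template getDeviceBuffer<0>(), data.size(),
                           (int *)offsets.template getDeviceBuffer<0>(), offsets.size() - 1,
                           (int *)red.template getDeviceBuffer<0>(),
                           gpu::plus_t<int>(), 0, context);
    }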