From 29305a648f37f6e1f8adf2d0f719406044ea1565 Mon Sep 17 00:00:00 2001
From: Serhii Yaskovets <yaskovet@mpi-cbg.de>
Date: Mon, 31 Jul 2023 18:48:29 +0200
Subject: [PATCH] Move parallel primitives library from moderngpu to CUB

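Replace the vendored moderngpu headers under src/util/cuda/moderngpu/
with CUB-backed implementations behind the existing util/cuda/*_ofp.cuh
wrappers (scan, sort, merge, reduce, segreduce). The visible change for
callers is the context type and operator namespace: mgpu::ofp_context_t
(from util/cuda/ofp_context.hxx, removed here) becomes gpu::ofp_context_t
(included via util/ofp_context.hpp), and the operator functors plus_t,
minimum_t, maximum_t and less_t move from mgpu:: to gpu::. A minimal
before/after sketch of a typical call site; d_in, d_out and n stand in
for a device input buffer, a device output buffer and the element count:

    // before: mgpu::ofp_context_t ctx;
    gpu::ofp_context_t ctx;

    // before: openfpm::scan(d_in, n, d_out, ctx) resolved to moderngpu;
    // it now resolves to the CUB-backed implementation in scan_ofp.cuh.
    openfpm::scan(d_in, n, d_out, ctx);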
---
 src/CMakeLists.txt                            |  46 +-
 src/Grid/cuda/cuda_grid_gpu_int.cu            |   2 +-
 src/NN/CellList/CellList_gpu_test.cu          |  16 +-
 src/NN/CellList/CellList_util.hpp             |  14 +-
 src/NN/CellList/cuda/CellList_gpu.hpp         |  28 +-
 src/NN/VerletList/VerletListFast.hpp          |   2 +-
 src/SparseGridGpu/BlockMapGpu.hpp             |   2 +-
 src/SparseGridGpu/BlockMapGpu_kernels.cuh     |   6 +-
 src/SparseGridGpu/SparseGridGpu.hpp           |  34 +-
 .../SparseGridGpu_performance_get_nn.cu       |   2 +-
 .../SparseGridGpu_performance_get_single.cu   |   2 +-
 .../SparseGridGpu_performance_heat_stencil.cu |   2 +-
 ...arseGridGpu_performance_heat_stencil_3d.cu |   4 +-
 ...GridGpu_performance_heat_stencil_sparse.cu |   2 +-
 .../SparseGridGpu_performance_insert_block.cu |   2 +-
 ...SparseGridGpu_performance_insert_single.cu |   2 +-
 .../SparseGridGpu_performance_tests.cu        |   4 +-
 .../tests/BlockMapGpu_kernels_tests.cu        |   8 +-
 src/SparseGridGpu/tests/BlockMapGpu_tests.cu  |   6 +-
 .../tests/SparseGridGpu_tests.cu              |  46 +-
 .../tests/utils/SparseGridGpu_util_test.cuh   |   4 +-
 .../map_vector_sparse_cuda_ker_unit_tests.cu  |  12 +-
 .../cuda/map_vector_sparse_cuda_kernels.cuh   |  50 +-
 ...p_vector_sparse_cuda_kernels_unit_tests.cu |  14 +-
 src/Vector/map_vector_sparse.hpp              |  71 ++-
 src/Vector/map_vector_sparse_unit_tests.cu    |   2 +-
 src/util/cuda/merge_ofp.cuh                   |  13 +-
 src/util/cuda/modern_gpu_tests.cu             | 222 ---------
 src/util/cuda/moderngpu/context.hxx           | 221 ---------
 src/util/cuda/moderngpu/context_reduced.hxx   | 107 -----
 src/util/cuda/moderngpu/cpp11.hxx             | 154 ------
 src/util/cuda/moderngpu/cta_load_balance.hxx  | 263 -----------
 src/util/cuda/moderngpu/cta_merge.hxx         | 209 ---------
 src/util/cuda/moderngpu/cta_mergesort.hxx     | 140 ------
 src/util/cuda/moderngpu/cta_reduce.hxx        | 134 ------
 src/util/cuda/moderngpu/cta_scan.hxx          | 231 ---------
 src/util/cuda/moderngpu/cta_search.hxx        | 100 ----
 src/util/cuda/moderngpu/cta_segscan.hxx       | 119 -----
 src/util/cuda/moderngpu/cta_segsort.hxx       | 226 ---------
 src/util/cuda/moderngpu/intrinsics.hxx        | 363 --------------
 src/util/cuda/moderngpu/kernel_bulkinsert.hxx |  18 -
 src/util/cuda/moderngpu/kernel_bulkremove.hxx |  91 ----
 src/util/cuda/moderngpu/kernel_compact.hxx    | 139 ------
 .../cuda/moderngpu/kernel_intervalmove.hxx    |  67 ---
 src/util/cuda/moderngpu/kernel_join.hxx       |  50 --
 .../cuda/moderngpu/kernel_load_balance.hxx    |  88 ----
 src/util/cuda/moderngpu/kernel_merge.hxx      |  92 ----
 src/util/cuda/moderngpu/kernel_mergesort.hxx  | 150 ------
 src/util/cuda/moderngpu/kernel_reduce.hxx     |  70 ---
 src/util/cuda/moderngpu/kernel_scan.hxx       | 198 --------
 src/util/cuda/moderngpu/kernel_segreduce.hxx  | 406 ----------------
 src/util/cuda/moderngpu/kernel_segsort.hxx    | 444 ------------------
 .../cuda/moderngpu/kernel_sortedsearch.hxx    |  64 ---
 src/util/cuda/moderngpu/kernel_workcreate.hxx | 272 -----------
 src/util/cuda/moderngpu/launch_box.hxx        |  93 ----
 src/util/cuda/moderngpu/launch_params.hxx     | 152 ------
 src/util/cuda/moderngpu/loadstore.hxx         | 188 --------
 src/util/cuda/moderngpu/memory.hxx            | 131 ------
 src/util/cuda/moderngpu/meta.hxx              | 249 ----------
 src/util/cuda/moderngpu/operators.hxx         | 347 --------------
 src/util/cuda/moderngpu/search.hxx            |  53 ---
 src/util/cuda/moderngpu/sort_networks.hxx     |  57 ---
 src/util/cuda/moderngpu/transform.hxx         | 107 -----
 src/util/cuda/moderngpu/tuple.hxx             | 393 ----------------
 src/util/cuda/moderngpu/types.hxx             | 147 ------
 src/util/cuda/moderngpu/util.hxx              |  30 --
 src/util/cuda/ofp_context.hxx                 | 322 -------------
 src/util/cuda/reduce_ofp.cuh                  |  77 +--
 src/util/cuda/scan_ofp.cuh                    |  70 +--
 src/util/cuda/scan_sort_cuda_unit_tests.cu    |  24 +-
 src/util/cuda/segreduce_ofp.cuh               | 112 ++---
 src/util/cuda/sort_ofp.cuh                    | 276 ++++++-----
 .../cuda/test/segreduce_block_cuda_tests.cu   |   6 +-
 73 files changed, 405 insertions(+), 7463 deletions(-)
 delete mode 100644 src/util/cuda/modern_gpu_tests.cu
 delete mode 100644 src/util/cuda/moderngpu/context.hxx
 delete mode 100644 src/util/cuda/moderngpu/context_reduced.hxx
 delete mode 100644 src/util/cuda/moderngpu/cpp11.hxx
 delete mode 100644 src/util/cuda/moderngpu/cta_load_balance.hxx
 delete mode 100644 src/util/cuda/moderngpu/cta_merge.hxx
 delete mode 100644 src/util/cuda/moderngpu/cta_mergesort.hxx
 delete mode 100644 src/util/cuda/moderngpu/cta_reduce.hxx
 delete mode 100644 src/util/cuda/moderngpu/cta_scan.hxx
 delete mode 100644 src/util/cuda/moderngpu/cta_search.hxx
 delete mode 100644 src/util/cuda/moderngpu/cta_segscan.hxx
 delete mode 100644 src/util/cuda/moderngpu/cta_segsort.hxx
 delete mode 100644 src/util/cuda/moderngpu/intrinsics.hxx
 delete mode 100644 src/util/cuda/moderngpu/kernel_bulkinsert.hxx
 delete mode 100644 src/util/cuda/moderngpu/kernel_bulkremove.hxx
 delete mode 100644 src/util/cuda/moderngpu/kernel_compact.hxx
 delete mode 100644 src/util/cuda/moderngpu/kernel_intervalmove.hxx
 delete mode 100644 src/util/cuda/moderngpu/kernel_join.hxx
 delete mode 100644 src/util/cuda/moderngpu/kernel_load_balance.hxx
 delete mode 100644 src/util/cuda/moderngpu/kernel_merge.hxx
 delete mode 100644 src/util/cuda/moderngpu/kernel_mergesort.hxx
 delete mode 100644 src/util/cuda/moderngpu/kernel_reduce.hxx
 delete mode 100644 src/util/cuda/moderngpu/kernel_scan.hxx
 delete mode 100644 src/util/cuda/moderngpu/kernel_segreduce.hxx
 delete mode 100644 src/util/cuda/moderngpu/kernel_segsort.hxx
 delete mode 100644 src/util/cuda/moderngpu/kernel_sortedsearch.hxx
 delete mode 100644 src/util/cuda/moderngpu/kernel_workcreate.hxx
 delete mode 100644 src/util/cuda/moderngpu/launch_box.hxx
 delete mode 100644 src/util/cuda/moderngpu/launch_params.hxx
 delete mode 100644 src/util/cuda/moderngpu/loadstore.hxx
 delete mode 100644 src/util/cuda/moderngpu/memory.hxx
 delete mode 100644 src/util/cuda/moderngpu/meta.hxx
 delete mode 100644 src/util/cuda/moderngpu/operators.hxx
 delete mode 100644 src/util/cuda/moderngpu/search.hxx
 delete mode 100644 src/util/cuda/moderngpu/sort_networks.hxx
 delete mode 100644 src/util/cuda/moderngpu/transform.hxx
 delete mode 100644 src/util/cuda/moderngpu/tuple.hxx
 delete mode 100644 src/util/cuda/moderngpu/types.hxx
 delete mode 100644 src/util/cuda/moderngpu/util.hxx
 delete mode 100644 src/util/cuda/ofp_context.hxx

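Reviewer note (below the --- separator, so not part of the commit
message): the rewritten wrapper bodies in scan_ofp.cuh, sort_ofp.cuh,
reduce_ofp.cuh and segreduce_ofp.cuh are large, so here is a minimal
sketch of the CUB call pattern they now sit on, assuming the usual
two-phase temp-storage protocol of cub::DeviceScan. The function name
scan_sketch and the explicit cudaMalloc are illustrative only, not the
actual wrapper code:

    #include <cub/cub.cuh>

    // Illustrative sketch only -- not the scan_ofp.cuh implementation.
    inline void scan_sketch(const int* d_in, int n, int* d_out)
    {
        // CUB primitives are called twice: the first call, with a null
        // temp buffer, only reports the scratch size it needs...
        void*  d_tmp     = nullptr;
        size_t tmp_bytes = 0;
        cub::DeviceScan::ExclusiveSum(d_tmp, tmp_bytes, d_in, d_out, n);

        // ...the second call, with the buffer allocated, does the scan.
        cudaMalloc(&d_tmp, tmp_bytes);
        cub::DeviceScan::ExclusiveSum(d_tmp, tmp_bytes, d_in, d_out, n);
        cudaFree(d_tmp);
    }

The new op_initial_value functors (zero_t, limit_max_t) added in
map_vector_sparse_cuda_kernels.cuh fit the same picture: presumably
because CUB's segmented reduce takes an explicit initial value (used for
empty segments) where moderngpu's operators carried their own identity,
each reduction operator now also names its identity element.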
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cc9991a0..2aa84f19 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,8 +3,6 @@ cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
 
 ########################### Executables
 
-add_definitions(-DSCAN_WITH_CUB)
-
 
 if (TEST_PERFORMANCE)
 	set(CUDA_SOURCES SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu
@@ -31,7 +29,6 @@ if (NOT CUDA_ON_BACKEND STREQUAL "None")
             NN/CellList/CellList_gpu_test.cu 
 	    util/cuda/scan_sort_cuda_unit_tests.cu
             Grid/cuda/cuda_grid_unit_tests_func.cu
-            util/cuda/modern_gpu_tests.cu
             Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu
             NN/CellList/tests/CellDecomposer_gpu_ker_unit_test.cu
 	    SparseGridGpu/tests/BlockMapGpu_tests.cu
@@ -448,8 +445,7 @@ install(FILES util/multi_array_openfpm/array_openfpm.hpp
 	COMPONENT OpenFPM)
 
 
-install(FILES util/cuda/ofp_context.hxx
-        util/cuda/kernels.cuh
+install(FILES util/cuda/kernels.cuh
         util/cuda/scan_ofp.cuh
 	util/cuda/sort_ofp.cuh
 	util/cuda/reduce_ofp.cuh
@@ -490,46 +486,6 @@ install (FILES  SparseGridGpu/TemplateUtils/mathUtils.hpp
 		DESTINATION openfpm_data/include/SparseGridGpu/TemplateUtils/ 
 		COMPONENT OpenFPM)
 
-install(FILES util/cuda/moderngpu/context.hxx
-        util/cuda/moderngpu/context_reduced.hxx
-        util/cuda/moderngpu/cpp11.hxx
-        util/cuda/moderngpu/cta_load_balance.hxx
-        util/cuda/moderngpu/cta_merge.hxx
-        util/cuda/moderngpu/cta_mergesort.hxx
-        util/cuda/moderngpu/cta_reduce.hxx
-        util/cuda/moderngpu/cta_scan.hxx
-        util/cuda/moderngpu/cta_search.hxx
-        util/cuda/moderngpu/cta_segscan.hxx
-        util/cuda/moderngpu/intrinsics.hxx
-        util/cuda/moderngpu/kernel_bulkinsert.hxx
-        util/cuda/moderngpu/kernel_bulkremove.hxx
-        util/cuda/moderngpu/kernel_compact.hxx
-        util/cuda/moderngpu/kernel_intervalmove.hxx
-        util/cuda/moderngpu/kernel_join.hxx
-        util/cuda/moderngpu/kernel_load_balance.hxx
-        util/cuda/moderngpu/kernel_merge.hxx
-        util/cuda/moderngpu/kernel_mergesort.hxx
-        util/cuda/moderngpu/kernel_reduce.hxx
-        util/cuda/moderngpu/kernel_scan.hxx
-        util/cuda/moderngpu/kernel_segreduce.hxx
-        util/cuda/moderngpu/kernel_segsort.hxx
-        util/cuda/moderngpu/kernel_sortedsearch.hxx
-        util/cuda/moderngpu/kernel_workcreate.hxx
-        util/cuda/moderngpu/launch_box.hxx
-        util/cuda/moderngpu/launch_params.hxx
-        util/cuda/moderngpu/loadstore.hxx
-        util/cuda/moderngpu/memory.hxx
-        util/cuda/moderngpu/meta.hxx
-        util/cuda/moderngpu/operators.hxx
-        util/cuda/moderngpu/search.hxx
-        util/cuda/moderngpu/sort_networks.hxx
-        util/cuda/moderngpu/transform.hxx
-        util/cuda/moderngpu/tuple.hxx
-        util/cuda/moderngpu/types.hxx
-        util/cuda/moderngpu/util.hxx
-        DESTINATION openfpm_data/include/util/cuda/moderngpu/
-	COMPONENT OpenFPM)
-
 #if(BUILD_TESTING)
 
 #  add_executable(particle_test test.cu)
diff --git a/src/Grid/cuda/cuda_grid_gpu_int.cu b/src/Grid/cuda/cuda_grid_gpu_int.cu
index aaab8292..74976b8e 100644
--- a/src/Grid/cuda/cuda_grid_gpu_int.cu
+++ b/src/Grid/cuda/cuda_grid_gpu_int.cu
@@ -172,7 +172,7 @@ BOOST_AUTO_TEST_CASE (gpu_p2m)
 
 	size_t g_m = pos.size();
 
-	mgpu::ofp_context_t context(false);
+	gpu::ofp_context_t context(false);
 
 	const size_t (& sz)[3] = grid.getGrid().getSize();
 
diff --git a/src/NN/CellList/CellList_gpu_test.cu b/src/NN/CellList/CellList_gpu_test.cu
index 2d19dd2a..d6cd4844 100644
--- a/src/NN/CellList/CellList_gpu_test.cu
+++ b/src/NN/CellList/CellList_gpu_test.cu
@@ -464,7 +464,7 @@ void test_cell_count_n()
 
 	CUDA_LAUNCH_DIM3(construct_cells,1,1,vs.toKernel(),gs);
 
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 
 	vs.flush<sadd_<0>>(ctx,flush_type::FLUSH_ON_DEVICE);
 
@@ -786,8 +786,8 @@ template<unsigned int dim, typename T, typename CellS> void Test_cell_gpu(SpaceB
 	pl.template hostToDevice<0>();
 	pl_prp.template hostToDevice<0,1,2>();
 
-	// create an mgpu context
-	mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
+	// create a gpu context
+	gpu::ofp_context_t context(gpu::gpu_context_opt::no_print_props);
 	cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context);
 
 	// Check
@@ -1358,7 +1358,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di
 
 	size_t g_m = pl.size() / 2;
 
-	mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
+	gpu::ofp_context_t context(gpu::gpu_context_opt::no_print_props);
 	cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);
 
 	auto & s_t_ns = cl2.getSortToNonSort();
@@ -1564,7 +1564,7 @@ void Test_cell_gpu_force_split(SpaceBox<dim,T> & box, size_t npart, const size_t
 
 	size_t g_m = pl.size() / 2;
 
-	mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
+	gpu::ofp_context_t context(gpu::gpu_context_opt::no_print_props);
 	cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size()/2);
 	cl2_split2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,pl.size()/2,pl.size());
 	auto & s_t_ns_s1 = cl2_split1.getSortToNonSort();
@@ -1809,7 +1809,7 @@ BOOST_AUTO_TEST_CASE( CellList_gpu_use_calc_force_box_split)
 
 	size_t g_m = pl.size() / 2;
 
-	mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
+	gpu::ofp_context_t context(gpu::gpu_context_opt::no_print_props);
 
 	cl2_split1.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,0,pl.size()/2);
 	cl2_split2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m,pl.size()/2,pl.size());
@@ -2013,7 +2013,7 @@ BOOST_AUTO_TEST_CASE( CellList_use_cpu_offload_test )
 	openfpm::vector_gpu<aggregate<int>> os_scan;
 	os_scan.resize(v.size());
 
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	openfpm::scan((int *)os.template getDeviceBuffer<0>(),os.size(),(int *)os_scan.template getDeviceBuffer<0>(),ctx);
 
 	os_scan.deviceToHost<0>();
@@ -2089,7 +2089,7 @@ BOOST_AUTO_TEST_CASE( CellList_swap_test )
 
 	size_t g_m = pl.size() / 2;
 
-	mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
+	gpu::ofp_context_t context(gpu::gpu_context_opt::no_print_props);
 	cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);
 	cl4.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);
 
diff --git a/src/NN/CellList/CellList_util.hpp b/src/NN/CellList/CellList_util.hpp
index 63caffbf..9f63cb0c 100644
--- a/src/NN/CellList/CellList_util.hpp
+++ b/src/NN/CellList/CellList_util.hpp
@@ -17,7 +17,7 @@ enum cl_construct_opt
 	Only_reorder
 };
 
-#include "util/cuda/ofp_context.hxx"
+#include "util/ofp_context.hpp"
 
 
 /*! \brief populate the Cell-list with particles non symmetric case on GPU
@@ -48,7 +48,7 @@ struct populate_cell_list_no_sym_impl
 						   openfpm::vector<prop,Memory,layout_base > & v_prp,
 						   openfpm::vector<prop,Memory,layout_base > & v_prp_out,
 			   	   	   	   CellList & cli,
-			   	   	   	   mgpu::ofp_context_t & context,
+						   gpu::ofp_context_t & context,
 			   	   	   	   size_t g_m,
 			   	   	   	   cl_construct_opt optc)
 	{
@@ -70,7 +70,7 @@ struct populate_cell_list_no_sym_impl<true>
 						 openfpm::vector<prop,Memory,layout_base > & v_prp,
 						 openfpm::vector<prop,Memory,layout_base > & v_prp_out,
 			   	   	   	   CellList & cli,
-			   	   	   	   mgpu::ofp_context_t & context,
+						   gpu::ofp_context_t & context,
 			   	   	   	   size_t g_m,
 			   	   	   	   cl_construct_opt optc)
 	{
@@ -138,12 +138,12 @@ void populate_cell_list_no_sym(openfpm::vector<Point<dim,T>,Memory,layout_base >
 		 	 	 	 	 	   openfpm::vector<prop,Memory,layout_base > & v_prp,
 		 	 	 	 	 	   openfpm::vector<prop,Memory,layout_base > & v_prp_out,
 							   CellList & cli,
-							   mgpu::ofp_context_t & mgpu,
+							   gpu::ofp_context_t & gpu,
 							   size_t g_m,
 							   cl_construct_opt optc)
 {
 	populate_cell_list_no_sym_impl<is_gpu_celllist<CellList>::value>
-								  ::template populate<dim,T,prop,Memory,layout_base,CellList, prp ...>(pos,v_pos_out,v_prp,v_prp_out,cli,mgpu,g_m,optc);
+								  ::template populate<dim,T,prop,Memory,layout_base,CellList, prp ...>(pos,v_pos_out,v_prp,v_prp_out,cli,gpu,g_m,optc);
 }
 
 /*! \brief populate the Cell-list with particles symmetric case
@@ -189,7 +189,7 @@ void populate_cell_list(openfpm::vector<Point<dim,T>,Memory,layout_base> & pos,
  	 	   	   	   	    openfpm::vector<prop,Memory,layout_base > & v_prp,
  	 	   	   	   	    openfpm::vector<prop,Memory,layout_base > & v_prp_out,
 						CellList & cli,
-						mgpu::ofp_context_t & context,
+						gpu::ofp_context_t & context,
 						size_t g_m,
 						size_t opt,
 						cl_construct_opt optc)
@@ -222,7 +222,7 @@ template<unsigned int dim,
 		 unsigned int ... prp>
 void populate_cell_list(openfpm::vector<Point<dim,T>,Memory,layout_base> & pos,
 						CellList & cli,
-						mgpu::ofp_context_t & context,
+						gpu::ofp_context_t & context,
 						size_t g_m,
 						size_t opt,
 						cl_construct_opt optc)
diff --git a/src/NN/CellList/cuda/CellList_gpu.hpp b/src/NN/CellList/cuda/CellList_gpu.hpp
index 5ee0e2b0..886a7f8a 100644
--- a/src/NN/CellList/cuda/CellList_gpu.hpp
+++ b/src/NN/CellList/cuda/CellList_gpu.hpp
@@ -237,7 +237,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
   	   	   	 	 	 	  vector & pl_out,
   	   	   	 	 	 	  vector_prp & pl_prp,
   	   	   	 	 	 	  vector_prp & pl_prp_out,
-  	   	   	 	 	 	  mgpu::ofp_context_t & mgpuContext,
+						  gpu::ofp_context_t & gpuContext,
   	   	   	 	 	 	  size_t g_m,
  			   	   	   	  size_t start,
  			   	   	   	  size_t stop,
@@ -276,7 +276,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 		cl_sparse.template setBackground<0>((cnt_type)-1);
 		cl_sparse.setGPUInsertBuffer(ite_gpu.wthr.x,ite_gpu.thr.x);
 		CUDA_LAUNCH((fill_cells_sparse),ite_gpu,cl_sparse.toKernel(),starts.toKernel());
-		cl_sparse.template flush_vd<sstart_<0>>(cells,mgpuContext,FLUSH_ON_DEVICE);
+		cl_sparse.template flush_vd<sstart_<0>>(cells,gpuContext,FLUSH_ON_DEVICE);
 
 		cells_nn.resize(cl_sparse.size()+1);
 		cells_nn.template fill<0>(0);
@@ -286,7 +286,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 		CUDA_LAUNCH((count_nn_cells),itgg,cl_sparse.toKernel(),cells_nn.toKernel(),cells_nn_test.toKernel());
 
 		// now we scan
-		openfpm::scan((cnt_type *)cells_nn.template getDeviceBuffer<0>(), cells_nn.size(), (cnt_type *)cells_nn.template getDeviceBuffer<0>() , mgpuContext);
+		openfpm::scan((cnt_type *)cells_nn.template getDeviceBuffer<0>(), cells_nn.size(), (cnt_type *)cells_nn.template getDeviceBuffer<0>() , gpuContext);
 
 		cells_nn.template deviceToHost<0>(cells_nn.size() - 1, cells_nn.size() - 1);
 		size_t n_nn_cells = cells_nn.template get<0>(cells_nn.size() - 1);
@@ -316,7 +316,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 
 		if (opt == cl_construct_opt::Full)
 		{
-			construct_domain_ids(mgpuContext,start,stop,g_m);
+			construct_domain_ids(gpuContext,start,stop,g_m);
 		}
 
 	#else
@@ -328,10 +328,10 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 
 	/*! \brief Construct the ids of the particles domain in the sorted array
 	 *
-	 * \param mgpuContext mgpu context
+	 * \param gpuContext gpu context
 	 *
 	 */
-	void construct_domain_ids(mgpu::ofp_context_t & mgpuContext, size_t start, size_t stop, size_t g_m)
+	void construct_domain_ids(gpu::ofp_context_t & gpuContext, size_t start, size_t stop, size_t g_m)
 	{
 #ifdef __NVCC__
 		sorted_domain_particles_dg.resize(stop-start+1);
@@ -341,7 +341,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 		CUDA_LAUNCH((mark_domain_particles),ite,sorted_to_not_sorted.toKernel(),sorted_domain_particles_dg.toKernel(),g_m);
 
 		// lets scan
-		openfpm::scan((unsigned int *)sorted_domain_particles_dg.template getDeviceBuffer<0>(),sorted_domain_particles_dg.size(),(unsigned int *)sorted_domain_particles_dg.template getDeviceBuffer<0>(),mgpuContext);
+		openfpm::scan((unsigned int *)sorted_domain_particles_dg.template getDeviceBuffer<0>(),sorted_domain_particles_dg.size(),(unsigned int *)sorted_domain_particles_dg.template getDeviceBuffer<0>(),gpuContext);
 
 		sorted_domain_particles_dg.template deviceToHost<0>(sorted_domain_particles_dg.size()-1,sorted_domain_particles_dg.size()-1);
 		auto sz = sorted_domain_particles_dg.template get<0>(sorted_domain_particles_dg.size()-1);
@@ -361,7 +361,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 			   	   	   	 vector & pl_out,
 			   	   	   	 vector_prp & pl_prp,
 			   	   	   	 vector_prp & pl_prp_out,
-			   	   	   	 mgpu::ofp_context_t & mgpuContext,
+						 gpu::ofp_context_t & gpuContext,
 			   	   	   	 size_t g_m,
 			   	   	   	 size_t start,
 			   	   	   	 size_t stop,
@@ -398,7 +398,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 
 		// now we scan
 		starts.resize(cl_n.size());
-		openfpm::scan((cnt_type *)cl_n.template getDeviceBuffer<0>(), cl_n.size(), (cnt_type *)starts.template getDeviceBuffer<0>() , mgpuContext);
+		openfpm::scan((cnt_type *)cl_n.template getDeviceBuffer<0>(), cl_n.size(), (cnt_type *)starts.template getDeviceBuffer<0>() , gpuContext);
 
 		// now we construct the cells
 
@@ -414,7 +414,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 
                 // sort
 
-                mgpu::mergesort(static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()),pl.size(),mgpu::less_t<cnt_type>(),mgpuContext);
+                gpu::mergesort(static_cast<cnt_type *>(part_ids.template getDeviceBuffer<0>()),static_cast<cnt_type *>(cells.template getDeviceBuffer<0>()),pl.size(),gpu::less_t<cnt_type>(),gpuContext);
 
 #else
 
@@ -470,7 +470,7 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
 
 		if (opt == cl_construct_opt::Full)
 		{
-			construct_domain_ids(mgpuContext,start,stop,g_m);
+			construct_domain_ids(gpuContext,start,stop,g_m);
 		}
 
 	#else
@@ -630,7 +630,7 @@ public:
 				   vector & pl_out,
 				   vector_prp & pl_prp,
 				   vector_prp & pl_prp_out,
-				   mgpu::ofp_context_t & mgpuContext,
+				   gpu::ofp_context_t & gpuContext,
 				   size_t g_m = 0,
 				   size_t start = 0,
 				   size_t stop = (size_t)-1,
@@ -640,8 +640,8 @@ public:
 		if (stop == (size_t)-1)
 		{stop = pl.size();}
 
-		if (is_sparse == false) {construct_dense<vector,vector_prp,prp...>(pl,pl_out,pl_prp,pl_prp_out,mgpuContext,g_m,start,stop,opt);}
-		else {construct_sparse<vector,vector_prp,prp...>(pl,pl_out,pl_prp,pl_prp_out,mgpuContext,g_m,start,stop,opt);}
+		if (is_sparse == false) {construct_dense<vector,vector_prp,prp...>(pl,pl_out,pl_prp,pl_prp_out,gpuContext,g_m,start,stop,opt);}
+		else {construct_sparse<vector,vector_prp,prp...>(pl,pl_out,pl_prp,pl_prp_out,gpuContext,g_m,start,stop,opt);}
 	}
 
 	CellList_gpu_ker<dim,T,cnt_type,ids_type,transform,is_sparse> toKernel()
diff --git a/src/NN/VerletList/VerletListFast.hpp b/src/NN/VerletList/VerletListFast.hpp
index c6869025..0e74faab 100644
--- a/src/NN/VerletList/VerletListFast.hpp
+++ b/src/NN/VerletList/VerletListFast.hpp
@@ -322,7 +322,7 @@ private:
 	 */
 	void initCl(CellListImpl & cli, vector_pos_type & pos, size_t g_m, size_t opt)
 	{
-		mgpu::ofp_context_t context(mgpu::gpu_context_opt::dummy);
+		gpu::ofp_context_t context(gpu::gpu_context_opt::dummy);
 		if (opt & VL_SYMMETRIC || opt & VL_CRS_SYMMETRIC)
 		{populate_cell_list(pos,cli,context,g_m,CL_SYMMETRIC,cl_construct_opt::Full);}
 		else
diff --git a/src/SparseGridGpu/BlockMapGpu.hpp b/src/SparseGridGpu/BlockMapGpu.hpp
index b8c3a65b..9de735da 100644
--- a/src/SparseGridGpu/BlockMapGpu.hpp
+++ b/src/SparseGridGpu/BlockMapGpu.hpp
@@ -287,7 +287,7 @@ public:
     }
 
     template<typename ... v_reduce>
-    void flush(mgpu::ofp_context_t &context, flush_type opt = FLUSH_ON_HOST)
+    void flush(gpu::ofp_context_t &context, flush_type opt = FLUSH_ON_HOST)
     {
 #ifdef SE_CLASS1
 
diff --git a/src/SparseGridGpu/BlockMapGpu_kernels.cuh b/src/SparseGridGpu/BlockMapGpu_kernels.cuh
index 525cabde..dd0a8174 100644
--- a/src/SparseGridGpu/BlockMapGpu_kernels.cuh
+++ b/src/SparseGridGpu/BlockMapGpu_kernels.cuh
@@ -584,7 +584,7 @@ struct sparse_vector_reduction_solve_conflict
 	vector_segolddata_type & segments_oldData;
 
 	//! gpu context
-	mgpu::ofp_context_t & context;
+	gpu::ofp_context_t & context;
 
 	/*! \brief constructor
 	 *
@@ -600,7 +600,7 @@ struct sparse_vector_reduction_solve_conflict
 								   vector_segoffset_type & segment_offset,
 								   vector_outmap_type & out_map,
 								   vector_segolddata_type & segments_oldData,
-								   mgpu::ofp_context_t & context)
+								   gpu::ofp_context_t & context)
 	:vector_data_red(vector_data_red),
 	 vector_data(vector_data),
 	 vector_data_unsorted(vector_data_unsorted),
@@ -701,7 +701,7 @@ namespace BlockMapGpuFunctors
         bool solve_conflicts(vector_index_type &keys, vector_index_type &mergeIndices, vector_index_type2 &segments_new, vector_index_type &data_map,
                                     vector_data_type &dataOld, vector_data_type &dataNew,
                                     vector_index_type &keysOut, vector_data_type &dataOut,
-                                    mgpu::ofp_context_t & context)
+                                    gpu::ofp_context_t & context)
         {
 #ifdef __NVCC__
             typedef ValueTypeOf<vector_data_type> AggregateT;
diff --git a/src/SparseGridGpu/SparseGridGpu.hpp b/src/SparseGridGpu/SparseGridGpu.hpp
index a315f7f7..b872a746 100644
--- a/src/SparseGridGpu/SparseGridGpu.hpp
+++ b/src/SparseGridGpu/SparseGridGpu.hpp
@@ -770,7 +770,7 @@ public:
     }
 
     template<typename ... v_reduce>
-    void flush(mgpu::ofp_context_t &context, flush_type opt = FLUSH_ON_HOST)
+    void flush(gpu::ofp_context_t &context, flush_type opt = FLUSH_ON_HOST)
     {
         BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>
                 ::template flush<v_reduce ...>(context, opt);
@@ -1094,7 +1094,7 @@ private:
     }
 
     template<typename MemType, unsigned int ... prp>
-    void preUnpack(ExtPreAlloc<MemType> * prAlloc_prp, mgpu::ofp_context_t & ctx, int opt)
+    void preUnpack(ExtPreAlloc<MemType> * prAlloc_prp, gpu::ofp_context_t & ctx, int opt)
     {
 		if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
 		{
@@ -1114,14 +1114,14 @@ private:
 
 
 	template<unsigned int ... prp>
-	void removeCopyToFinalize_phase1(mgpu::ofp_context_t & ctx, int opt)
+	void removeCopyToFinalize_phase1(gpu::ofp_context_t & ctx, int opt)
 	{
 		if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
 		{removePoints(ctx);}
 	}
 
 	template<unsigned int ... prp>
-	void removeCopyToFinalize_phase2(mgpu::ofp_context_t & ctx, int opt)
+	void removeCopyToFinalize_phase2(gpu::ofp_context_t & ctx, int opt)
 	{
 		// Pack information
 		Pack_stat sts;
@@ -1173,7 +1173,7 @@ private:
 	}
 
 	template<unsigned int ... prp>
-	void removeCopyToFinalize_phase3(mgpu::ofp_context_t & ctx, int opt, bool is_unpack_remote)
+	void removeCopyToFinalize_phase3(gpu::ofp_context_t & ctx, int opt, bool is_unpack_remote)
 	{
 		ite_gpu<1> ite;
 
@@ -1417,7 +1417,7 @@ private:
 	void addAndConvertPackedChunkToTmp(ExtPreAlloc<S2> & mem,
 				SparseGridGpu_iterator_sub<dim,self> & sub_it,
 				Unpack_stat & ps,
-				mgpu::ofp_context_t &context)
+				gpu::ofp_context_t &context)
 	{
     	sparsegridgpu_pack_request<AggregateT,prp ...> spq;
     	boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prp)>>(spq);
@@ -2019,7 +2019,7 @@ public:
 	 * \param grid_dw grid level down
 	 *
 	 */
-    void construct_link(self & grid_up, self & grid_dw, mgpu::ofp_context_t &context)
+    void construct_link(self & grid_up, self & grid_dw, gpu::ofp_context_t &context)
     {
 /*        // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
         auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
@@ -2106,7 +2106,7 @@ public:
 	 * \param gpu context
 	 *
 	 */
-    void construct_link_dw(self & grid_dw, const Box<dim,int> & db_, Point<dim,int> p_dw, mgpu::ofp_context_t &context)
+    void construct_link_dw(self & grid_dw, const Box<dim,int> & db_, Point<dim,int> p_dw, gpu::ofp_context_t &context)
     {
     	Box<dim,int> db = db_;
 
@@ -2184,7 +2184,7 @@ public:
 	 * \praram grid_up grid level up
 	 *
 	 */
-    void construct_link_up(self & grid_up,  const Box<dim,int> & db_, Point<dim,int> p_up, mgpu::ofp_context_t &context)
+    void construct_link_up(self & grid_up,  const Box<dim,int> & db_, Point<dim,int> p_up, gpu::ofp_context_t &context)
     {
     	Box<dim,int> db = db_;
 
@@ -2284,7 +2284,7 @@ public:
 	}
 
     template<typename stencil_type = NNStar<dim>, typename checker_type = No_check>
-    void tagBoundaries(mgpu::ofp_context_t &context, checker_type chk = checker_type(), tag_boundaries opt = tag_boundaries::NO_CALCULATE_EXISTING_POINTS)
+    void tagBoundaries(gpu::ofp_context_t &context, checker_type chk = checker_type(), tag_boundaries opt = tag_boundaries::NO_CALCULATE_EXISTING_POINTS)
     {
         // Here it is crucial to use "auto &" as the type, as we need to be sure to pass the reference to the actual buffers!
         auto & indexBuffer = BlockMapGpu<AggregateInternalT, threadBlockSize, indexT, layout_base>::blockMap.getIndexBuffer();
@@ -2977,7 +2977,7 @@ public:
      *
      */
 	template<int ... prp> inline
-	void packRequest(size_t & req, mgpu::ofp_context_t &context) const
+	void packRequest(size_t & req, gpu::ofp_context_t &context) const
     {
     	ite_gpu<1> ite;
 
@@ -3070,7 +3070,7 @@ public:
 	 *
 	 */
 	template<int ... prp> inline
-	void packCalculate(size_t & req, mgpu::ofp_context_t &context)
+	void packCalculate(size_t & req, gpu::ofp_context_t &context)
 	{
     	ite_gpu<1> ite;
 		pack_subs.template hostToDevice<0,1>();
@@ -3308,7 +3308,7 @@ public:
 	 *
 	 */
 	template<unsigned int ... prp>
-	void removeCopyToFinalize(mgpu::ofp_context_t & ctx, int opt)
+	void removeCopyToFinalize(gpu::ofp_context_t & ctx, int opt)
 	{
 		if ((opt & 0x3) == rem_copy_opt::PHASE1)
 		{
@@ -3410,7 +3410,7 @@ public:
 	 * \param context modern gpu context
 	 *
 	 */
-	void removePoints(mgpu::ofp_context_t& context)
+	void removePoints(gpu::ofp_context_t& context)
 	{
     	auto & indexBuffer = private_get_index_array();
     	auto & dataBuffer = private_get_data_array();
@@ -3485,7 +3485,7 @@ public:
 	 *
 	 */
 	template<unsigned int ... prp>
-	void removeAddUnpackFinalize(mgpu::ofp_context_t& context, int opt)
+	void removeAddUnpackFinalize(gpu::ofp_context_t& context, int opt)
 	{
 		if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false)
 		{removePoints(context);}
@@ -3593,7 +3593,7 @@ public:
 				header_type & headers,
 				int ih,
 				Unpack_stat & ps,
-				mgpu::ofp_context_t &context,
+				gpu::ofp_context_t &context,
 				rem_copy_opt opt = rem_copy_opt::NONE_OPT)
 	{
 		////////////////////////////////////////////////////////////
@@ -3661,7 +3661,7 @@ public:
 	void unpack(ExtPreAlloc<S2> & mem,
 				SparseGridGpu_iterator_sub<dim,self> & sub_it,
 				Unpack_stat & ps,
-				mgpu::ofp_context_t &context,
+				gpu::ofp_context_t &context,
 				rem_copy_opt opt = rem_copy_opt::NONE_OPT)
 	{
 		////////////////////////////////////////////////////////////
diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_get_nn.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_get_nn.cu
index 768f9ea8..d673edd4 100644
--- a/src/SparseGridGpu/performance/SparseGridGpu_performance_get_nn.cu
+++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_get_nn.cu
@@ -43,7 +43,7 @@ void testGetNeighbourhood(std::string testURI, unsigned int i)
     dim3 blockSizeBlockedInsert(1, 1);
     grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
     SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize> sparseGrid(blockGeometry);
-    mgpu::ofp_context_t ctx;
+    gpu::ofp_context_t ctx;
     sparseGrid.template setBackgroundValue<0>(0);
 
     // Now fill the grid once
diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_get_single.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_get_single.cu
index a2db1e64..20104776 100644
--- a/src/SparseGridGpu/performance/SparseGridGpu_performance_get_single.cu
+++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_get_single.cu
@@ -44,7 +44,7 @@ void testGetSingle(std::string testURI, unsigned int i)
     dim3 blockSizeBlockedInsert(1, 1);
     grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
     SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize> sparseGrid(blockGeometry);
-    mgpu::ofp_context_t ctx;
+    gpu::ofp_context_t ctx;
     sparseGrid.template setBackgroundValue<0>(0);
 
     // Now fill the grid once
diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil.cu
index d3d5164d..86105e05 100644
--- a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil.cu
+++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil.cu
@@ -41,7 +41,7 @@ void testStencilHeat_perf(unsigned int i, std::string base)
 	dim3 blockSize(SparseGridZ::blockEdgeSize_,SparseGridZ::blockEdgeSize_);
 	typename SparseGridZ::grid_info blockGeometry(gridSize);
 	SparseGridZ sparseGrid(blockGeometry);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
     unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_*gridEdgeSize*SparseGridZ::blockEdgeSize_;
diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_3d.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_3d.cu
index 73eed9db..8431728c 100644
--- a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_3d.cu
+++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_3d.cu
@@ -42,7 +42,7 @@ void testStencilHeat3D_perf(unsigned int i, std::string base)
 
     typename SparseGridZ::grid_info blockGeometry(gridSize);
     SparseGridZ sparseGrid(blockGeometry);
-    mgpu::ofp_context_t ctx;
+    gpu::ofp_context_t ctx;
     sparseGrid.template setBackgroundValue<0>(0);
 
     unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_
@@ -166,7 +166,7 @@ void testStencilHeat3DSparse_perf(unsigned int i, std::string base, float fillMu
     size_t sz[3] = {spatialEdgeSize, spatialEdgeSize, spatialEdgeSize};
     typename SparseGridZ::grid_info blockGeometry(sz);
     SparseGridZ sparseGrid(blockGeometry);
-    mgpu::ofp_context_t ctx;
+    gpu::ofp_context_t ctx;
     sparseGrid.template setBackgroundValue<0>(0);
 
     ///// Insert sparse content, a set of concentric spheres /////
diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu
index abec6ef5..901a3439 100644
--- a/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu
+++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu
@@ -45,7 +45,7 @@ void testStencilHeatSparse_perf(unsigned int i, std::string base, float fillMult
     size_t sz[2] = {spatialEdgeSize, spatialEdgeSize};
     typename SparseGridZ::grid_info blockGeometry(sz);
     SparseGridZ sparseGrid(blockGeometry);
-    mgpu::ofp_context_t ctx;
+    gpu::ofp_context_t ctx;
     sparseGrid.template setBackgroundValue<0>(0);
 
     ///// Insert sparse content, a set of concentric spheres /////
diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_block.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_block.cu
index 40270931..0e281e71 100644
--- a/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_block.cu
+++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_block.cu
@@ -46,7 +46,7 @@ void test_insert_block(std::string testURI, unsigned int i)
 	dim3 blockSizeBlockedInsert(1, 1);
 	grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
 	SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize> sparseGrid(blockGeometry);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	// Warmup
diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_single.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_single.cu
index afacc2d3..aeb58b1b 100644
--- a/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_single.cu
+++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_insert_single.cu
@@ -43,7 +43,7 @@ void testInsertSingle(std::string testURI, unsigned int i)
 	dim3 blockSize(blockEdgeSize, blockEdgeSize);
 	grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
 	SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize> sparseGrid(blockGeometry);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	if (prePopulateGrid)
diff --git a/src/SparseGridGpu/performance/SparseGridGpu_performance_tests.cu b/src/SparseGridGpu/performance/SparseGridGpu_performance_tests.cu
index af6dcf8e..7f432300 100644
--- a/src/SparseGridGpu/performance/SparseGridGpu_performance_tests.cu
+++ b/src/SparseGridGpu/performance/SparseGridGpu_performance_tests.cu
@@ -56,7 +56,7 @@ void testStencilHeatGet_perf(unsigned int i, std::string base)
     dim3 blockSize(SparseGridZ::blockEdgeSize_,SparseGridZ::blockEdgeSize_);
     typename SparseGridZ::grid_info blockGeometry(gridSize);
     SparseGridZ sparseGrid(blockGeometry);
-    mgpu::ofp_context_t ctx;
+    gpu::ofp_context_t ctx;
     sparseGrid.template setBackgroundValue<0>(0);
 
     unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_*gridEdgeSize*SparseGridZ::blockEdgeSize_;
@@ -180,7 +180,7 @@ void testStencilSkeleton_perf(unsigned int i, std::string base)
     dim3 blockSize(SparseGridZ::blockEdgeSize_,SparseGridZ::blockEdgeSize_);
     typename SparseGridZ::grid_info blockGeometry(gridSize);
     SparseGridZ sparseGrid(blockGeometry);
-    mgpu::ofp_context_t ctx;
+    gpu::ofp_context_t ctx;
     sparseGrid.template setBackgroundValue<0>(0);
 
     unsigned long long numElements = gridEdgeSize*SparseGridZ::blockEdgeSize_*gridEdgeSize*SparseGridZ::blockEdgeSize_;
diff --git a/src/SparseGridGpu/tests/BlockMapGpu_kernels_tests.cu b/src/SparseGridGpu/tests/BlockMapGpu_kernels_tests.cu
index f94cc157..037a270b 100644
--- a/src/SparseGridGpu/tests/BlockMapGpu_kernels_tests.cu
+++ b/src/SparseGridGpu/tests/BlockMapGpu_kernels_tests.cu
@@ -128,7 +128,7 @@ BOOST_AUTO_TEST_CASE(testSegreduce_total)
 		openfpm::vector_gpu<aggregate<MaskBlockT, BlockT>> outputData;
 		outputData.resize(100);
 
-		CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<BLOCK, 0, BITMASK, 2, mgpu::plus_t<ScalarT>>),segments.size()-1, 2*BlockT::size,
+		CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<BLOCK, 0, BITMASK, 2, gpu::plus_t<ScalarT>>),segments.size()-1, 2*BlockT::size,
 		data_new.toKernel(),
 		data_old.toKernel(),
 		segments.toKernel(),
@@ -138,7 +138,7 @@ BOOST_AUTO_TEST_CASE(testSegreduce_total)
 		outputData.toKernel());
 
 		// Segreduce on mask
-		CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<BITMASK, 0, BITMASK, 2, mgpu::maximum_t<unsigned char>>),segments.size()-1, 2*BlockT::size,
+		CUDA_LAUNCH_DIM3((BlockMapGpuKernels::segreduce_total<BITMASK, 0, BITMASK, 2, gpu::maximum_t<unsigned char>>),segments.size()-1, 2*BlockT::size,
 		data_new.toKernel(),
 		data_old.toKernel(),
 		segments.toKernel(),
@@ -258,7 +258,7 @@ BOOST_AUTO_TEST_CASE(test_maps_create)
 
 	CUDA_LAUNCH(BlockMapGpuKernels::compute_predicate,ite,merge_keys.toKernel(),merge_indexes.toKernel(),9,p_ids.toKernel());
 
-	mgpu::ofp_context_t context;
+	gpu::ofp_context_t context;
 	openfpm::scan((int *)p_ids.template getDeviceBuffer<0>(),
 				s_ids.size(),
 	            (int *)s_ids.template getDeviceBuffer<0>(),
@@ -350,7 +350,7 @@ BOOST_AUTO_TEST_CASE (testSolve_conflicts)
 	openfpm::vector_gpu<aggregate<unsigned int>> keys, mergeIndices, tmpIndices, keysOut, trivial_map;
 	openfpm::vector_gpu<aggregate<unsigned int,unsigned int>> segments_new;
 	openfpm::vector_gpu<aggregate<BlockT, MaskBlockT>> dataOld, dataNew, tmpData, dataOut;
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 
 	// Keys
 	keys.resize(14);
diff --git a/src/SparseGridGpu/tests/BlockMapGpu_tests.cu b/src/SparseGridGpu/tests/BlockMapGpu_tests.cu
index 0628108c..fb16d290 100644
--- a/src/SparseGridGpu/tests/BlockMapGpu_tests.cu
+++ b/src/SparseGridGpu/tests/BlockMapGpu_tests.cu
@@ -143,7 +143,7 @@ BOOST_AUTO_TEST_CASE(testInsert)
 	CUDA_LAUNCH_DIM3((insertValues<0>), gridSize, blockSizeInsert ,blockMap.toKernel());
 
 	// Flush inserts
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	blockMap.flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);
 
 	// Get output
@@ -188,7 +188,7 @@ BOOST_AUTO_TEST_CASE(testInsert_halfBlock)
 	CUDA_LAUNCH_DIM3((insertValuesHalfBlock<0>), gridSize, blockSizeInsert, blockMap.toKernel());
 
 	// Flush inserts
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	blockMap.flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);
 
 	// Get output
@@ -239,7 +239,7 @@ BOOST_AUTO_TEST_CASE(testInsert_blocked)
 	CUDA_LAUNCH_DIM3((insertValuesBlocked<0, 2>), gridSize, blockSizeInsert,sparseGrid.toKernel());
 
 	// Flush inserts
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);
 
 	// Get output
diff --git a/src/SparseGridGpu/tests/SparseGridGpu_tests.cu b/src/SparseGridGpu/tests/SparseGridGpu_tests.cu
index efe86503..4b0e5fc4 100644
--- a/src/SparseGridGpu/tests/SparseGridGpu_tests.cu
+++ b/src/SparseGridGpu/tests/SparseGridGpu_tests.cu
@@ -225,7 +225,7 @@ BOOST_AUTO_TEST_CASE(testInsert)
 
 	CUDA_LAUNCH_DIM3((insertValues<0>),gridSize, blockSizeInsert,sparseGrid.toKernel());
 
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
 
 	sparseGrid.template deviceToHost<0>();
@@ -263,7 +263,7 @@ BOOST_AUTO_TEST_CASE(testInsert3D)
 
 	CUDA_LAUNCH_DIM3((insertValues<0>),gridSize, blockSizeInsert,sparseGrid.toKernel());
 
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
 
 	sparseGrid.template deviceToHost<0>();
@@ -295,7 +295,7 @@ BOOST_AUTO_TEST_CASE(testTagBoundaries)
 	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64> sparseGrid(blockGeometry);
 
 	sparseGrid.template setBackgroundValue<0>(666);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 
 	sparseGrid.setGPUInsertBuffer(gridSize, blockSizeInsert);
 	dim3 pt1(0, 0, 0);
@@ -382,7 +382,7 @@ BOOST_AUTO_TEST_CASE(testTagBoundaries2)
 	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64> sparseGrid(blockGeometry);
 
 	sparseGrid.template setBackgroundValue<0>(666);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 
 	///////
 	{
@@ -480,7 +480,7 @@ BOOST_AUTO_TEST_CASE(testStencilHeat)
 
 	grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
 	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64> sparseGrid(blockGeometry);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	// Insert values on the grid
@@ -529,7 +529,7 @@ BOOST_AUTO_TEST_CASE(testStencil_lap_simplified)
 
 	grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
 	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64> sparseGrid(blockGeometry);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	// Insert values on the grid
@@ -586,7 +586,7 @@ BOOST_AUTO_TEST_CASE(testStencil_lap_no_cross_simplified)
 
 	grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
 	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64> sparseGrid(blockGeometry);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	// Insert values on the grid
@@ -662,7 +662,7 @@ BOOST_AUTO_TEST_CASE(testStencil_lap_no_cross_simplified2)
 
 	grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
 	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64> sparseGrid(blockGeometry);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	// Insert values on the grid
@@ -751,7 +751,7 @@ BOOST_AUTO_TEST_CASE(testStencil_lap_no_cross_simplified_subset)
 
 	grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
 	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64> sparseGrid(blockGeometry);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	// Insert values on the grid
@@ -809,7 +809,7 @@ BOOST_AUTO_TEST_CASE(testFlushInsert)
 	size_t sz[] = {137,100,57};
 
 	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64> sparseGrid(sz);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	sparseGrid.insertFlush<0>(grid_key_dx<3>({3,6,7})) = 2.0;
@@ -1015,7 +1015,7 @@ void test_convolution_3x3x3()
 	size_t sz[] = {1000,1000,1000};
 
 	SparseGridZ sparseGrid(sz);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	// now create 3 3D sphere
@@ -1082,7 +1082,7 @@ void test_convolution_3x3x3_no_shared()
 	size_t sz[] = {1000,1000,1000};
 
 	SparseGridZ sparseGrid(sz);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	// now create 3 3D sphere
@@ -1187,7 +1187,7 @@ BOOST_AUTO_TEST_CASE(test_sparse_grid_iterator_sub_host)
 
 	grid_smb<dim, blockEdgeSize> blockGeometry(sz);
 	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	///// Insert sparse content, a set of 3 hollow spheres /////
@@ -1242,7 +1242,7 @@ BOOST_AUTO_TEST_CASE(test_sparse_grid_iterator_host)
 
 	grid_smb<dim, blockEdgeSize> blockGeometry(sz);
 	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	///// Insert sparse content, a set of 3 hollow spheres /////
@@ -1291,7 +1291,7 @@ BOOST_AUTO_TEST_CASE(test_pack_request)
 	typedef SparseGridGpu<dim, aggregate<float>, blockEdgeSize, 64, long int> SparseGridZ;
 
 	SparseGridZ sparseGrid(sz);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	// now create a 3D sphere
@@ -1331,7 +1331,7 @@ BOOST_AUTO_TEST_CASE(test_MergeIndexMap)
 	typedef SparseGridGpu<dim, aggregate<float>, blockEdgeSize, 64, long int> SparseGridZ;
 
 	SparseGridZ sparseGrid(sz);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	// now create a 3D sphere
@@ -1389,7 +1389,7 @@ BOOST_AUTO_TEST_CASE(test_pack_request_with_iterator)
 	typedef SparseGridGpu<dim, aggregate<float>, blockEdgeSize, 64, long int> SparseGridZ;
 
 	SparseGridZ sparseGrid(sz);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	// now create a 3D sphere
@@ -1482,7 +1482,7 @@ BOOST_AUTO_TEST_CASE(sparsegridgpu_remove_test)
 	typedef SparseGridGpu<dim, aggregate<float>, blockEdgeSize, 64, long int> SparseGridZ;
 
 	SparseGridZ sparseGrid(sz);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	// now create a 3D sphere
@@ -1555,7 +1555,7 @@ void pack_unpack_test(SG_type & sparseGridDst, SG_type & sparseGridSrc,
 		Box<3,size_t> & box2_dst,
 		Box<3,size_t> & box3_dst,
 		Box<3,size_t> & box4_dst,
-		mgpu::ofp_context_t & ctx,
+		gpu::ofp_context_t & ctx,
 		bool test_pack)
 {
     Box<3,size_t> box1_src({256,256,256},{273,390,390});
@@ -1704,7 +1704,7 @@ BOOST_AUTO_TEST_CASE(sparsegridgpu_pack_unpack)
 
 	SparseGridZ sparseGridSrc(sz);
 	SparseGridZ sparseGridDst(sz);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGridSrc.template setBackgroundValue<0>(0);
 	sparseGridDst.template setBackgroundValue<0>(0);
 
@@ -1947,7 +1947,7 @@ BOOST_AUTO_TEST_CASE(testSparseGridGpuOutput3DHeatStencil)
 
 	grid_smb<dim, blockEdgeSize> blockGeometry(sz);
 	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	///// Insert sparse content, a set of 3 hollow spheres /////
@@ -2021,7 +2021,7 @@ BOOST_AUTO_TEST_CASE(testSparseGridGpuOutput)
 
 	grid_smb<dim, blockEdgeSize> blockGeometry(sz);
 	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	grid_key_dx<2,int> start({500000,500000});
@@ -2051,7 +2051,7 @@ BOOST_AUTO_TEST_CASE(testSparseGridGpuOutput3D)
 
 	grid_smb<dim, blockEdgeSize> blockGeometry(sz);
 	SparseGridGpu<dim, AggregateT, blockEdgeSize, 64, long int> sparseGrid(blockGeometry);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	grid_key_dx<3,int> start({256,256,256});
diff --git a/src/SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh b/src/SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh
index 73d904ce..0f380112 100644
--- a/src/SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh
+++ b/src/SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh
@@ -498,7 +498,7 @@ void testConv3x3x3_perf(std::string testName)
 	size_t sz[] = {1000,1000,1000};
 
 	SparseGridZ sparseGrid(sz);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	// now create 3 3D sphere
@@ -592,7 +592,7 @@ static void testConv3x3x3_no_shared_perf(std::string testName)
 	size_t sz[] = {1000,1000,1000};
 
 	SparseGridZ sparseGrid(sz);
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	sparseGrid.template setBackgroundValue<0>(0);
 
 	// now create 3 3D sphere
diff --git a/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu b/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu
index 6a9a1537..38d4af70 100644
--- a/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu
+++ b/src/Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu
@@ -204,7 +204,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu )
 	// we launch a kernel to insert data
 	CUDA_LAUNCH_DIM3(test_insert_sparse,10,100,vs.toKernel());
 
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	vs.flush<sadd_<0>,smin_<1>,smax_<2> >(ctx,flush_type::FLUSH_ON_DEVICE);
 
 	vs.setGPUInsertBuffer(10,1024);
@@ -285,7 +285,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_incremental_add )
 	CUDA_LAUNCH_DIM3(test_insert_sparse2_inc,10,100,vs.toKernel());
 	CUDA_LAUNCH_DIM3(test_insert_sparse2_inc,10,100,vs.toKernel());
 
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 
 	vs.flush<sadd_<0>,smin_<1>,smax_<2> >(ctx,flush_type::FLUSH_ON_DEVICE);
 
@@ -352,7 +352,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_get )
 	CUDA_LAUNCH_DIM3(test_insert_sparse,10,100,vs.toKernel());
 
 
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	vs.flush<sadd_<0>,smin_<1>,smax_<2> >(ctx,flush_type::FLUSH_ON_DEVICE);
 
 	vs.template deviceToHost<0,1,2>();
@@ -475,7 +475,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_special_function )
 	CUDA_LAUNCH_DIM3(test_insert_sparse2_inc,10,100,vs.toKernel());
 	CUDA_LAUNCH_DIM3(test_insert_sparse2_inc,10,100,vs.toKernel());
 
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 
 	vs.flush<sstart_<0>>(ctx,flush_type::FLUSH_ON_DEVICE);
 
@@ -604,7 +604,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_remove )
 	// we launch a kernel to insert data
 	CUDA_LAUNCH_DIM3(test_insert_sparse,10,100,vs.toKernel());
 
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	vs.flush<sadd_<0>,smin_<1>,smax_<2> >(ctx,flush_type::FLUSH_ON_DEVICE);
 
 	vs.setGPUInsertBuffer(10,1024);
@@ -686,7 +686,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_gpu_remove_incremental )
 	CUDA_LAUNCH_DIM3(test_insert_sparse2_inc,10,100,vs.toKernel());
 	CUDA_LAUNCH_DIM3(test_insert_sparse2_inc,10,100,vs.toKernel());
 
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 
 	vs.flush<sadd_<0>,sadd_<1>,sadd_<2>>(ctx,flush_type::FLUSH_ON_DEVICE);
 
diff --git a/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh b/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh
index 33b0a68a..0079b452 100644
--- a/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh
+++ b/src/Vector/cuda/map_vector_sparse_cuda_kernels.cuh
@@ -12,19 +12,34 @@
 
 #include "config.h"
 
+#include <limits>
+
 #if CUDART_VERSION < 11000
 #include "util/cuda/cub_old/util_type.cuh"
 #include "util/cuda/cub_old/block/block_scan.cuh"
-#include "util/cuda/moderngpu/operators.hxx"
 #include "util/cuda_launch.hpp"
-#else
-	#if !defined(CUDA_ON_CPU)	
-	#include "util/cuda/moderngpu/operators.hxx"
-	#endif
+#endif
+
+#if !defined(CUDA_ON_CPU)
+#include "util/cudify/cuda/operators.hpp"
 #endif
 
 #endif
 
+template<typename type_t>
+struct zero_t {
+  __device__ __host__ type_t operator()() const {
+    return 0;
+  }
+};
+
+template<typename type_t>
+struct limit_max_t {
+  __device__ __host__ type_t operator()() const {
+    return std::numeric_limits<type_t>::max();
+  }
+};
+
 template<typename type_t>
 struct rightOperand_t  : public std::binary_function<type_t, type_t, type_t> {
   __device__ __host__ type_t operator()(type_t a, type_t b) const {
@@ -93,7 +108,8 @@ struct sadd_
 	typedef boost::mpl::int_<prp> prop;
 
 #ifdef __NVCC__
-	template<typename red_type> using op_red = mgpu::plus_t<red_type>;
+	template<typename red_type> using op_red = gpu::plus_t<red_type>;
+	template<typename red_type> using op_initial_value = zero_t<red_type>;
 #endif
 
 	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
@@ -147,6 +163,7 @@ struct sadd_block_
 
 #ifdef __NVCC__
 	template<typename red_type> using op_red = plus_block_t<red_type, blockLength>;
+	template<typename red_type> using op_initial_value = zero_t<red_type>;
 #endif
 
 	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
@@ -176,7 +193,8 @@ struct smax_
 	typedef boost::mpl::int_<prp> prop;
 
 #ifdef __NVCC__
-	template<typename red_type> using op_red = mgpu::maximum_t<red_type>;
+	template<typename red_type> using op_red = gpu::maximum_t<red_type>;
+	template<typename red_type> using op_initial_value = zero_t<red_type>;
 #endif
 
 	template<typename red_type>
@@ -200,7 +218,7 @@ struct smax_
 
 template<typename type_t, unsigned int blockLength>
 struct maximum_block_t  : public std::binary_function<type_t, type_t, type_t> {
-  MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
+  __forceinline__ __device__ __host__ type_t operator()(type_t a, type_t b) const {
   	type_t res;
   	for (int i=0; i<blockLength; ++i)
   	{
@@ -219,6 +237,7 @@ struct smax_block_
 
 #ifdef __NVCC__
 	template<typename red_type> using op_red = maximum_block_t<red_type, blockLength>;
+	template<typename red_type> using op_initial_value = zero_t<red_type>;
 #endif
 
 	template<typename red_type>
@@ -251,7 +270,8 @@ struct smin_
 	typedef boost::mpl::int_<prp> prop;
 
 #ifdef __NVCC__
-	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;
+	template<typename red_type> using op_red = gpu::minimum_t<red_type>;
+	template<typename red_type> using op_initial_value = limit_max_t<red_type>;
 #endif
 
 	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
@@ -274,7 +294,7 @@ struct smin_
 
 template<typename type_t, unsigned int blockLength>
 struct minimum_block_t  : public std::binary_function<type_t, type_t, type_t> {
-  MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
+  __forceinline__ __device__ __host__ type_t operator()(type_t a, type_t b) const {
   	type_t res;
   	for (int i=0; i<blockLength; ++i)
   	{
@@ -293,6 +313,7 @@ struct smin_block_
 
 #ifdef __NVCC__
 	template<typename red_type> using op_red = minimum_block_t<red_type, blockLength>;
+	template<typename red_type> using op_initial_value = limit_max_t<red_type>;
 #endif
 
 	template<typename red_type>
@@ -322,7 +343,7 @@ struct smin_block_
 
 template<typename type_t>
 struct bitwiseOr_t  : public std::binary_function<type_t, type_t, type_t> {
-  MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
+  __forceinline__ __device__ __host__ type_t operator()(type_t a, type_t b) const {
     return a|b;
   }
 };
@@ -357,7 +378,8 @@ struct sstart_
 {
 	typedef boost::mpl::int_<prp> prop;
 
-	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;
+	template<typename red_type> using op_red = gpu::minimum_t<red_type>;
+	template<typename red_type> using op_initial_value = zero_t<red_type>;
 
 	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
 	{
@@ -382,7 +404,7 @@ struct sstop_
 {
 	typedef boost::mpl::int_<prp> prop;
 
-	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;
+	template<typename red_type> using op_red = gpu::minimum_t<red_type>;
 
 	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
 	{
@@ -407,7 +429,7 @@ struct snum_
 {
 	typedef boost::mpl::int_<prp> prop;
 
-	template<typename red_type> using op_red = mgpu::minimum_t<red_type>;
+	template<typename red_type> using op_red = gpu::minimum_t<red_type>;
 
 	template<typename red_type> __device__ __host__ static red_type red(red_type & r1, red_type & r2)
 	{
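
Each reduction descriptor now exposes an `op_initial_value` alias next to `op_red`, so the seed fed to the segmented reduce is an identity chosen per operator (zero for sums, the type's maximum for minimums) instead of a hard-coded zero. `zero_t` and `limit_max_t` are defined elsewhere in this patch; a minimal sketch of what such identity functors plausibly look like, assuming they are nullary callables returning the identity element:

```cpp
#include <limits>

// Hedged sketch of the identity-value functors referenced via
// op_initial_value; the actual definitions live elsewhere in this patch.
template<typename type_t>
struct zero_t {
	__device__ __host__ type_t operator()() const { return type_t(0); }
};

template<typename type_t>
struct limit_max_t {
	// Identity of a min-reduction: the largest representable value
	// (calling this in device code assumes --expt-relaxed-constexpr).
	__device__ __host__ type_t operator()() const {
		return std::numeric_limits<type_t>::max();
	}
};
```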
diff --git a/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu b/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu
index 0dd883bc..bbffab38 100644
--- a/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu
+++ b/src/Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu
@@ -35,7 +35,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_use )
 	block_insert.template hostToDevice<0>();
 	block_n.template hostToDevice<0>();
 
-	mgpu::ofp_context_t context;
+	gpu::ofp_context_t context;
 	openfpm::scan((int *)block_n.template getDeviceBuffer<0>(), block_n.size(), (int *)block_n_scan.template getDeviceBuffer<0>() , context);
 
 	block_n_scan.template deviceToHost<0>(block_n_scan.size()-1,block_n_scan.size()-1);
@@ -105,7 +105,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_use_small_pool )
 	block_insert.template hostToDevice<0>();
 	block_n.template hostToDevice<0>();
 
-	mgpu::ofp_context_t context;
+	gpu::ofp_context_t context;
 	openfpm::scan((int *)block_n.template getDeviceBuffer<0>(), block_n.size(), (int *)block_n_scan.template getDeviceBuffer<0>() , context);
 
 	block_n_scan.template deviceToHost<0>(block_n_scan.size()-1,block_n_scan.size()-1);
@@ -168,7 +168,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_merge_use )
 
 	vct_index.resize(vct_add_index.size() + vct_index_old.size());
 
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 
 	// host to device
 	vct_index_old.template hostToDevice<0,1>();
@@ -176,7 +176,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_merge_use )
 
 	openfpm::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(),
 			    (int *)vct_add_index.template getDeviceBuffer<0>(),(int *)vct_add_index.template getDeviceBuffer<1>(),vct_add_index.size(),
-			    (int *)vct_index.template getDeviceBuffer<0>(),(int *)vct_index.template getDeviceBuffer<1>(),mgpu::less_t<int>(),ctx);
+			    (int *)vct_index.template getDeviceBuffer<0>(),(int *)vct_index.template getDeviceBuffer<1>(),gpu::less_t<int>(),ctx);
 
 	vct_index.template deviceToHost<0,1>();
 
@@ -262,7 +262,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_solve_conflicts_use )
 	vct_index.resize(vct_add_index.size() + vct_index_old.size());
 	merge_indexes.resize(vct_index.size());
 
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 
 	// host to device
 	vct_index_old.template hostToDevice<0,1>();
@@ -272,7 +272,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_solve_conflicts_use )
 
 	openfpm::merge((int *)vct_index_old.template getDeviceBuffer<0>(),(int *)vct_index_old.template getDeviceBuffer<1>(),vct_index_old.size(),
 			    (int *)vct_add_index.template getDeviceBuffer<0>(),(int *)vct_add_index.template getDeviceBuffer<1>(),vct_add_index.size(),
-			    (int *)vct_index.template getDeviceBuffer<0>(),(int *)merge_indexes.template getDeviceBuffer<0>(),mgpu::less_t<int>(),ctx);
+			    (int *)vct_index.template getDeviceBuffer<0>(),(int *)merge_indexes.template getDeviceBuffer<0>(),gpu::less_t<int>(),ctx);
 
 	constexpr int bdim = 128;
 
@@ -378,7 +378,7 @@ BOOST_AUTO_TEST_CASE( vector_sparse_cuda_kernels_realign_use )
 	vct_data.template hostToDevice<0,1,2>();
 	vct_tot_out.template hostToDevice<0,2>();
 
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	openfpm::scan((int *)vct_tot_out.getDeviceBuffer<0>(),vct_tot_out.size(),(int *)vct_tot_out.getDeviceBuffer<1>(),ctx);
 
 	vct_tot_out.deviceToHost<0,1>();
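
These tests drive `openfpm::scan` through the renamed `gpu::ofp_context_t`. With the backend moved to CUB, such a scan wrapper is expected to lower to `cub::DeviceScan`; for reference, the two-phase CUB call pattern (size query, then the actual scan) looks like this, with illustrative names not taken from the patch:

```cpp
#include <cub/cub.cuh>

// Illustrative two-phase CUB exclusive-sum pattern that a wrapper like
// openfpm::scan is expected to lower to.
inline void exclusive_sum_sketch(const int * d_in, int * d_out, int n)
{
	void * d_temp = nullptr;
	size_t temp_bytes = 0;
	// First call only computes the required temporary-storage size.
	cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n);
	cudaMalloc(&d_temp, temp_bytes);
	// Second call performs the scan.
	cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n);
	cudaFree(d_temp);
}
```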
diff --git a/src/Vector/map_vector_sparse.hpp b/src/Vector/map_vector_sparse.hpp
index c82e41db..3d782ae1 100644
--- a/src/Vector/map_vector_sparse.hpp
+++ b/src/Vector/map_vector_sparse.hpp
@@ -12,15 +12,11 @@
 #include "Vector/map_vector.hpp"
 #include "Vector/cuda/map_vector_sparse_cuda_ker.cuh"
 #include "Vector/cuda/map_vector_sparse_cuda_kernels.cuh"
-#include "util/cuda/ofp_context.hxx"
+#include "util/ofp_context.hpp"
 #include <iostream>
 #include <limits>
 
 #if defined(__NVCC__)
-  #if !defined(CUDA_ON_CPU) && !defined(__HIP__)
-	#include "util/cuda/moderngpu/kernel_segreduce.hxx"
-	#include "util/cuda/moderngpu/kernel_merge.hxx"
-  #endif
  #include "util/cuda/kernels.cuh"
 #endif
 
@@ -128,7 +124,10 @@ namespace openfpm
         static void extendSegments(vector_index_type & segments, size_t dataSize)
         {
 #ifdef __NVCC__
-            // Pass as there is nothing to append for mgpu
+            // Append a trailing element to the segment-offset array: it marks the end of the last segment (CUB-style begin/end offsets)
+            segments.resize(segments.size()+1);
+            segments.template get<p>(segments.size() - 1) = dataSize;
+            segments.template hostToDevice<p>(segments.size() - 1, segments.size() - 1);
 #else // __NVCC__
             std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
 #endif // __NVCC__
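
The appended trailing element switches the segment description from moderngpu's convention (`num_segments` start offsets, the last segment implicitly ending at `dataSize`) to the begin/end-offset convention of CUB's segmented primitives, where segment `i` spans `[offsets[i], offsets[i+1])`. For example, with hypothetical values:

```cpp
// With dataSize = 10 and segments starting at 0, 4 and 7, moderngpu
// consumed {0, 4, 7}; after extendSegments the array is {0, 4, 7, 10},
// so segment i spans [offsets[i], offsets[i+1]).
int offsets[] = {0, 4, 7, 10}; // num_segments + 1 entries
int num_segments = 3;          // i.e. offsets count minus one
```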
@@ -141,23 +140,22 @@ namespace openfpm
                 vector_index_type2 & segment_offset,
                 vector_data_type & vector_data_red,
                 block_functor & blf,
-                mgpu::ofp_context_t & context)
+                gpu::ofp_context_t & context)
         {
 #ifdef __NVCC__
             typedef typename boost::mpl::at<vector_reduction, T>::type reduction_type;
             typedef typename boost::mpl::at<typename vector_data_type::value_type::type,typename reduction_type::prop>::type red_type;
             typedef typename reduction_type::template op_red<red_type> red_op;
             typedef typename boost::mpl::at<typename vector_index_type::value_type::type,boost::mpl::int_<0>>::type seg_type;
-            red_type init;
-            init = 0;
+            typename reduction_type::template op_initial_value<red_type> initial_value_functor;
 
             assert((std::is_same<seg_type,int>::value == true));
 
             openfpm::segreduce(
                     (red_type *)vector_data.template getDeviceBuffer<reduction_type::prop::value>(), vector_data.size(),
-                    (int *)segment_offset.template getDeviceBuffer<1>(), segment_offset.size(),
+                    (int *)segment_offset.template getDeviceBuffer<1>(), segment_offset.size()-1,
                     (red_type *)vector_data_red.template getDeviceBuffer<reduction_type::prop::value>(),
-                    red_op(), init, context);
+                    red_op(), initial_value_functor(), context);
 #else // __NVCC__
     std::cout << __FILE__ << ":" << __LINE__ << " error: this file is supposed to be compiled with nvcc" << std::endl;
 #endif // __NVCC__
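
Note that `segment_offset.size()-1` is now passed as the segment count, matching the extra end offset appended by `extendSegments`. Under CUB the call is expected to forward to `cub::DeviceSegmentedReduce::Reduce`, roughly as in this hedged sketch (illustrative names, not the actual wrapper):

```cpp
#include <cub/cub.cuh>

// Hedged sketch of the segmented reduction openfpm::segreduce presumably
// forwards to, given a CUB-style offset array of num_segments + 1 entries.
template<typename T, typename Op>
void segreduce_sketch(const T * d_in, const int * d_offsets,
                      int num_segments, T * d_out, Op op, T init)
{
	void * d_temp = nullptr;
	size_t temp_bytes = 0;
	// Size query, then the reduction itself; segment i reduces
	// d_in[d_offsets[i] .. d_offsets[i+1]) seeded with init.
	cub::DeviceSegmentedReduce::Reduce(d_temp, temp_bytes, d_in, d_out,
		num_segments, d_offsets, d_offsets + 1, op, init);
	cudaMalloc(&d_temp, temp_bytes);
	cub::DeviceSegmentedReduce::Reduce(d_temp, temp_bytes, d_in, d_out,
		num_segments, d_offsets, d_offsets + 1, op, init);
	cudaFree(d_temp);
}
```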
@@ -200,7 +198,7 @@ namespace openfpm
                 vector_data_type & vct_data_out,
                 ite_gpu<1> & itew,
                 block_functor & blf,
-                mgpu::ofp_context_t & context
+                gpu::ofp_context_t & context
                 )
         {
 #ifdef __NVCC__
@@ -268,7 +266,7 @@ namespace openfpm
         					  vector_index_type2 & segment_offset,
         					  vector_data_type & vector_data_red,
         					  block_functor & blf,
-        					  mgpu::ofp_context_t & context)
+						  gpu::ofp_context_t & context)
         {
 
         }
@@ -294,7 +292,7 @@ namespace openfpm
                 vector_data_type & vct_data_out,
                 ite_gpu<1> & itew,
                 block_functor & blf,
-                mgpu::ofp_context_t & context
+                gpu::ofp_context_t & context
         )
         {
 #ifdef __NVCC__
@@ -634,7 +632,7 @@ namespace openfpm
 		block_functor & blf;
 
 		//! gpu context
-		mgpu::ofp_context_t & context;
+		gpu::ofp_context_t & context;
 
 		/*! \brief constructor
 		 *
@@ -648,7 +646,7 @@ namespace openfpm
 									   vector_index_type & vector_data_map,
 									   vector_index_type2 & segment_offset,
 									   block_functor & blf,
-									   mgpu::ofp_context_t & context)
+									   gpu::ofp_context_t & context)
 		:vector_data_red(vector_data_red),
 		 vector_data(vector_data),
 		 vector_data_unsorted(vector_data_unsorted),
@@ -697,7 +695,7 @@ namespace openfpm
                                     vector_data_type &data1, vector_data_type &data2,
                                     vector_index_type &indices_tmp, vector_data_type &data_tmp,
                                     vector_index_type &keysOut, vector_data_type &dataOut,
-                                    mgpu::ofp_context_t & context)
+                                    gpu::ofp_context_t & context)
 		{
 			return true;
 		}
@@ -738,7 +736,7 @@ namespace openfpm
 		vector_index_type & segment_offset;
 
 		//! gpu context
-		mgpu::ofp_context_t & context;
+		gpu::ofp_context_t & context;
 
 		/*! \brief constructor
 		 *
@@ -749,7 +747,7 @@ namespace openfpm
 		inline sparse_vector_special(vector_data_type & vector_data_red,
 									   vector_data_type & vector_data,
 									   vector_index_type & segment_offset,
-									   mgpu::ofp_context_t & context)
+									   gpu::ofp_context_t & context)
 		:vector_data_red(vector_data_red),vector_data(vector_data),segment_offset(segment_offset),context(context)
 		{};
 
@@ -881,7 +879,7 @@ namespace openfpm
 		 * \param vct_add_cont_index output continuos array of inserted indexes
 		 * \param vct_add_data array of added data
 		 * \param vct_add_data_cont continuos array of inserted data
-		 * \param contect mgpu context
+		 * \param context gpu context
 		 *
 		 */
 		size_t make_continuos(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_nadd_index,
@@ -890,7 +888,7 @@ namespace openfpm
 							  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index_map,
 							  vector<T,Memory,layout_base,grow_p> & vct_add_data,
 							  vector<T,Memory,layout_base,grow_p> & vct_add_data_cont,
-							  mgpu::ofp_context_t & context)
+							  gpu::ofp_context_t & context)
 		{
 #ifdef __NVCC__
 
@@ -970,7 +968,7 @@ namespace openfpm
 							 vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_cont_index_map,
 							 vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
 							 vector<T,Memory,layout_base,grow_p> & vct_add_data_cont,
-							 mgpu::ofp_context_t & context)
+							 gpu::ofp_context_t & context)
 		{
 #ifdef __NVCC__
 			ite_gpu<1> itew;
@@ -990,7 +988,7 @@ namespace openfpm
 			        (Ti *)vct_add_cont_index.template getDeviceBuffer<0>(),
                     (Ti *)vct_add_cont_index_map.template getDeviceBuffer<0>(),
 					vct_add_cont_index.size(),
-					mgpu::template less_t<Ti>(),
+					gpu::template less_t<Ti>(),
                     context);
 
 			auto ite = vct_add_cont_index.getGPUIterator();
@@ -1017,7 +1015,7 @@ namespace openfpm
 						   vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & vct_add_index_unique,
 				  	  	   vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index,
 				  	  	   vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_merge_index_map,
-				  	  	   mgpu::ofp_context_t & context)
+						   gpu::ofp_context_t & context)
 		{
 #ifdef __NVCC__
 
@@ -1112,7 +1110,7 @@ namespace openfpm
 
 			openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
 						(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp4.template getDeviceBuffer<0>(),vct_add_index_unique.size(),
-						(Ti *)vct_merge_index.template getDeviceBuffer<0>(),(Ti *)vct_merge_index_map.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context);
+						(Ti *)vct_merge_index.template getDeviceBuffer<0>(),(Ti *)vct_merge_index_map.template getDeviceBuffer<0>(),gpu::less_t<Ti>(),context);
 
 
 #endif
@@ -1125,7 +1123,7 @@ namespace openfpm
 						 vector<aggregate<Ti,Ti>,Memory,layout_base,grow_p> & segments_new,
 						 vector<T,Memory,layout_base,grow_p> & vct_add_data,
 						 vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_data_reord_map,
-				  	  	   mgpu::ofp_context_t & context)
+						 gpu::ofp_context_t & context)
 		{
 #ifdef __NVCC__
 			ite_gpu<1> itew;
@@ -1159,6 +1157,7 @@ namespace openfpm
 			                context);
 
 				boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(v_reduce)>>(svr);
+				vct_add_index_unique.remove(vct_add_index_unique.size()-1);
 			}
 
 			sparse_vector_special<typename std::remove_reference<decltype(vct_add_data)>::type,
@@ -1204,7 +1203,7 @@ namespace openfpm
 		void flush_on_gpu_insert(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
 				  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
 				  vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
-				  mgpu::ofp_context_t & context)
+				  gpu::ofp_context_t & context)
 		{
 #ifdef __NVCC__
 
@@ -1237,7 +1236,7 @@ namespace openfpm
 
 
 		void flush_on_gpu_remove(
-				  mgpu::ofp_context_t & context)
+				  gpu::ofp_context_t & context)
 		{
 #ifdef __NVCC__
 
@@ -1275,7 +1274,7 @@ namespace openfpm
 
 			// now we sort
 			openfpm::sort((Ti *)vct_add_index_cont_0.template getDeviceBuffer<0>(),(Ti *)vct_add_index_cont_1.template getDeviceBuffer<0>(),
-					vct_add_index_cont_0.size(), mgpu::template less_t<Ti>(), context);
+					vct_add_index_cont_0.size(), gpu::template less_t<Ti>(), context);
 
 			auto ite = vct_add_index_cont_0.getGPUIterator();
 
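
`openfpm::sort` over a key/value pair with an ascending comparator maps naturally onto CUB's radix sort. A hedged sketch of the pair-sort call pattern (CUB sorts out of place, so the real wrapper presumably double-buffers or copies back):

```cpp
#include <cub/cub.cuh>

// Illustrative CUB pair sort: ascending keys, payload permuted alongside.
// Unlike the in-place wrapper above, CUB writes to separate output buffers.
inline void sort_pairs_sketch(unsigned int * d_keys_in, unsigned int * d_keys_out,
                              unsigned int * d_vals_in, unsigned int * d_vals_out,
                              int n)
{
	void * d_temp = nullptr;
	size_t temp_bytes = 0;
	cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes,
		d_keys_in, d_keys_out, d_vals_in, d_vals_out, n);
	cudaMalloc(&d_temp, temp_bytes);
	cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes,
		d_keys_in, d_keys_out, d_vals_in, d_vals_out, n);
	cudaFree(d_temp);
}
```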
@@ -1297,7 +1296,7 @@ namespace openfpm
 			vct_add_index_unique.resize(n_ele_unique);
 
 			openfpm::sort((Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),
-							vct_add_index_unique.size(),mgpu::template less_t<Ti>(),context);
+							vct_add_index_unique.size(),gpu::template less_t<Ti>(),context);
 
 			// Then we merge the two list vct_index and vct_add_index_unique
 
@@ -1329,7 +1328,7 @@ namespace openfpm
 			//
 			openfpm::merge((Ti *)vct_index.template getDeviceBuffer<0>(),(Ti *)vct_m_index.template getDeviceBuffer<0>(),vct_index.size(),
 						(Ti *)vct_add_index_unique.template getDeviceBuffer<0>(),(Ti *)vct_add_index_unique.template getDeviceBuffer<1>(),vct_add_index_unique.size(),
-						(Ti *)vct_index_tmp.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp2.template getDeviceBuffer<0>(),mgpu::less_t<Ti>(),context);
+						(Ti *)vct_index_tmp.template getDeviceBuffer<0>(),(Ti *)vct_index_tmp2.template getDeviceBuffer<0>(),gpu::less_t<Ti>(),context);
 
 			vct_index_tmp3.resize(128*itew.wthr.x);
 
@@ -1377,7 +1376,7 @@ namespace openfpm
 		void flush_on_gpu(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
 						  vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_1,
 						  vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
-						  mgpu::ofp_context_t & context)
+						  gpu::ofp_context_t & context)
 		{
 			flush_on_gpu_insert<v_reduce ... >(vct_add_index_cont_0,vct_add_index_cont_1,vct_add_data_reord,context);
 		}
@@ -1747,7 +1746,7 @@ namespace openfpm
 		 */
 		template<typename ... v_reduce>
 		void flush_v(vector<aggregate<Ti>,Memory,layout_base,grow_p> & vct_add_index_cont_0,
-				     mgpu::ofp_context_t & context,
+				     gpu::ofp_context_t & context,
 				     flush_type opt = FLUSH_ON_HOST,
 				     int i = 0)
 		{
@@ -1771,7 +1770,7 @@ namespace openfpm
 		 */
 		template<typename ... v_reduce>
 		void flush_vd(vector<T,Memory,layout_base,grow_p> & vct_add_data_reord,
-				     mgpu::ofp_context_t & context,
+				     gpu::ofp_context_t & context,
 				     flush_type opt = FLUSH_ON_HOST)
 		{
 			// Eliminate background
@@ -1791,7 +1790,7 @@ namespace openfpm
 		 *
 		 */
 		template<typename ... v_reduce>
-		void flush(mgpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST)
+		void flush(gpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST)
 		{
 			// Eliminate background
 			vct_data.resize(vct_index.size());
@@ -1809,7 +1808,7 @@ namespace openfpm
 		 * \param opt options
 		 *
 		 */
-		void flush_remove(mgpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST)
+		void flush_remove(gpu::ofp_context_t & context, flush_type opt = FLUSH_ON_HOST)
 		{
 			vct_data.resize(vct_data.size()-1);
 
diff --git a/src/Vector/map_vector_sparse_unit_tests.cu b/src/Vector/map_vector_sparse_unit_tests.cu
index 474005b8..1b6e4f06 100644
--- a/src/Vector/map_vector_sparse_unit_tests.cu
+++ b/src/Vector/map_vector_sparse_unit_tests.cu
@@ -34,7 +34,7 @@ BOOST_AUTO_TEST_CASE ( test_sparse_vector_use )
 	vs.template insert<0>(35) = 35;
 	vs.template insert<0>(28) = 28;
 
-	mgpu::ofp_context_t ctx;
+	gpu::ofp_context_t ctx;
 	vs.template flush<sadd_<0>>(ctx);
 
 	BOOST_REQUIRE_EQUAL(vs.get<0>(5),5);
diff --git a/src/util/cuda/merge_ofp.cuh b/src/util/cuda/merge_ofp.cuh
index 367da428..f5a917cb 100644
--- a/src/util/cuda/merge_ofp.cuh
+++ b/src/util/cuda/merge_ofp.cuh
@@ -13,8 +13,7 @@
  #include "Vector/map_vector.hpp"
  #include "util/cuda_launch.hpp"
  
- #if CUDART_VERSION >= 11000
-     #ifndef CUDA_ON_CPU 
+ #ifndef CUDA_ON_CPU
      // Here we have for sure CUDA >= 11
      #ifdef __HIP__
         #undef __CUDACC__
@@ -27,13 +26,7 @@
         #include <thrust/merge.h>
         #include <thrust/execution_policy.h>
      #endif
-     #endif
- #else
-    #include <thrust/merge.h>
-    #include <thrust/execution_policy.h>
-//    #include "util/cuda/moderngpu/kernel_merge.hxx"
  #endif
- #include "util/cuda/ofp_context.hxx"
  
 
  namespace openfpm
@@ -101,10 +94,6 @@
 
         #else
 
-//            It seems broken on some CUDA on some hardware. Anyway is not anymore supported 
-//            on some hardware ... we move to thrust
-//            mgpu::merge(a_keys,a_vals,a_count,b_keys,b_vals,b_count,c_keys,c_vals,comp,context);
-
             thrust::merge_by_key(thrust::device, a_keys,a_keys + a_count, 
                                                  b_keys,b_keys + b_count, 
                                                  a_vals,b_vals,
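
With the moderngpu fallback gone, the CUDA branch rests entirely on `thrust::merge_by_key`. A minimal self-contained usage example of that primitive (merging two sorted key ranges while carrying their payloads):

```cpp
#include <thrust/device_vector.h>
#include <thrust/merge.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>

int main()
{
	int ak[] = {1, 3, 5}, bk[] = {2, 4, 6};
	int av[] = {10, 30, 50}, bv[] = {20, 40, 60};

	thrust::device_vector<int> a_keys(ak, ak + 3), b_keys(bk, bk + 3);
	thrust::device_vector<int> a_vals(av, av + 3), b_vals(bv, bv + 3);
	thrust::device_vector<int> c_keys(6), c_vals(6);

	// Merge two sorted key ranges; values follow their keys.
	thrust::merge_by_key(thrust::device,
		a_keys.begin(), a_keys.end(),
		b_keys.begin(), b_keys.end(),
		a_vals.begin(), b_vals.begin(),
		c_keys.begin(), c_vals.begin(),
		thrust::less<int>());

	// c_keys = {1,2,3,4,5,6}, c_vals = {10,20,30,40,50,60}
	return 0;
}
```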
diff --git a/src/util/cuda/modern_gpu_tests.cu b/src/util/cuda/modern_gpu_tests.cu
deleted file mode 100644
index 383d46ab..00000000
--- a/src/util/cuda/modern_gpu_tests.cu
+++ /dev/null
@@ -1,222 +0,0 @@
-#include "config.h"
-#define BOOST_TEST_DYN_LINK
-#include <boost/test/unit_test.hpp>
-
-#include "util/cuda_util.hpp"
-#include "Vector/map_vector.hpp"
-
-#ifndef CUDA_ON_CPU
-
-#ifndef __HIP__
-#include "util/cuda/moderngpu/kernel_load_balance.hxx"
-#include "util/cuda/moderngpu/kernel_mergesort.hxx"
-#include "util/cuda/moderngpu/kernel_reduce.hxx"
-#include "util/cuda/moderngpu/kernel_segreduce.hxx"
-
-
-BOOST_AUTO_TEST_SUITE( modern_gpu_tests )
-
-BOOST_AUTO_TEST_CASE( modern_gpu_loadbalance_lbs )
-{
-	std::cout << "Test modern gpu test tansform_lbs" << "\n";
-
-	mgpu::standard_context_t context(false);
-
-	int count = 200030;
-	int spacing = 100;
-
-	int num_segments = mgpu::div_up(count, spacing);
-	openfpm::vector_gpu<aggregate<int>> segments(num_segments);
-	for(int i = 0; i < num_segments; ++i)
-	{segments.template get<0>(i) = i * spacing;}
-
-	openfpm::vector_gpu<aggregate<int>>  lbs(count);
-
-	segments.template hostToDevice<0>();
-
-	mgpu::load_balance_search(count, (int *)segments.template getDeviceBuffer<0>(), num_segments, (int *)lbs.template getDeviceBuffer<0>(),context);
-
-	lbs.deviceToHost<0>();
-
-	bool check = true;
-	for(size_t i = 0; i < lbs.size(); ++i)
-	{
-	    check &= lbs.template get<0>(i) == i / spacing;
-	}
-
-	BOOST_REQUIRE_EQUAL(check,true);
-
-	std::cout << "End test modern gpu test tansform_lbs" << "\n";
-
-	// Test the cell list
-}
-
-BOOST_AUTO_TEST_CASE( modern_gpu_sort )
-{
-	std::cout << "Test modern gpu test tansform_lbs" << "\n";
-
-	mgpu::standard_context_t context(false);
-
-	int count = 200030;
-
-	openfpm::vector_gpu<aggregate<unsigned int,unsigned int>> vgpu;
-	openfpm::vector_gpu<aggregate<unsigned int>> gpu_ns;
-
-	vgpu.resize(count);
-	gpu_ns.resize(count);
-
-	for (size_t i = 0 ; i < count ; i++)
-	{
-		vgpu.template get<0>(i) = ((float)rand() / (float)RAND_MAX) * 17;
-		vgpu.template get<1>(i) = i;
-
-		gpu_ns.template get<0>(i) = vgpu.template get<0>(i);
-	}
-
-	vgpu.hostToDevice<0,1>();
-
-    mergesort((unsigned int *)vgpu.getDeviceBuffer<0>(),(unsigned int *)vgpu.getDeviceBuffer<1>(), count, mgpu::less_t<unsigned int>(), context);
-
-    vgpu.deviceToHost<0,1>();
-
-    // print
-
-    bool match = true;
-    for (int i = 0 ; i < count - 1 ; i++)
-    {
-    	match &= vgpu.template get<0>(i) <= vgpu.template get<0>(i+1);
-    	match &= gpu_ns.template get<0>(vgpu.template get<1>(i)) == vgpu.template get<0>(i);
-    }
-
-    BOOST_REQUIRE_EQUAL(match,true);
-
-	std::cout << "End test modern gpu test tansform_lbs" << "\n";
-
-	// Test the cell list
-}
-
-BOOST_AUTO_TEST_CASE( modern_gpu_reduce )
-{
-	std::cout << "Test modern gpu reduce" << "\n";
-
-	mgpu::standard_context_t context(false);
-
-	int count = 200030;
-
-	openfpm::vector_gpu<aggregate<int>> vgpu;
-
-	vgpu.resize(count);
-
-	for (size_t i = 0 ; i < count ; i++)
-	{
-		vgpu.template get<0>(i) = ((float)rand() / (float)RAND_MAX) * 17;
-	}
-
-	vgpu.hostToDevice<0>();
-
-	CudaMemory mem;
-	mem.allocate(sizeof(int));
-	mgpu::reduce((int *)vgpu.template getDeviceBuffer<0>(), count, (int *)mem.getDevicePointer(), mgpu::plus_t<int>(), context);
-
-    mem.deviceToHost();
-    int red_p = *(int *)mem.getPointer();
-
-    // print
-
-    int red = 0;
-    for (int i = 0 ; i < count ; i++)
-    {
-    	red += vgpu.template get<0>(i);
-    }
-
-    BOOST_REQUIRE_EQUAL(red,red_p);
-
-	std::cout << "End test modern gpu test reduce" << "\n";
-
-	// Test the cell list
-}
-
-
-BOOST_AUTO_TEST_CASE( modern_gpu_seg_reduce )
-{
-	std::cout << "Test modern gpu segmented reduce" << "\n";
-
-	mgpu::standard_context_t context(false);
-
-	int count = 130;
-
-	openfpm::vector_gpu<aggregate<int>> vgpu;
-	openfpm::vector_gpu<aggregate<int>> segment_offset;
-	openfpm::vector_gpu<aggregate<int>> output;
-	int init = 0;
-
-	vgpu.resize(count);
-
-	for (size_t i = 0 ; i < count ; i++)
-	{
-		vgpu.template get<0>(i) = ((float)rand() / (float)RAND_MAX) * 17;
-	}
-
-	segment_offset.add();
-	segment_offset.template get<0>(0) = 0;
-	size_t base = 0;
-	while (1)
-	{
-		int c = ((float)rand() / (float)RAND_MAX) * 17;
-
-		if (c + base >= count)
-		{break;}
-
-		segment_offset.add();
-		segment_offset.template get<0>(segment_offset.size() - 1) = c + segment_offset.template get<0>(segment_offset.size() - 2);
-
-		base += c;
-	}
-
-	vgpu.hostToDevice<0>();
-	segment_offset.hostToDevice<0>();
-	output.resize(segment_offset.size());
-
-	mgpu::segreduce((int *)vgpu.template getDeviceBuffer<0>(), vgpu.size(),
-					(int *)segment_offset.template getDeviceBuffer<0>(), segment_offset.size(),
-					(int *)output.template getDeviceBuffer<0>(),
-					mgpu::plus_t<int>(), init, context);
-
-
-	output.template deviceToHost<0>();
-
-	bool match = true;
-	size_t i = 0;
-	for ( ; i < segment_offset.size()-1 ; i++)
-	{
-		size_t red = 0;
-		for (size_t j = 0 ; j < segment_offset.template get<0>(i+1) - segment_offset.template get<0>(i)  ; j++)
-		{
-			red += vgpu.template get<0>(segment_offset.template get<0>(i) + j);
-		}
-		match &= red == output.template get<0>(i);
-	}
-
-	BOOST_REQUIRE_EQUAL(match,true);
-
-	size_t red2 = 0;
-	for (size_t j = 0 ; j < vgpu.size() - segment_offset.template get<0>(i)  ; j++)
-	{
-		red2 += vgpu.template get<0>(segment_offset.template get<0>(i) + j);
-	}
-	match &= red2 == output.template get<0>(i);
-
-	BOOST_REQUIRE_EQUAL(match,true);
-
-	std::cout << "End test modern gpu test reduce" << "\n";
-
-	// Test the cell list
-}
-
-
-BOOST_AUTO_TEST_SUITE_END()
-
-#endif
-
-#endif
-
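
The removed suite exercised moderngpu's load-balance, mergesort, reduce and segmented-reduce kernels. Equivalent smoke coverage for the plain reduction under CUB would follow the usual two-phase pattern; an illustrative sketch, not part of the patch:

```cpp
#include <cub/cub.cuh>

// Illustrative CUB counterpart of the deleted mgpu::reduce smoke test:
// sum count integers on the device into *d_out.
inline void device_sum_sketch(const int * d_in, int * d_out, int count)
{
	void * d_temp = nullptr;
	size_t temp_bytes = 0;
	cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, count);
	cudaMalloc(&d_temp, temp_bytes);
	cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, count);
	cudaFree(d_temp);
}
```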
diff --git a/src/util/cuda/moderngpu/context.hxx b/src/util/cuda/moderngpu/context.hxx
deleted file mode 100644
index 93af53d9..00000000
--- a/src/util/cuda/moderngpu/context.hxx
+++ /dev/null
@@ -1,221 +0,0 @@
-#pragma once
-
-#include <vector>
-#include <memory>
-#include <cassert>
-#include <exception>
-#include "util.hxx"
-#include "launch_params.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-enum memory_space_t { 
-  memory_space_device = 0, 
-  memory_space_host = 1 
-};
-
-
-inline std::string device_prop_string(cudaDeviceProp prop) {
-  int ordinal;
-  cudaGetDevice(&ordinal);
-
-  size_t freeMem, totalMem;
-  cudaError_t result = cudaMemGetInfo(&freeMem, &totalMem);
-  if(cudaSuccess != result) throw cuda_exception_t(result);  
-
-  double memBandwidth = (prop.memoryClockRate * 1000.0) *
-    (prop.memoryBusWidth / 8 * 2) / 1.0e9;
-
-  std::string s = detail::stringprintf(
-    "%s : %8.3lf Mhz   (Ordinal %d)\n"
-    "%d SMs enabled. Compute Capability sm_%d%d\n"
-    "FreeMem: %6dMB   TotalMem: %6dMB   %2d-bit pointers.\n"
-    "Mem Clock: %8.3lf Mhz x %d bits   (%5.1lf GB/s)\n"
-    "ECC %s\n\n",
-    prop.name, prop.clockRate / 1000.0, ordinal,
-    prop.multiProcessorCount, prop.major, prop.minor,
-    (int)(freeMem / (1<< 20)), (int)(totalMem / (1<< 20)), 8 * sizeof(int*),
-    prop.memoryClockRate / 1000.0, prop.memoryBusWidth, memBandwidth,
-    prop.ECCEnabled ? "Enabled" : "Disabled");
-  return s;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// context_t
-// Derive context_t to add support for streams and a custom allocator.
-
-struct context_t {
-  context_t() = default;
-
-  // Disable copy ctor and assignment operator. We don't want to let the
-  // user copy only a slice.
-  context_t(const context_t& rhs) = delete;
-  context_t& operator=(const context_t& rhs) = delete;
-
-  virtual const cudaDeviceProp& props() const = 0; 
-  virtual int ptx_version() const = 0;
-  virtual cudaStream_t stream() = 0;
-
-  // Alloc GPU memory.
-  virtual void* alloc(size_t size, memory_space_t space) = 0;
-  virtual void free(void* p, memory_space_t space) = 0;
-
-  // cudaStreamSynchronize or cudaDeviceSynchronize for stream 0.
-  virtual void synchronize() = 0;
-
-  virtual cudaEvent_t event() = 0;
-  virtual void timer_begin() = 0;
-  virtual double timer_end() = 0;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// standard_context_t is a trivial implementation of context_t. Users can
-// derive this type to provide a custom allocator.
-
-class standard_context_t : public context_t {
-protected:
-  cudaDeviceProp _props;
-  int _ptx_version;
-  cudaStream_t _stream;
-
-  cudaEvent_t _timer[2];
-  cudaEvent_t _event;
-
-  // Making this a template argument means we won't generate an instance
-  // of dummy_k for each translation unit. 
-  template<int dummy_arg = 0>
-  void init() {
-    cudaFuncAttributes attr;
-    cudaError_t result = cudaFuncGetAttributes(&attr, (void *)dummy_k<0>);
-    if(cudaSuccess != result) throw cuda_exception_t(result);
-    _ptx_version = attr.ptxVersion;
-
-    int ord;
-    cudaGetDevice(&ord);
-    cudaGetDeviceProperties(&_props, ord);
-    
-    cudaEventCreate(&_timer[0]);
-    cudaEventCreate(&_timer[1]);
-    cudaEventCreate(&_event);    
-  }
-
-public:
-  standard_context_t(bool print_prop = true, cudaStream_t stream_ = 0) : 
-    context_t(), _stream(stream_) {
-
-    init();
-    if(print_prop) {
-      printf("%s\n", device_prop_string(_props).c_str());
-    }
-  }
-  ~standard_context_t() {
-    cudaEventDestroy(_timer[0]);
-    cudaEventDestroy(_timer[1]);
-    cudaEventDestroy(_event);
-  }
-
-  virtual const cudaDeviceProp& props() const { return _props; }
-  virtual int ptx_version() const { return _ptx_version; }
-  virtual cudaStream_t stream() { return _stream; }
-
-  // Alloc GPU memory.
-  virtual void* alloc(size_t size, memory_space_t space) {
-    void* p = nullptr;
-    if(size) {
-      cudaError_t result = (memory_space_device == space) ? 
-        cudaMalloc(&p, size) :
-        cudaMallocHost(&p, size);
-      if(cudaSuccess != result) throw cuda_exception_t(result);
-    }
-    return p;    
-  }
-
-  virtual void free(void* p, memory_space_t space) {
-    if(p) {
-      cudaError_t result = (memory_space_device == space) ? 
-        cudaFree(p) :
-        cudaFreeHost(p);
-      if(cudaSuccess != result) throw cuda_exception_t(result);
-    }
-  }
-
-  virtual void synchronize() {
-    cudaError_t result = _stream ? 
-      cudaStreamSynchronize(_stream) : 
-      cudaDeviceSynchronize();
-    if(cudaSuccess != result) throw cuda_exception_t(result);
-  }
-
-  virtual cudaEvent_t event() {
-    return _event;
-  }
-  virtual void timer_begin() {
-    cudaEventRecord(_timer[0], _stream);
-  }
-  virtual double timer_end() {
-    cudaEventRecord(_timer[1], _stream);
-    cudaEventSynchronize(_timer[1]);
-    float ms;
-    cudaEventElapsedTime(&ms, _timer[0], _timer[1]);
-    return ms / 1.0e3;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// mem_t
-
-template<typename type_t>
-class mem_t {
-  context_t* _context;
-  type_t* _pointer;
-  size_t _size;
-  memory_space_t _space;
-
-public:
-  void swap(mem_t& rhs) {
-    std::swap(_context, rhs._context);
-    std::swap(_pointer, rhs._pointer);
-    std::swap(_size, rhs._size);
-    std::swap(_space, rhs._space);
-  }
-
-  mem_t() : _context(nullptr), _pointer(nullptr), _size(0), 
-    _space(memory_space_device) { }
-  mem_t& operator=(const mem_t& rhs) = delete;
-  mem_t(const mem_t& rhs) = delete;
-
-  mem_t(size_t size, context_t& context, 
-    memory_space_t space = memory_space_device) :
-    _context(&context), _pointer(nullptr), _size(size), _space(space) {
-    _pointer = (type_t*)context.alloc(sizeof(type_t) * size, space);
-  }
-
-  mem_t(mem_t&& rhs) : mem_t() {
-    swap(rhs);
-  }
-  mem_t& operator=(mem_t&& rhs) {
-    swap(rhs);
-    return *this;
-  }
-
-  ~mem_t() {
-    if(_context && _pointer) _context->free(_pointer, _space);
-    _pointer = nullptr;
-    _size = 0;
-  }
-
-  context_t& context() { return *_context; }
-  size_t size() const { return _size; }
-  type_t* data() const { return _pointer; }
-  memory_space_t space() const { return _space; }
-
-  // Return a deep copy of this container.
-  mem_t clone() {
-    mem_t cloned(size(), context(), space());
-    if(memory_space_device) dtod(cloned.data(), data(), size());
-    else htoh(cloned.data(), data(), size());
-    return cloned;
-  }
-};
-
-END_MGPU_NAMESPACE
diff --git a/src/util/cuda/moderngpu/context_reduced.hxx b/src/util/cuda/moderngpu/context_reduced.hxx
deleted file mode 100644
index 88544a7e..00000000
--- a/src/util/cuda/moderngpu/context_reduced.hxx
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * context_reduced.hxx
- *
- *  Created on: Dec 27, 2018
- *      Author: i-bird
- */
-
-#ifndef CONTEXT_REDUCED_HXX_
-#define CONTEXT_REDUCED_HXX_
-
-#include <cstdarg>
-#include <string>
-
-
-namespace mgpu {
-
-enum memory_space_t {
-  memory_space_device = 0,
-  memory_space_host = 1
-};
-
-struct cuda_exception_t : std::exception {
-  cudaError_t result;
-
-  cuda_exception_t(cudaError_t result_) : result(result_) { }
-  virtual const char* what() const noexcept {
-    return cudaGetErrorString(result);
-  }
-};
-
-namespace detail {
-
-inline std::string stringprintf(const char* format, ...) {
-  va_list args;
-  va_start(args, format);
-  int len = vsnprintf(0, 0, format, args);
-  va_end(args);
-
-  // allocate space.
-  std::string text;
-  text.resize(len);
-
-  va_start(args, format);
-  vsnprintf(&text[0], len + 1, format, args);
-  va_end(args);
-
-  return text;
-}
-
-} // namespace detail
-
-inline std::string device_prop_string(cudaDeviceProp prop) {
-  int ordinal;
-  cudaGetDevice(&ordinal);
-
-  size_t freeMem, totalMem;
-  cudaError_t result = cudaMemGetInfo(&freeMem, &totalMem);
-  if(cudaSuccess != result) throw cuda_exception_t(result);
-
-  double memBandwidth = (prop.memoryClockRate * 1000.0) *
-    (prop.memoryBusWidth / 8 * 2) / 1.0e9;
-
-  std::string s = detail::stringprintf(
-    "%s : %8.3lf Mhz   (Ordinal %d)\n"
-    "%d SMs enabled. Compute Capability sm_%d%d\n"
-    "FreeMem: %6dMB   TotalMem: %6dMB   %2d-bit pointers.\n"
-    "Mem Clock: %8.3lf Mhz x %d bits   (%5.1lf GB/s)\n"
-    "ECC %s\n\n",
-    prop.name, prop.clockRate / 1000.0, ordinal,
-    prop.multiProcessorCount, prop.major, prop.minor,
-    (int)(freeMem / (1<< 20)), (int)(totalMem / (1<< 20)), 8 * sizeof(int*),
-    prop.memoryClockRate / 1000.0, prop.memoryBusWidth, memBandwidth,
-    prop.ECCEnabled ? "Enabled" : "Disabled");
-  return s;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// context_t
-// Derive context_t to add support for streams and a custom allocator.
-
-struct context_t {
-  context_t() = default;
-
-  // Disable copy ctor and assignment operator. We don't want to let the
-  // user copy only a slice.
-  context_t(const context_t& rhs) = delete;
-  context_t& operator=(const context_t& rhs) = delete;
-
-  virtual const cudaDeviceProp& props() const = 0;
-  virtual int ptx_version() const = 0;
-  virtual cudaStream_t stream() = 0;
-
-  // Alloc GPU memory.
-  virtual void* alloc(size_t size, memory_space_t space) = 0;
-  virtual void free(void* p, memory_space_t space) = 0;
-
-  // cudaStreamSynchronize or cudaDeviceSynchronize for stream 0.
-  virtual void synchronize() = 0;
-
-  virtual cudaEvent_t event() = 0;
-  virtual void timer_begin() = 0;
-  virtual double timer_end() = 0;
-};
-
-}
-
-#endif /* CONTEXT_REDUCED_HXX_ */
diff --git a/src/util/cuda/moderngpu/cpp11.hxx b/src/util/cuda/moderngpu/cpp11.hxx
deleted file mode 100644
index 7b0dad23..00000000
--- a/src/util/cuda/moderngpu/cpp11.hxx
+++ /dev/null
@@ -1,154 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-#include "tuple.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-///////////////////////
-// tuple_iterator_value
-
-template<typename tpl_t>
-struct tuple_iterator_value;
-
-template<typename... args_t>
-struct tuple_iterator_value<tuple<args_t...> > {
-  typedef tuple<typename std::iterator_traits<args_t>::value_type...> type;
-};
-
-template<typename tpl_t>
-using tuple_iterator_value_t = typename tuple_iterator_value<tpl_t>::type;
-
-////////////////////////////////////
-// load and store to pointer tuples.
-
-namespace detail {
-
-template<typename int_t, typename... pointers_t, size_t... seq_i>
-MGPU_HOST_DEVICE auto _lvalue_dereference(tuple<pointers_t...> pointers, 
-  index_sequence<seq_i...> seq, int_t index) ->
-  decltype(forward_as_tuple(get<seq_i>(pointers)[0]...)) {
-
-  return forward_as_tuple(get<seq_i>(pointers)[index]...);
-}
-
-}
-
-// Returns lvalues for each of the dereferenced pointers in the tuple.
-template<typename int_t, typename... pointers_t>
-MGPU_HOST_DEVICE auto dereference(tuple<pointers_t...> pointers, 
-  int_t index) -> decltype(detail::_lvalue_dereference(pointers, 
-    make_index_sequence<sizeof...(pointers_t)>(), index)) {
-
-  return detail::_lvalue_dereference(pointers, 
-    make_index_sequence<sizeof...(pointers_t)>(), index);
-}
-
-template<typename int_t, typename... pointers_t>
-MGPU_HOST_DEVICE void store(tuple<pointers_t...> pointers, 
-  tuple_iterator_value_t<tuple<pointers_t...> > values, 
-  int_t index) {
-
-  dereference(pointers, index) = values;
-}
-
-template<typename int_t, typename... pointers_t>
-tuple_iterator_value_t<tuple<pointers_t...> > 
-MGPU_HOST_DEVICE load(tuple<pointers_t...> pointers, int_t index) {
-  typedef tuple_iterator_value_t<tuple<pointers_t...> > value_t;
-  return value_t(dereference(pointers, index));
-}
-
-/////////////////////////////
-// Tuple comparison operators
-
-namespace detail {
-template<size_t i, size_t count>
-struct _tuple_compare {
-  template<typename tpl_t>
-  MGPU_HOST_DEVICE static bool eq(const tpl_t a, const tpl_t b) {
-    return get<i>(a) == get<i>(b) && _tuple_compare<i + 1, count>::eq(a, b);
-  }
-
-  template<typename tpl_t>
-  MGPU_HOST_DEVICE static bool less(const tpl_t a, const tpl_t b) {
-    return get<i>(a) < get<i>(b) || 
-      (!(get<i>(b) < get<i>(a)) && _tuple_compare<i + 1, count>::less(a, b));
-  }
-};
-
-template<size_t count>
-struct _tuple_compare<count, count> {
-  template<typename tpl_t>
-  MGPU_HOST_DEVICE static bool eq(const tpl_t, const tpl_t) {
-    return true;
-  }
-
-  template<typename tpl_t>
-  MGPU_HOST_DEVICE static bool less(const tpl_t, const tpl_t) {
-    return false;
-  }
-};
-
-} // namespace detail
-
-//////////////////////////////////////////////
-// Size of the largest component in the tuple.
-
-template<size_t... values>
-struct var_max;
-
-template<size_t value_, size_t... values_> 
-struct var_max<value_, values_...> {
-  constexpr static size_t value = max(value_, var_max<values_...>::value);
-};
-
-template<size_t value_>
-struct var_max<value_> {
-  constexpr static size_t value = value_;
-};
-
-template<> struct var_max<> {
-  constexpr static size_t value = 0;
-};
-
-template<typename tpl_t>
-struct tuple_union_size;
-
-template<typename... args_t>
-struct tuple_union_size<tuple<args_t...> > {
-  constexpr static size_t value = var_max<sizeof(args_t)...>::value;
-};
-
-END_MGPU_NAMESPACE
-
-// Putting comparison operators back into global namespace.
-template<typename... args_t>
-MGPU_HOST_DEVICE bool operator<(const mgpu::tuple<args_t...>& a, 
-  const mgpu::tuple<args_t...>& b) {
-  return mgpu::detail::_tuple_compare<0, sizeof...(args_t)>::less(a, b);
-}
-template<typename... args_t>
-MGPU_HOST_DEVICE bool operator<=(const mgpu::tuple<args_t...>& a, 
-  const mgpu::tuple<args_t...>& b) {
-  return !(b < a);
-}
-template<typename... args_t>
-MGPU_HOST_DEVICE bool operator>(const mgpu::tuple<args_t...>& a, 
-  const mgpu::tuple<args_t...>& b) {
-  return b < a;
-}
-template<typename... args_t>
-MGPU_HOST_DEVICE bool operator>=(const mgpu::tuple<args_t...>& a, 
-  const mgpu::tuple<args_t...>& b) {
-  return !(a < b);
-}
-template<typename... args_t>
-MGPU_HOST_DEVICE bool operator==(const mgpu::tuple<args_t...>& a, 
-  const mgpu::tuple<args_t...>& b) {
-  return mgpu::detail::_tuple_compare<0, sizeof...(args_t)>::eq(a, b);
-}
-template<typename... args_t>
-MGPU_HOST_DEVICE bool operator!=(const mgpu::tuple<args_t...>& a, 
-  const mgpu::tuple<args_t...>& b) {
-  return !(a == b);
-}
diff --git a/src/util/cuda/moderngpu/cta_load_balance.hxx b/src/util/cuda/moderngpu/cta_load_balance.hxx
deleted file mode 100644
index c397b789..00000000
--- a/src/util/cuda/moderngpu/cta_load_balance.hxx
+++ /dev/null
@@ -1,263 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-#include "cta_merge.hxx"
-#include "operators.hxx"
-#include "cpp11.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-struct lbs_placement_t {
-  merge_range_t range;    // The merge range of *loaded* values. 
-                          // May extend b_range one element in each direction.
-  int a_index;            // Starting A index for merge.
-  int b_index;            // Starting B index for merge.
-};
-
-template<int nt, int vt, typename segments_it>
-MGPU_DEVICE lbs_placement_t cta_load_balance_place(int tid, 
-  merge_range_t range, int count, segments_it segments, int num_segments,
-  int* b_shared) {
-
-  // We want to know the value of the segment ID for the segment starting
-  // this tile. Load it by decrementing range.b_begin.
-  int load_preceding = 0 < range.b_begin;
-  range.b_begin -= load_preceding;
-
-  // Load a trailing member of the segment ID array. This lets us read one past
-  // the last member: b_key = b_shared[++b0]. Note the use of prefix increment,
-  // which gets the beginning of the next identifier, not the current one.
-  if(range.b_end < num_segments && range.a_end < count)
-    ++range.b_end;
-
-  int load_count = range.b_count();
-  int fill_count = nt * vt + 1 + load_preceding - load_count - range.a_count();
-
-  // Fill the end of the array with dest_count.
-  for(int i = tid; i < fill_count; i += nt)
-    b_shared[load_count + i] = count;
-
-  // Load the segments descriptors into the front of the indices array.
-  // TODO: SUBTRACT OUT A_BEGIN FROM B_BEGIN SO WE CAN DO 32-BIT COMPARISONS!
-  for(int i = tid; i < load_count; i += nt)
-    b_shared[i] = segments[range.b_begin + i];
-  __syncthreads();
-
-  // Run a merge path search to find the start of the serial merge for
-  // each thread. If we loaded a preceding value from B, increment the 
-  // cross-diagonal so that we don't redundantly process it.
-  int diag = vt * tid + load_preceding;
-  int mp = merge_path<bounds_upper>(counting_iterator_t<int>(range.a_begin),
-    range.a_count(), b_shared, load_count + fill_count, diag, less_t<int>());
-  __syncthreads();
-
-  // Get the starting points for the merge for A and B. Why do we subtract 1
-  // from B? At the start of the array, we are pointing to output 0 and 
-  // segment 0. But we don't really start merging A until we've encountered
-  // its start flag at B. That is, the first iteration should increment b_index
-  // to 0, then start merging from the first segment of A, so b_index needs to
-  // start at -1.
-  int a_index = range.a_begin + mp;
-  int b_index = range.b_begin + (diag - mp) - 1;
-
-  return lbs_placement_t {
-    range, a_index, b_index
-  };
-}
-
-struct lbs_fill_t {
-  merge_range_t range;
-  int b_offset;
-};
-
-template<int nt, int vt, typename segments_it, typename partition_it>
-MGPU_DEVICE lbs_fill_t cta_load_balance_fill(int count, 
-  segments_it segments, int num_segments, int tid, int cta, 
-  partition_it partitions, int* shared) {
- 
-  merge_range_t range = compute_merge_range(count, num_segments, cta, 
-    nt * vt, partitions[cta], partitions[cta + 1]);
-
-  int* a_shared = shared - range.a_begin;
-  int* b_shared = shared + range.a_count();
-
-  lbs_placement_t placement = cta_load_balance_place<nt, vt>(tid, range, 
-    count, segments, num_segments, b_shared);
-
-  // Adjust the b pointer by the loaded b_begin. This lets us dereference it
-  // directly with the segment index.
-  b_shared -= placement.range.b_begin;
-
-  // Fill shared memory with the segment IDs of the in-range values.
-  int cur_item = placement.a_index;
-  int cur_segment = placement.b_index;
-
-  iterate<vt>([&](int i) {
-    bool p = cur_item < b_shared[cur_segment + 1];
-    if(p) a_shared[cur_item++] = cur_segment;
-    else ++cur_segment;
-  });
-  __syncthreads();
-
-  return lbs_fill_t {
-    range,
-    range.a_count() - placement.range.b_begin
-  };
-}
-
-template<int nt, int vt>
-struct cta_load_balance_t {
-  enum { nv = nt * vt };
-  struct storage_t {
-    int indices[nv + 2];
-  };
-
-  struct result_t {
-    lbs_placement_t placement;
-    merge_range_t merge_range;
-
-    // thread-order data.
-    int merge_flags;
-
-    // strided-order data.
-    array_t<int, vt> indices;
-    array_t<int, vt> segments;
-    array_t<int, vt> ranks;
-  };
-
-  template<typename segments_it, typename partition_it>
-  MGPU_DEVICE result_t load_balance(int count, segments_it segments, 
-    int num_segments, int tid, int cta, partition_it partitions, 
-    storage_t& storage) const {
-
-    merge_range_t range = compute_merge_range(count, num_segments, cta, 
-      nv, partitions[cta], partitions[cta + 1]);
-
-    int* a_shared = storage.indices - range.a_begin;
-    int* b_shared = storage.indices + range.a_count();
-
-    lbs_placement_t placement  = cta_load_balance_place<nt, vt>(tid, range, 
-      count, segments, num_segments, b_shared);
-
-    // Adjust the b pointer by the loaded b_begin. This lets us dereference it
-    // directly with the segment index.
-    b_shared -= placement.range.b_begin;
-
-    // Store the segment of each element in A.
-    int cur_item = placement.a_index;
-    int cur_segment = placement.b_index;
-    int merge_flags = 0;
-
-    // Fill shared memory with the segment IDs of the in-range values.
-    iterate<vt + 1>([&](int i) {
-      // Compare the output index to the starting position of the next segment.
-      bool p = cur_item < b_shared[cur_segment + 1];
-      if(p && i < vt) // Advance A (the needle). 
-        a_shared[cur_item++] = cur_segment;
-      else  // Advance B (the haystack)
-        ++cur_segment;
-      merge_flags |= (int)p<< i;
-    });
-    __syncthreads();
-
-    // Load the segment indices in strided order. Use the segment ID to compute
-    // rank of each element. These strided-order (index, seg, rank) tuples
-    // will be passed to the lbs functor.
-    array_t<int, vt> indices, seg, ranks;
-    iterate<vt>([&](int i) {
-      int j = nt * i + tid;
-      indices[i] = range.a_begin + j;
-      if(j < range.a_count()) {
-        seg[i] = storage.indices[j];
-        ranks[i] = indices[i] - b_shared[seg[i]];
-      } else {
-        seg[i] = range.b_begin;
-        ranks[i] = -1;
-      }
-    });
-    __syncthreads();
-
-    return result_t { 
-      placement, range, merge_flags,
-      indices, seg, ranks
-    };
-  }
-};
-
-
-namespace detail {
-
-template<int nt, typename pointers_t>
-struct cached_segment_load_t {
-
-  enum { size = tuple_size<pointers_t>:: value };
-  typedef make_index_sequence<size> seq_t;
-  typedef tuple_iterator_value_t<pointers_t> value_t;
-
-  template<typename seq_t>
-  struct load_storage_t;
-
-  template<size_t... seq_i>
-  struct load_storage_t<index_sequence<seq_i...> > {
-    tuple<
-      array_t<typename tuple_element<seq_i, value_t>::type, nt>...
-    > data;
-
-    MGPU_HOST_DEVICE void store_value(const value_t& value, int index) {
-      swallow(get<seq_i>(data)[index] = get<seq_i>(value)...);
-    }
-
-    MGPU_HOST_DEVICE value_t load_value(int index) const {
-      return make_tuple(get<seq_i>(data)[index]...);
-    }
-  };
-
-  typedef load_storage_t<seq_t> storage_t;
-
-  template<int vt0, int vt>
-  MGPU_DEVICE static array_t<value_t, vt> load(int tid, int count,
-    range_t range, array_t<int, vt> segments, storage_t& storage, 
-    pointers_t iterators) {
-    
-    array_t<value_t, vt> loaded;
-    if(range.count() <= nt) {
-      // Cached load through shared memory.
-      if(tid < range.count()) {
-        value_t value = mgpu::load(iterators, range.begin + tid);
-        storage.store_value(value, tid);
-      }
-      __syncthreads();
-
-      // Load the values into register.
-      strided_iterate<nt, vt, vt0>([&](int i, int j) {
-        loaded[i] = storage.load_value(segments[i] - range.begin);
-      }, tid, count);
-      __syncthreads();
-
-    } else {
-      // Direct load.
-      strided_iterate<nt, vt, vt0>([&](int i, int j) {
-        loaded[i] = mgpu::load(iterators, segments[i]);      
-      }, tid, count);
-    }
-
-    return loaded;
-  }
-};
-
-template<int nt>
-struct cached_segment_load_t<nt, tuple<> > {
-  typedef empty_t storage_t;
-  typedef tuple<> value_t;
-
-  template<int vt0, int vt>
-  MGPU_DEVICE static array_t<value_t, vt> load(int tid, int count,
-    range_t range, array_t<int, vt> segments, storage_t& storage,
-    tuple<> iterators) {
-
-    return array_t<value_t, vt>();
-  }
-};
-
-} // namespace detail 
-
-END_MGPU_NAMESPACE
diff --git a/src/util/cuda/moderngpu/cta_merge.hxx b/src/util/cuda/moderngpu/cta_merge.hxx
deleted file mode 100644
index 9ff38db3..00000000
--- a/src/util/cuda/moderngpu/cta_merge.hxx
+++ /dev/null
@@ -1,209 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "loadstore.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-template<bounds_t bounds = bounds_lower, typename a_keys_it,
-  typename b_keys_it, typename int_t, typename comp_t>
-MGPU_HOST_DEVICE int_t merge_path(a_keys_it a_keys, int_t a_count,
-  b_keys_it b_keys, int_t b_count, int_t diag, comp_t comp) {
-
-  typedef typename std::iterator_traits<a_keys_it>::value_type type_t;
-  int_t begin = max(0, diag - b_count);
-  int_t end = min(diag, a_count);
-
-  while(begin < end) {
-    int_t mid = (begin + end) / 2;
-    type_t a_key = a_keys[mid];
-    type_t b_key = b_keys[diag - 1 - mid];
-    bool pred = (bounds_upper == bounds) ?
-      comp(a_key, b_key) :
-      !comp(b_key, a_key);
-
-    if(pred) begin = mid + 1;
-    else end = mid;
-  }
-  return begin;
-}
-
-template<bounds_t bounds, typename keys_it, typename comp_t>
-MGPU_HOST_DEVICE int merge_path(keys_it keys, merge_range_t range,
-  int diag, comp_t comp) {
-
-  return merge_path<bounds>(
-    keys + range.a_begin, range.a_count(),
-    keys + range.b_begin, range.b_count(),
-    diag, comp);
-}
-
-template<bounds_t bounds, bool range_check, typename type_t, typename comp_t>
-MGPU_HOST_DEVICE bool merge_predicate(type_t a_key, type_t b_key, 
-  merge_range_t range, comp_t comp) {
-
-  bool p;
-  if(range_check && !range.a_valid()) p = false;
-  else if(range_check && !range.b_valid()) p = true;
-  else p = (bounds_upper == bounds) ? comp(a_key, b_key) : !comp(b_key, a_key);
-  return p;
-}
-
-MGPU_HOST_DEVICE merge_range_t compute_merge_range(int a_count, int b_count,
-  int partition, int spacing, int mp0, int mp1) {
-
-  int diag0 = spacing * partition;
-  int diag1 = min(a_count + b_count, diag0 + spacing);
-
-  return merge_range_t { mp0, mp1, diag0 - mp0, diag1 - mp1 };
-}
-
-
-// Specialization that emits just one LD instruction. Can only reliably used
-// with raw pointer types. Fixed not to use pointer arithmetic so that 
-// we don't get undefined behaviors with unaligned types.
-template<int nt, int vt, typename type_t>
-MGPU_DEVICE array_t<type_t, vt> 
-load_two_streams_reg(const type_t* a, int a_count, 
-  const type_t* b, int b_count, int tid) {
-
-  b -= a_count;
-  array_t<type_t, vt> x;
-  strided_iterate<nt, vt>([&](int i, int index) {
-    const type_t* p = (index >= a_count) ? b : a;
-    x[i] = p[index];
-  }, tid, a_count + b_count);
-
-  return x;  
-}
-
-template<int nt, int vt, typename type_t, typename a_it, typename b_it>
-MGPU_DEVICE 
-enable_if_t<
-  !(std::is_pointer<a_it>::value && std::is_pointer<b_it>::value), 
-  array_t<type_t, vt> 
-> load_two_streams_reg(a_it a, int a_count, b_it b, int b_count, int tid) {
-  b -= a_count;
-  array_t<type_t, vt> x;
-  strided_iterate<nt, vt>([&](int i, int index) {
-    x[i] = (index < a_count) ? a[index] : b[index];
-  }, tid, a_count + b_count);
-  return x;
-}
-
-template<int nt, int vt, typename a_it, typename b_it, typename type_t,
-  int shared_size>
-MGPU_DEVICE void load_two_streams_shared(a_it a, int a_count,
-  b_it b, int b_count, int tid, type_t (&shared)[shared_size], 
-  bool sync = true) {
-
-  // Load into register then make an unconditional strided store into memory.
-  array_t<type_t, vt> x = load_two_streams_reg<nt, vt, type_t>(
-    a, a_count, b, b_count, tid);
-  reg_to_shared_strided<nt>(x, tid, shared, sync);
-}
-
-template<int nt, int vt, typename type_t>
-MGPU_DEVICE array_t<type_t, vt> gather_two_streams_strided(const type_t* a,
-  int a_count, const type_t* b, int b_count, array_t<int, vt> indices,
-  int tid) {
-
-  ptrdiff_t b_offset = b - a - a_count;
-  int count = a_count + b_count;
-
-  array_t<type_t, vt> x;
-  strided_iterate<nt, vt>([&](int i, int j) { 
-    ptrdiff_t gather = indices[i];
-    if(gather >= a_count) gather += b_offset;
-    x[i] = a[gather];
-  }, tid, count);
-
-  return x;
-}
-template<int nt, int vt, typename type_t, typename a_it, typename b_it>
-MGPU_DEVICE 
-enable_if_t<
-  !(std::is_pointer<a_it>::value && std::is_pointer<b_it>::value), 
-  array_t<type_t, vt> 
-> gather_two_streams_strided(a_it a,
-  int a_count, b_it b, int b_count, array_t<int, vt> indices, int tid) {
-
-  b -= a_count;
-  array_t<type_t, vt> x;
-  strided_iterate<nt, vt>([&](int i, int j) { 
-    x[i] = (indices[i] < a_count) ? a[indices[i]] : b[indices[i]];
-  }, tid, a_count + b_count);
-
-  return x;
-}
-
-template<int nt, int vt, typename a_it, typename b_it, typename c_it>
-MGPU_DEVICE void transfer_two_streams_strided(a_it a, int a_count, b_it b, 
-  int b_count, array_t<int, vt> indices, int tid, c_it c) {
-
-  typedef typename std::iterator_traits<a_it>::value_type type_t;
-  array_t<type_t, vt> x = gather_two_streams_strided<nt, vt, type_t>(a, 
-    a_count, b, b_count, indices, tid);
-
-  reg_to_mem_strided<nt>(x, tid, a_count + b_count, c);
-}
-
-
-// This function must be able to dereference keys[a_begin] and keys[b_begin],
-// no matter the indices for each. The caller should allocate at least 
-// nt * vt + 1 elements for 
-template<bounds_t bounds, int vt, typename type_t, typename comp_t>
-MGPU_DEVICE merge_pair_t<type_t, vt> 
-serial_merge(const type_t* keys_shared, merge_range_t range, comp_t comp, 
-  bool sync = true) {
-
-  type_t a_key = keys_shared[range.a_begin];
-  type_t b_key = keys_shared[range.b_begin];
-
-  merge_pair_t<type_t, vt> merge_pair;
-  iterate<vt>([&](int i) {
-    bool p = merge_predicate<bounds, true>(a_key, b_key, range, comp);
-    int index = p ? range.a_begin : range.b_begin;
-
-    merge_pair.keys[i] = p ? a_key : b_key;
-    merge_pair.indices[i] = index;
-
-    type_t c_key = keys_shared[++index];
-    if(p) a_key = c_key, range.a_begin = index;
-    else b_key = c_key, range.b_begin = index;
-  });
-
-  if(sync) __syncthreads();
-  return merge_pair;
-}
-
-// Load arrays a and b from global memory and merge into register.
-template<bounds_t bounds, int nt, int vt, typename a_it, typename b_it, 
-  typename type_t, typename comp_t, int shared_size>
-MGPU_DEVICE merge_pair_t<type_t, vt> 
-cta_merge_from_mem(a_it a, b_it b, merge_range_t range_mem, int tid, 
-  comp_t comp, type_t (&keys_shared)[shared_size]) {
-
-  static_assert(shared_size >= nt * vt + 1, 
-    "cta_merge_from_mem requires temporary storage of at "
-    "least nt * vt + 1 items");
-
-  // Load the data into shared memory.
-  load_two_streams_shared<nt, vt>(a + range_mem.a_begin, range_mem.a_count(),
-    b + range_mem.b_begin, range_mem.b_count(), tid, keys_shared, true);
-
-  // Run a merge path to find the start of the serial merge for each thread.
-  merge_range_t range_local = range_mem.to_local();
-  int diag = vt * tid;
-  int mp = merge_path<bounds>(keys_shared, range_local, diag, comp);
-
-  // Compute the ranges of the sources in shared memory. The end iterators
-  // of the range are inaccurate, but still facilitate exact merging, because
-  // only vt elements will be merged.
-  merge_pair_t<type_t, vt> merged = serial_merge<bounds, vt>(keys_shared,
-    range_local.partition(mp, diag), comp);
-
-  return merged;
-};
-
-END_MGPU_NAMESPACE
diff --git a/src/util/cuda/moderngpu/cta_mergesort.hxx b/src/util/cuda/moderngpu/cta_mergesort.hxx
deleted file mode 100644
index 69a60fa8..00000000
--- a/src/util/cuda/moderngpu/cta_mergesort.hxx
+++ /dev/null
@@ -1,140 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "cta_merge.hxx"
-#include "sort_networks.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-MGPU_HOST_DEVICE int out_of_range_flags(int first, int vt, int count) {
-  int out_of_range = min(vt, first + vt - count);
-  int head_flags = 0;
-  if(out_of_range > 0) {
-    const int mask = (1<< vt) - 1;
-    head_flags = mask & (~mask>> out_of_range);
-  }
-  return head_flags;
-}
-
-MGPU_HOST_DEVICE merge_range_t compute_mergesort_frame(int partition,
-  int coop, int spacing) {
-
-  int size = spacing * (coop / 2);
-  int start = ~(coop - 1) & partition;
-  int a_begin = spacing * start;
-  int b_begin = spacing * start + size;
-
-  return merge_range_t {
-    a_begin,
-    a_begin + size,
-    b_begin,
-    b_begin + size
-  };
-}
-
-MGPU_HOST_DEVICE merge_range_t compute_mergesort_range(int count, 
-  int partition, int coop, int spacing) {
-
-  merge_range_t frame = compute_mergesort_frame(partition, coop, spacing);
-
-  return merge_range_t {
-    frame.a_begin,
-    min(count, frame.a_end),
-    min(count, frame.b_begin),
-    min(count, frame.b_end)
-  };
-}
-
-MGPU_HOST_DEVICE merge_range_t compute_mergesort_range(int count, 
-  int partition, int coop, int spacing, int mp0, int mp1) {
-
-  merge_range_t range = compute_mergesort_range(count, partition, 
-    coop, spacing);
-
-  // Locate the diagonal from the start of the A sublist.
-  int diag = spacing * partition - range.a_begin;
-
-  // The end partition of the last cta for each merge operation is computed
-  // and stored as the begin partition for the subsequent merge. i.e. it is
-  // the same partition but in the wrong coordinate system, so its 0 when it
-  // should be listSize. Correct that by checking if this is the last cta
-  // in this merge operation.
-  if(coop - 1 != ((coop - 1) & partition)) {
-    range.a_end = range.a_begin + mp1;
-    range.b_end = min(count, range.b_begin + diag + spacing - mp1);
-  }
-
-  range.a_begin = range.a_begin + mp0;
-  range.b_begin = min(count, range.b_begin + diag - mp0);
-
-  return range;
-}
-
-template<int nt, int vt, typename key_t, typename val_t>
-struct cta_sort_t {
-  enum { 
-    has_values = !std::is_same<val_t, empty_t>::value,
-    num_passes = s_log2(nt)
-  };
-
-  union storage_t {
-    key_t keys[nt * vt + 1];
-    val_t vals[nt * vt];
-  };
-
-  static_assert(is_pow2(nt), "cta_sort_t requires pow2 number of threads");
-
-  template<typename comp_t>
-  MGPU_DEVICE kv_array_t<key_t, val_t, vt> 
-  merge_pass(kv_array_t<key_t, val_t, vt> x, int tid, int count, 
-    int pass, comp_t comp, storage_t& storage) const {
-
-    // Divide the CTA's keys into lists.
-    int coop = 2<< pass;
-    merge_range_t range = compute_mergesort_range(count, tid, coop, vt);
-    int diag = vt * tid - range.a_begin;
-
-    // Store the keys into shared memory for searching.
-    reg_to_shared_thread<nt, vt>(x.keys, tid, storage.keys);
-    
-    // Search for the merge path for this thread within its list.
-    int mp = merge_path<bounds_lower>(storage.keys, range, diag, comp);
-
-    // Run a serial merge and return.
-    merge_pair_t<key_t, vt> merge = serial_merge<bounds_lower, vt>(
-      storage.keys, range.partition(mp, diag), comp);
-    x.keys = merge.keys;
-
-    if(has_values) {
-      // Reorder values through shared memory.
-      reg_to_shared_thread<nt, vt>(x.vals, tid, storage.vals);
-      x.vals = shared_gather<nt, vt>(storage.vals, merge.indices);
-    }
-
-    return x;
-  }
-
-  template<typename comp_t>
-  MGPU_DEVICE kv_array_t<key_t, val_t, vt> 
-  block_sort(kv_array_t<key_t, val_t, vt> x, int tid, int count,
-    comp_t comp, storage_t& storage) const {
-
-    // Sort the inputs within each thread. If any threads have fewer than
-    // vt items, use the segmented sort network to prevent out-of-range
-    // elements from contaminating the sort.
-    if(count < nt * vt) {
-      int head_flags = out_of_range_flags(vt * tid, vt, count);
-      x = odd_even_sort(x, comp, head_flags);
-    } else
-      x = odd_even_sort(x, comp);
-
-    // Merge threads starting with a pair until all values are merged.
-    for(int pass = 0; pass < num_passes; ++pass)
-      x = merge_pass(x, tid, count, pass, comp, storage);
-    
-    return x;
-  }
-};
-
-
-END_MGPU_NAMESPACE
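
// A host-side check of compute_mergesort_frame's index math, kept as a worked
// example (illustrative only). With spacing = 4 items per list and coop = 2,
// each pair of adjacent 4-item lists forms one merge frame.
#include <cstdio>

struct frame_t { int a_begin, a_end, b_begin, b_end; };

inline frame_t mergesort_frame(int partition, int coop, int spacing) {
  int size = spacing * (coop / 2);
  int start = ~(coop - 1) & partition;   // round down to a multiple of coop
  int a_begin = spacing * start;
  return { a_begin, a_begin + size, a_begin + size, a_begin + 2 * size };
}

inline void mergesort_frame_demo() {
  for(int tid = 0; tid < 4; ++tid) {
    frame_t f = mergesort_frame(tid, 2, 4);
    // tid 0,1 -> a=[0,4) b=[4,8); tid 2,3 -> a=[8,12) b=[12,16)
    printf("tid %d: a=[%d,%d) b=[%d,%d)\n", tid, f.a_begin, f.a_end,
      f.b_begin, f.b_end);
  }
}
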
diff --git a/src/util/cuda/moderngpu/cta_reduce.hxx b/src/util/cuda/moderngpu/cta_reduce.hxx
deleted file mode 100644
index 0b377c62..00000000
--- a/src/util/cuda/moderngpu/cta_reduce.hxx
+++ /dev/null
@@ -1,134 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-#include "loadstore.hxx"
-#include "intrinsics.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-// requires __CUDA_ARCH__ >= 300.
-// group_size can be any power-of-two <= warp_size.
-// shfl_reduce_t returns the reduction only in lane 0.
-template<typename type_t, int group_size>
-struct shfl_reduce_t {
- 
-  static_assert(group_size <= warp_size && is_pow2(group_size),
-    "shfl_reduce_t must operate on a pow2 number of threads <= warp_size (32)");
-  enum { num_passes = s_log2(group_size) };
-
-  template<typename op_t = plus_t<type_t> >
-  MGPU_DEVICE type_t reduce(int lane, type_t x, int count, op_t op = op_t()) {
-    if(count == group_size) { 
-      iterate<num_passes>([&](int pass) {
-        int offset = 1<< pass;
-        x = shfl_down_op(x, offset, op, group_size);
-      });
-    } else {
-      iterate<num_passes>([&](int pass) {
-        int offset = 1<< pass;
-        type_t y = shfl_down(x, offset, group_size);
-        if(lane + offset < count) x = op(x, y);
-      });
-    }
-    return x;
-  }
-};
-
-// cta_reduce_t returns the reduction of all inputs to every thread when
-// all_return is true (the default); with all_return = false, only thread 0
-// is guaranteed to receive the reduction, which saves the final broadcast.
-
-template<int nt, typename type_t>
-struct cta_reduce_t {
-
-  enum { 
-    group_size = min(nt, (int)warp_size), 
-    num_passes = s_log2(group_size),
-    num_items = nt / group_size 
-  };
-
-  static_assert(0 == nt % warp_size, 
-    "cta_reduce_t requires num threads to be a multiple of warp_size (32)");
-
-  struct storage_t {
-    struct { type_t data[max(nt, 2 * group_size)]; };
-  };
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-
-  typedef shfl_reduce_t<type_t, group_size> group_reduce_t;
-
-  template<typename op_t = plus_t<type_t> >
-  MGPU_DEVICE type_t reduce(int tid, type_t x, storage_t& storage, 
-    int count = nt, op_t op = op_t(), bool all_return = true) const {
-
-    // Store your data into shared memory.
-    storage.data[tid] = x;
-    __syncthreads();
-
-    if(tid < group_size) {
-      // Each thread scans within its lane.
-      strided_iterate<group_size, num_items>([&](int i, int j) {
-        if(i > 0) x = op(x, storage.data[j]);
-      }, tid, count);
-
-      // Cooperative reduction.
-      x = group_reduce_t().reduce(tid, x, min(count, (int)group_size), op);
-
-      if(all_return) storage.data[tid] = x;
-    }
-    __syncthreads();
-
-    if(all_return) {
-      x = storage.data[0];
-      __syncthreads();
-    }
-    return x;
-  }
-
-#else
-
-  template<typename op_t = plus_t<type_t> >
-  MGPU_DEVICE type_t reduce(int tid, type_t x, storage_t& storage, 
-    int count = nt, op_t op = op_t(), bool all_return = true) const {
-
-    // Store your data into shared memory.
-    storage.data[tid] = x;
-    __syncthreads();
-
-    if(tid < group_size) {
-      // Each thread scans within its lane.
-      strided_iterate<group_size, num_items>([&](int i, int j) {
-        type_t y = storage.data[j];
-        if(i > 0) x = op(x, y);
-      }, tid, count);
-      storage.data[tid] = x;
-    }
-    __syncthreads();
-
-    int count2 = min(count, int(group_size));
-    int first = (1 & num_passes) ? group_size : 0;
-    if(tid < group_size)
-      storage.data[first + tid] = x;
-    __syncthreads();
-
-    iterate<num_passes>([&](int pass) {
-      if(tid < group_size) {
-        int offset = 1 << pass;
-        if(tid + offset < count2) 
-          x = op(x, storage.data[first + offset + tid]);
-        first = group_size - first;
-        storage.data[first + tid] = x;
-      }
-      __syncthreads();
-    });
-
-    if(all_return) {
-      x = storage.data[0];
-      __syncthreads();
-    }
-    return x;
-  }
-
-#endif
-};
-
-END_MGPU_NAMESPACE
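
// A CUB counterpart of cta_reduce_t is BlockReduce. A minimal sketch
// (kernel and names are illustrative, not this patch's call sites); like
// reduce() with all_return = false, only thread 0 receives the total.
#include <cub/block/block_reduce.cuh>

template<int NT>
__global__ void block_sum_kernel(const int* in, int* out, int count) {
  typedef cub::BlockReduce<int, NT> reduce_t;
  __shared__ typename reduce_t::TempStorage temp;

  int index = NT * blockIdx.x + threadIdx.x;
  int x = (index < count) ? in[index] : 0;

  int total = reduce_t(temp).Sum(x);
  if(0 == threadIdx.x) out[blockIdx.x] = total;
}
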
diff --git a/src/util/cuda/moderngpu/cta_scan.hxx b/src/util/cuda/moderngpu/cta_scan.hxx
deleted file mode 100644
index f690157e..00000000
--- a/src/util/cuda/moderngpu/cta_scan.hxx
+++ /dev/null
@@ -1,231 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-#include "loadstore.hxx"
-#include "intrinsics.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-enum scan_type_t {
-  scan_type_exc,
-  scan_type_inc
-};
-
-template<typename type_t, int vt = 0, bool is_array = (vt > 0)>
-struct scan_result_t {
-  type_t scan;
-  type_t reduction;
-};
-
-template<typename type_t, int vt>
-struct scan_result_t<type_t, vt, true> {
-  array_t<type_t, vt> scan;
-  type_t reduction;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-
-template<int nt, typename type_t>
-struct cta_scan_t {
-  enum { num_warps = nt / warp_size, capacity = nt + num_warps };
-  union storage_t {
-    type_t data[2 * nt];
-    struct { type_t threads[nt], warps[num_warps]; };
-  };
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300  
-
-  //////////////////////////////////////////////////////////////////////////////
-  // Optimized CTA scan code that uses warp shfl intrinsics.
-  // Shfl is used for all data types, not just 4-byte built-in types; the
-  // 4-byte built-in types, however, have accelerated plus, maximum and
-  // minimum operators.
-
-  template<typename op_t = plus_t<type_t> >
-  MGPU_DEVICE scan_result_t<type_t>
-  scan(int tid, type_t x, storage_t& storage, int count = nt, op_t op = op_t(), 
-    type_t init = type_t(), scan_type_t type = scan_type_exc) const {
-
-    int warp = tid / warp_size;
-
-    // Scan each warp using shfl_add.
-    type_t warp_scan = x;
-    iterate<s_log2(warp_size)>([&](int pass) {
-      warp_scan = shfl_up_op(warp_scan, 1<< pass, op, warp_size);
-    });
-
-    // Store the intra-warp scans.
-    storage.threads[tid] = warp_scan;
-
-    // Store the reduction (last element) of each warp into storage.
-    if(min(warp_size * (warp + 1), count) - 1 == tid)
-      storage.warps[warp] = warp_scan;
-    __syncthreads();
-
-    // Scan the warp reductions.
-    if(tid < num_warps) { 
-      type_t cta_scan = storage.warps[tid];
-      iterate<s_log2(num_warps)>([&](int pass) {
-        cta_scan = shfl_up_op(cta_scan, 1<< pass, op, num_warps);
-      });
-      storage.warps[tid] = cta_scan;
-    }
-    __syncthreads();
-
-    type_t scan = warp_scan;
-    if(scan_type_exc == type) {
-      scan = tid ? storage.threads[tid - 1] : init;
-      warp = (tid - 1) / warp_size;
-    }
-    if(warp > 0) scan = op(scan, storage.warps[warp - 1]);
-
-    type_t reduction = storage.warps[div_up(count, warp_size) - 1];
-    
-    scan_result_t<type_t> result { 
-      tid < count ? scan : reduction, 
-      reduction 
-    };
-    __syncthreads();
-
-    return result;
-  }
-
-#else
-
-  //////////////////////////////////////////////////////////////////////////////
-  // Standard CTA scan code that does not use shfl intrinsics. 
-
-  template<typename op_t = plus_t<type_t> >
-  MGPU_DEVICE scan_result_t<type_t> 
-  scan(int tid, type_t x, storage_t& storage, int count = nt, op_t op = op_t(), 
-    type_t init = type_t(), scan_type_t type = scan_type_exc) const {
-
-    int first = 0;
-    storage.data[first + tid] = x;
-    __syncthreads();
-
-    iterate<s_log2(nt)>([&](int pass) {
-      int offset = 1<< pass;
-      if(tid >= offset)
-        x = op(storage.data[first + tid - offset], x);
-      first = nt - first;
-      storage.data[first + tid] = x;
-      __syncthreads();
-    });
-
-    scan_result_t<type_t> result;
-    result.reduction = storage.data[first + count - 1];
-    result.scan = (tid < count) ? 
-      (scan_type_inc == type ? x :
-        (tid ? storage.data[first + tid - 1] : init)) :
-      result.reduction;
-    __syncthreads();
-
-    return result;
-  }
-
-#endif  
-
-  //////////////////////////////////////////////////////////////////////////////
-  // CTA vectorized scan. Accepts multiple values per thread and adds in 
-  // optional global carry-in.
-
-  template<int vt, typename op_t = plus_t<type_t> >
-  MGPU_DEVICE scan_result_t<type_t, vt>
-  scan(int tid, array_t<type_t, vt> x, storage_t& storage, 
-    type_t carry_in = type_t(), bool use_carry_in = false, 
-    int count = nt, op_t op = op_t(), type_t init = type_t(),
-    scan_type_t type = scan_type_exc) const {
-
-    // Start with an inclusive scan of the in-range elements.
-    if(count >= nt * vt) {
-      iterate<vt>([&](int i) {
-        x[i] = i ? op(x[i], x[i - 1]) : x[i];
-      });
-    } else {
-      iterate<vt>([&](int i) {
-        int index = vt * tid + i;
-        x[i] = i ? 
-          ((index < count) ? op(x[i], x[i - 1]) : x[i - 1]) :
-          ((index < count) ? x[i] : init);
-      });
-    }
-
-    // Scan the thread-local reductions for a carry-in for each thread.
-    scan_result_t<type_t> result = scan(tid, x[vt - 1], storage, 
-      div_up(count, vt), op, init, scan_type_exc);
-
-    // Perform the scan downsweep and add both the global carry-in and the
-    // thread carry-in to the values.
-    if(use_carry_in) {
-      result.reduction = op(carry_in, result.reduction);
-      result.scan = tid ? op(carry_in, result.scan) : carry_in;
-    } else
-      use_carry_in = tid > 0;
-
-    array_t<type_t, vt> y;
-    iterate<vt>([&](int i) {
-      if(scan_type_exc == type) {
-        y[i] = i ? x[i - 1] : result.scan;
-        if(use_carry_in && i > 0) y[i] = op(result.scan, y[i]);
-      } else {
-        y[i] = use_carry_in ? op(x[i], result.scan) : x[i];
-      }
-    });
-
-    return scan_result_t<type_t, vt> { y, result.reduction };
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// Overload for scan of bools.
-
-template<int nt>
-struct cta_scan_t<nt, bool> {
-  enum { num_warps = nt / warp_size };
-  struct storage_t {
-    int warps[num_warps];
-  };
-
-  MGPU_DEVICE scan_result_t<int> scan(int tid, bool x, 
-    storage_t& storage) const {
-
-    // Store the bit totals for each warp.
-    int lane = (warp_size - 1) & tid;
-    int warp = tid / warp_size;
-
-    int bits = ballot(x);
-    storage.warps[warp] = popc(bits);
-    __syncthreads();
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-    if(tid < num_warps) {
-      // Cooperative warp scan of partial reductions.
-      int scan = storage.warps[tid];
-      iterate<s_log2(num_warps)>([&](int i) {
-        scan = shfl_up_op(scan, 1<< i, plus_t<int>(), num_warps);
-      });
-      storage.warps[tid] = scan;
-    }
-    __syncthreads();
-#else
-    
-    if(0 == tid) {
-      // Inclusive scan of partial reductions.
-      int scan = 0;
-      iterate<num_warps>([&](int i) {
-        storage.warps[i] = scan += storage.warps[i];
-      });
-    }
-    __syncthreads();
-
-#endif    
-
-    int scan = ((warp > 0) ? storage.warps[warp - 1] : 0) +
-      popc(bfe(bits, 0, lane));
-    int reduction = storage.warps[num_warps - 1];
-    __syncthreads();
-
-    return scan_result_t<int> { scan, reduction };
-  }
-};
-
-END_MGPU_NAMESPACE
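
// A CUB counterpart of cta_scan_t is BlockScan. A minimal sketch of an
// exclusive sum that also surfaces the block-wide total, mirroring
// scan_result_t's scan/reduction pair (illustrative kernel, not this patch's
// call sites).
#include <cub/block/block_scan.cuh>

template<int NT>
__global__ void block_exclusive_sum_kernel(const int* in, int* out,
  int* block_totals) {

  typedef cub::BlockScan<int, NT> scan_t;
  __shared__ typename scan_t::TempStorage temp;

  int index = NT * blockIdx.x + threadIdx.x;
  int x = in[index];

  int reduction;
  scan_t(temp).ExclusiveSum(x, x, reduction);   // reduction = block total

  out[index] = x;
  if(0 == threadIdx.x) block_totals[blockIdx.x] = reduction;
}
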
diff --git a/src/util/cuda/moderngpu/cta_search.hxx b/src/util/cuda/moderngpu/cta_search.hxx
deleted file mode 100644
index 8ff23f49..00000000
--- a/src/util/cuda/moderngpu/cta_search.hxx
+++ /dev/null
@@ -1,100 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "cta_merge.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-template<bounds_t bounds, typename keys_it, typename int_t, typename key_t, 
-  typename comp_t>
-MGPU_HOST_DEVICE int_t binary_search(keys_it keys, int_t count, key_t key,
-  comp_t comp) {
-
-  int_t begin = 0;
-  int_t end = count;
-  while(begin < end) {
-    int_t mid = (begin + end) / 2;
-    key_t key2 = keys[mid];
-    bool pred = (bounds_upper == bounds) ? 
-      !comp(key, key2) :
-      comp(key2, key);
-    if(pred) begin = mid + 1;
-    else end = mid;
-  }
-  return begin;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// TODO: Implement a moderngpu V1 style vectorized sorted search.
-
-template<typename type_t, int vt>
-struct search_result_t {
-  array_t<type_t, vt> keys;
-  array_t<int, vt> indices;
-  int decisions;              // Set a bit if this iteration has progressed A.
-  int matches_a;              // A set flag for a match on each iteration.
-  int matches_b;
-};
-
-template<int vt, bounds_t bounds, bool range_check, typename type_t, 
-  typename comp_t>
-MGPU_DEVICE search_result_t<type_t, vt> 
-serial_search(const type_t* keys_shared, merge_range_t range,
-  int a_offset, int b_offset, comp_t comp, bool sync = true) {
-
-  type_t a_key = keys_shared[range.a_begin];
-  type_t b_key = keys_shared[range.b_begin];
-  type_t a_prev = type_t(), b_prev = type_t();
-
-  int a_start = 0;
-  int b_start = range.a_end;    // Assume the b_keys start right after the end
-                                // of the a_keys.
-  if(range.a_begin > 0) a_prev = keys_shared[range.a_begin - 1];
-  if(range.b_begin > b_start) b_prev = keys_shared[range.b_begin - 1];
-
-  search_result_t<type_t, vt> result = search_result_t<type_t, vt>();
-
-  iterate<vt>([&](int i) {
-    // This is almost the same body as serial_merge, except for the match
-    // criterion below.
-    bool p = merge_predicate<bounds, range_check>(a_key, b_key, range, comp);
-
-    if(p) {
-      bool match = (bounds_upper == bounds) ?
-        (!range_check || range.b_begin > b_start) && 
-          !comp(b_prev, a_key) :
-        (!range_check || range.b_valid()) && 
-          !comp(a_key, b_key);
-
-      result.decisions |= 1<< i;
-      result.matches_a |= (int)match<< i;
-      a_prev = a_key;
-
-    } else {
-      bool match = (bounds_upper == bounds) ?
-        (!range_check || (range.a_valid() && range.b_valid())) && 
-          !comp(b_key, a_key) :
-        (!range_check || (range.b_valid() && range.a_begin > a_start)) && 
-          !comp(a_prev, b_key);
-
-      result.matches_b |= (int)match<< i;
-      b_prev = b_key;
-    }
-
-    // Same advancement behavior as serial_merge.
-    int index = p ? range.a_begin : range.b_begin;
-
-    result.keys[i] = p ? a_key : b_key;
-    result.indices[i] = index + (p ? a_offset : b_offset);
-
-    type_t c_key = keys_shared[++index];
-    if(p) a_key = c_key, range.a_begin = index;
-    else b_key = c_key, range.b_begin = index;
-  });
-
-  if(sync) __syncthreads();
-
-  return result;
-}
-
-END_MGPU_NAMESPACE
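
// The deleted binary_search is the standard lower/upper bound; the host
// equivalents make the bounds_t semantics concrete (illustrative values):
#include <algorithm>
#include <vector>

inline void bounds_demo() {
  std::vector<int> keys { 1, 3, 3, 7 };
  // bounds_lower: first index at which the key could be inserted.
  int lower = (int)(std::lower_bound(keys.begin(), keys.end(), 3)
    - keys.begin());   // 1
  // bounds_upper: one past the last occurrence of the key.
  int upper = (int)(std::upper_bound(keys.begin(), keys.end(), 3)
    - keys.begin());   // 3
  (void)lower; (void)upper;
}
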
diff --git a/src/util/cuda/moderngpu/cta_segscan.hxx b/src/util/cuda/moderngpu/cta_segscan.hxx
deleted file mode 100644
index e8738c5c..00000000
--- a/src/util/cuda/moderngpu/cta_segscan.hxx
+++ /dev/null
@@ -1,119 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "cta_scan.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-template<typename type_t>
-struct segscan_result_t {
-  type_t scan;
-  type_t reduction;
-  bool has_carry_in;
-  int left_lane;
-};
-
-template<int nt, typename type_t>
-struct cta_segscan_t {
-  enum { num_warps = nt / warp_size };
-
-  union storage_t {
-    int delta[num_warps + nt]; 
-    struct { type_t values[2 * nt]; int packed[nt]; };
-  };
-
-  MGPU_DEVICE int find_left_lane(int tid, bool has_head_flag, 
-    storage_t& storage) const {
-
-    int warp = tid / warp_size;
-    int lane = (warp_size - 1) & tid;
-    int warp_mask = 0xffffffff>> (31 - lane);   // inclusive search.
-    int cta_mask = 0x7fffffff>> (31 - lane);    // exclusive search.
-
-    #ifdef __HIP__
-    // Build a head flag bitfield and store it into shared memory.
-    long int warp_bits = ballot(has_head_flag);
-    storage.delta[warp] = (int)warp_bits;
-    #else
-    // Build a head flag bitfield and store it into shared memory.
-    int warp_bits = ballot(has_head_flag);
-    storage.delta[warp] = warp_bits;
-    #endif
-
-
-    __syncthreads();
-
-    if(tid < num_warps) {
-      #ifdef __HIP__
-      int cta_bits = (int)ballot(0 != storage.delta[tid]);
-      #else
-      unsigned mask = __activemask();
-      int cta_bits = ballot(0 != storage.delta[tid], mask);
-      #endif
-      int warp_segment = 31 - clz(cta_mask & cta_bits);
-      int start = (-1 != warp_segment) ?
-        (31 - clz(storage.delta[warp_segment]) + 32 * warp_segment) : 0;
-      storage.delta[num_warps + tid] = start;
-
-    }
-    __syncthreads();
-
-    // Find the closest flag to the left of this thread within the warp.
-    // Include the flag for this thread.
-    int start = 31 - clz(warp_mask & warp_bits);
-    if(-1 != start) start += ~31 & tid;
-    else start = storage.delta[num_warps + warp];
-    __syncthreads();
-
-    return start;
-  }
-
-  template<typename op_t = plus_t<type_t> >
-  MGPU_DEVICE segscan_result_t<type_t> segscan(int tid, bool has_head_flag,
-    bool has_carry_out, type_t x, storage_t& storage, type_t init = type_t(),
-    op_t op = op_t()) const {
-
-    if(!has_carry_out) x = init;
-
-    int left_lane = find_left_lane(tid, has_head_flag, storage);
-    int tid_delta = tid - left_lane;
-
-    // Store the has_carry_out flag.
-    storage.packed[tid] = (int)has_carry_out | (left_lane<< 1);
-
-    // Run an inclusive scan.
-    int first = 0;
-    storage.values[first + tid] = x;
-    __syncthreads();
-
-    int packed = storage.packed[left_lane];
-    left_lane = packed>> 1;
-    tid_delta = tid - left_lane;
-    if(0 == (1 & packed)) --tid_delta;
-
-    iterate<s_log2(nt)>([&](int pass) {
-      int offset = 1<< pass;
-      if(tid_delta >= offset)
-        x = op(x, storage.values[first + tid - offset]);
-      first = nt - first;
-      storage.values[first + tid] = x;
-      __syncthreads();
-    });
-
-    // Get the exclusive scan by fetching the preceding element. Also return
-    // the carry-out value as the total.
-    bool has_carry_in = tid ? (0 != (1 & storage.packed[tid - 1])) : false;
-
-    segscan_result_t<type_t> result { 
-      (has_carry_in && tid) ? storage.values[first + tid - 1] : init,
-      storage.values[first + nt - 1],
-      has_carry_in,
-      left_lane
-    };
-    __syncthreads();
-
-    return result;
-  }
-};
-
-END_MGPU_NAMESPACE
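
// What cta_segscan_t computes, reduced to a serial host sketch: an exclusive
// scan that restarts at every head flag (illustrative only; the CTA version
// also reports the carry-in and the left-most flag per thread).
#include <vector>

inline std::vector<int> segscan_host(const std::vector<int>& x,
  const std::vector<bool>& head_flags) {

  std::vector<int> out(x.size());
  int carry = 0;
  for(size_t i = 0; i < x.size(); ++i) {
    if(head_flags[i]) carry = 0;   // a new segment begins: drop the carry-in
    out[i] = carry;                // exclusive: value before adding x[i]
    carry += x[i];
  }
  return out;
}
// segscan_host({1, 1, 1, 1}, {true, false, true, false}) == {0, 1, 0, 1}
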
diff --git a/src/util/cuda/moderngpu/cta_segsort.hxx b/src/util/cuda/moderngpu/cta_segsort.hxx
deleted file mode 100644
index 3e75791b..00000000
--- a/src/util/cuda/moderngpu/cta_segsort.hxx
+++ /dev/null
@@ -1,226 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "cta_mergesort.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-template<typename keys_it, typename comp_t>
-MGPU_HOST_DEVICE int segmented_merge_path(keys_it keys, merge_range_t range,
-  range_t active, int diag, comp_t comp) {
-
-  // Consider a rectangle defined by range.
-  // Now consider a sub-rectangle at the top-right corner defined by
-  // active. We want to run the merge path only within this corner part.
-
-  // If the cross-diagonal does not intersect our corner, return immediately.
-  if(range.a_begin + diag <= active.begin) return diag;
-  if(range.a_begin + diag >= active.end) return range.a_count();
-
-  // Call merge_path on the corner domain.
-  active.begin = max(active.begin, range.a_begin);
-  active.end = min(active.end, range.b_end);
-
-  merge_range_t active_range = { 
-    active.begin, range.a_end, 
-    range.b_begin, active.end 
-  };
-
-  int active_offset = active.begin - range.a_begin;
-  int p = merge_path<bounds_lower>(keys, active_range, 
-    diag - active_offset, comp);
-
-  return p + active_offset;
-}
-
-template<int vt, typename type_t, typename comp_t>
-MGPU_DEVICE merge_pair_t<type_t, vt> segmented_serial_merge(
-  const type_t* keys_shared, merge_range_t range, range_t active, 
-  comp_t comp, bool sync = true) {
-
-  range.b_end = min(active.end, range.b_end);
-
-  type_t a_key = keys_shared[range.a_begin];
-  type_t b_key = keys_shared[range.b_begin];
-
-  merge_pair_t<type_t, vt> merge_pair;
-  iterate<vt>([&](int i) {
-    bool p;
-    if(range.a_begin >= range.a_end) 
-      // If A has run out of inputs, emit B.
-      p = false;
-    else if(range.b_begin >= range.b_end || range.a_begin < active.begin)
-      // B has hit the end of the middle segment.
-      // Emit A if A has inputs remaining in the middle segment.
-      p = true;
-    else 
-      // Emit the smaller element in the middle segment.
-      p = !comp(b_key, a_key);
-
-    int index = p ? range.a_begin : range.b_begin;
-    merge_pair.keys[i] = p ? a_key : b_key;
-    merge_pair.indices[i] = index;
-
-    type_t c_key = keys_shared[++index];
-    if(p) a_key = c_key, range.a_begin = index;
-    else b_key = c_key, range.b_begin = index;
-  });
-  
-  if(sync) __syncthreads();
-  return merge_pair;
-}
-
-template<int nt, int vt>
-struct cta_load_head_flags {
-  enum { 
-    nv = nt * vt, 
-
-    // Store each flag in a byte; there are 4 bytes in a word, and threads
-    // cooperatively reset these.
-    words_per_thread = div_up(vt, 32 / 8)
-  };
-
-  union storage_t {
-    char flags[nv];
-    int words[nt * words_per_thread];
-  };
-
-  template<typename seg_it>
-  MGPU_DEVICE int load(seg_it segments, const int* partitions_global,
-    int tid, int cta, int count, storage_t& storage) {
-
-    int mp0 = partitions_global[0];
-    int mp1 = partitions_global[1];
-    int gid = nv * cta;
-    count -= gid;
-
-    // Set the head flags for out-of-range keys.
-    int head_flags = out_of_range_flags(vt * tid, vt, count);
-
-    if(mp1 > mp0) {
-      // Clear the flag bytes, then loop through the indices and poke in
-      // flag bytes.
-      iterate<words_per_thread>([&](int i) { 
-        storage.words[nt * i + tid] = 0;
-      });
-      __syncthreads();
-
-      for(int index = mp0 + tid; index < mp1; index += nt)
-        storage.flags[segments[index] - gid] = 1;
-      __syncthreads();
-
-      // Combine all the head flags for this thread.
-      int first = vt * tid;
-      int offset = first / 4;
-      int prev = storage.words[offset];
-      int mask = 0x3210 + 0x1111 * (3 & first);
-      iterate<words_per_thread>([&](int i) {
-        int next = storage.words[offset + 1 + i];
-        int x = prmt(prev, next, mask);
-        prev = next;
-
-        // Set the head flag bits.
-        if(0x00000001 & x) head_flags |= 1<< (4 * i + 0);
-        if(0x00000100 & x) head_flags |= 1<< (4 * i + 1);
-        if(0x00010000 & x) head_flags |= 1<< (4 * i + 2);
-        if(0x01000000 & x) head_flags |= 1<< (4 * i + 3);
-      });
-      head_flags &= (1<< vt) - 1;
-      __syncthreads();
-    }
-
-    return head_flags;
-  }
-};
-
-template<int nt, int vt, typename key_t, typename val_t>
-struct cta_segsort_t {
-  enum { 
-    nv = nt * vt,
-    has_values = !std::is_same<val_t, empty_t>::value,
-    num_passes = s_log2(nt)
-  };
-
-  struct storage_t {
-    union {
-      key_t keys[nt * vt];
-      val_t vals[nt * vt];
-    };
-    int ranges[nt];
-  };
-
-  static_assert(is_pow2(nt), "cta_segsort_t requires pow2 number of threads");
-
-  template<typename comp_t>
-  MGPU_DEVICE kv_array_t<key_t, val_t, vt>
-  merge_pass(kv_array_t<key_t, val_t, vt> x, int tid, int count, 
-    int pass, range_t& active, comp_t comp, storage_t& storage) const {
-
-    int coop = 2<< pass;
-    merge_range_t range = compute_mergesort_range(count, tid, coop, vt);
-
-    int list = tid>> pass;
-
-    int list_parity = 1 & list;
-    int diag = vt * tid - range.a_begin;
-
-    // Fetch the active range of the sibling list that this thread's list is
-    // merging with.
-    int sibling_range = storage.ranges[1 ^ list];
-    range_t sibling { 0x0000ffff & sibling_range, sibling_range>> 16 };
-
-    // This pass does a segmented merge on ranges list and 1 ^ list.
-    // ~1 & list is the left list and 1 | list is the right list.
-    // We find the inner segments for merging, then update the active
-    // range to the outer segments for the next pass.
-    range_t left = list_parity ? sibling : active;
-    range_t right = list_parity ? active : sibling;
-    range_t inner = { left.end, right.begin };
-    active.begin = min(left.begin, right.begin);
-    active.end = max(left.end, right.end);
-
-    // Store the data from thread order into shared memory.
-    reg_to_shared_thread<nt, vt>(x.keys, tid, storage.keys);
-
-    int mp = segmented_merge_path(storage.keys, range, inner, diag, comp);
-
-    // Run a segmented serial merge.
-    merge_pair_t<key_t, vt> merge = segmented_serial_merge<vt>(storage.keys,
-      range.partition(mp, diag), inner, comp);
-
-    // Pack and store the outer range to shared memory.
-    storage.ranges[list>> 1] = (int)bfi(active.end, active.begin, 16, 16);
-    if(!has_values) __syncthreads();
-
-    x.keys = merge.keys;
-    if(has_values) {
-      // Reorder values through shared memory.
-      reg_to_shared_thread<nt, vt>(x.vals, tid, storage.vals);
-      x.vals = shared_gather<nt, vt>(storage.vals, merge.indices);
-    }
-
-    return x;
-  }
-
-  template<typename comp_t>
-  MGPU_DEVICE kv_array_t<key_t, val_t, vt> 
-  block_sort(kv_array_t<key_t, val_t, vt> x, int tid, int count,
-    int head_flags, range_t& active, comp_t comp, storage_t& storage) const {
-
-    // Sort the inputs within each thread.
-    x = odd_even_sort(x, comp, head_flags);
-
-    // Record the first and last occurrences of head flags in this segment.
-    active.begin = head_flags ? (vt * tid - 1 + ffs(head_flags)) : nv;
-    active.end = head_flags ? (vt * tid + 31 - clz(head_flags)) : -1;
-    storage.ranges[tid] = bfi(active.end, active.begin, 16, 16);
-    __syncthreads();
-
-    // Merge threads starting with a pair until all values are merged.
-    for(int pass = 0; pass < num_passes; ++pass)
-      x = merge_pass(x, tid, count, pass, active, comp, storage);
-    
-    return x;
-  }
-};
-
-END_MGPU_NAMESPACE
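
// For device-wide segmented sorting CUB offers DeviceSegmentedRadixSort; a
// minimal keys-only sketch. Note the interface difference: CUB takes
// begin/end offsets per segment, not the head-flag encoding used above.
#include <cub/device/device_segmented_radix_sort.cuh>
#include <cuda_runtime.h>

inline void segmented_sort_keys(const int* d_keys_in, int* d_keys_out,
  int num_items, int num_segments, const int* d_offsets) {

  void* d_temp = nullptr;
  size_t temp_bytes = 0;

  // First call sizes the temporary storage; the second call sorts.
  cub::DeviceSegmentedRadixSort::SortKeys(d_temp, temp_bytes, d_keys_in,
    d_keys_out, num_items, num_segments, d_offsets, d_offsets + 1);
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceSegmentedRadixSort::SortKeys(d_temp, temp_bytes, d_keys_in,
    d_keys_out, num_items, num_segments, d_offsets, d_offsets + 1);
  cudaFree(d_temp);
}
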
diff --git a/src/util/cuda/moderngpu/intrinsics.hxx b/src/util/cuda/moderngpu/intrinsics.hxx
deleted file mode 100644
index 39af8b6e..00000000
--- a/src/util/cuda/moderngpu/intrinsics.hxx
+++ /dev/null
@@ -1,363 +0,0 @@
-#pragma once
-
-#include "operators.hxx"
-
-#if !defined(__CUDACC__) && !defined(__HIP__)
-#error "You must compile this file with nvcc. You must."
-#endif
-
-BEGIN_MGPU_NAMESPACE
-
-#ifndef MEMBERMASK
-	#define MEMBERMASK 0xffffffff
-#endif
-
-#if (__CUDACC_VER_MAJOR__ >= 9 && defined(__CUDA_ARCH__) && \
-     __CUDA_ARCH__ >= 300) && !defined(USE_SHFL_SYNC)
-  #define USE_SHFL_SYNC
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// ballot, brev, popc, clz, bfe, bfi, prmt
-
-// ballot
-
-MGPU_HOST_DEVICE unsigned ballot(int predicate, unsigned mask=MEMBERMASK) {
-  unsigned y = 0;
-#ifdef USE_SHFL_SYNC
-	y = __ballot_sync(mask, predicate);
-#else
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-	y = __ballot(predicate);
-#endif
-#endif
-  return y;
-}
-
-// Reverse the bits in an integer.
-MGPU_HOST_DEVICE unsigned brev(unsigned x) { 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
-  unsigned y = __brev(x);
-#else
-  unsigned y = 0;
-  for(int i = 0; i < 32; ++i)
-    y |= (1 & (x>> i))<< (31 - i);
-#endif
-  return y;
-}
-
-// Count number of bits in a register.
-MGPU_HOST_DEVICE int popc(unsigned x) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
-  return __popc(x);
-#else
-  int c;
-  for(c = 0; x; ++c)
-    x &= x - 1;
-  return c;
-#endif
-}
-
-// Count leading zeros - start from most significant bit.
-MGPU_HOST_DEVICE int clz(int x) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
-  return __clz(x);
-#else
-  for(int i = 31; i >= 0; --i)
-    if((1<< i) & x) return 31 - i;
-  return 32;
-#endif
-}
-
-// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0.
-MGPU_HOST_DEVICE int ffs(int x) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
-  return __ffs(x);
-#else
-  for(int i = 0; i < 32; ++i)
-    if((1<< i) & x) return i + 1;
-  return 0;
-#endif
-}
-
-MGPU_HOST_DEVICE unsigned bfe(unsigned x, unsigned bit, unsigned num_bits) {
-  unsigned result;
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
-  asm("bfe.u32 %0, %1, %2, %3;" : 
-    "=r"(result) : "r"(x), "r"(bit), "r"(num_bits));
-#else
-  result = ((1<< num_bits) - 1) & (x>> bit);
-#endif
-  return result;
-}
-
-MGPU_HOST_DEVICE unsigned bfi(unsigned x, unsigned y, unsigned bit, 
-  unsigned num_bits) {
-  unsigned result;
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
-  asm("bfi.b32 %0, %1, %2, %3, %4;" : 
-    "=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(num_bits));
-#else
-  if(bit + num_bits > 32) num_bits = 32 - bit;
-  unsigned mask = ((1<< num_bits) - 1)<< bit;
-  result = y & ~mask;
-  result |= mask & (x<< bit);
-#endif
-  return result;
-}
-
-MGPU_HOST_DEVICE unsigned prmt(unsigned a, unsigned b, unsigned index) {
-  unsigned result;
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
-  asm("prmt.b32 %0, %1, %2, %3;" : "=r"(result) : "r"(a), "r"(b), "r"(index));
-#else
-  result = 0;
-  for(int i = 0; i < 4; ++i) {
-    unsigned sel = 0xf & (index>> (4 * i));
-    unsigned x = ((7 & sel) > 3) ? b : a;
-    x = 0xff & (x>> (8 * (3 & sel)));
-    if(8 & sel) x = (128 & x) ? 0xff : 0;
-    result |= x<< (8 * i);
-  }
-#endif
-  return result;
-}
-
-// Find log2(x) and optionally round up to the next integer logarithm.
-MGPU_HOST_DEVICE int find_log2(int x, bool round_up = false) {
-  int a = 31 - clz(x);
-  if(round_up) a += !is_pow2(x);
-  return a;
-} 
-
-////////////////////////////////////////////////////////////////////////////////
-// Divide operators.
-
-MGPU_HOST_DEVICE int mulhi(int a, int b) {
-#ifdef __CUDA_ARCH__
-  return __mulhi(a, b);
-#else
-  union {
-    int64_t x;
-    struct { int low, high; };
-  } product;
-  product.x = (int64_t)a * b;
-  return product.high;
-#endif
-}
-
-MGPU_HOST_DEVICE unsigned umulhi(unsigned a, unsigned b) {
-#ifdef __CUDA_ARCH__
-  return __mulhi(a, b);
-#else
-  union {
-    uint64_t x;
-    struct { unsigned low, high; };
-  } product;
-  product.x = (uint64_t)a * b;
-  return product.high; 
-#endif  
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Wrappers around PTX shfl_up and shfl_down.
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-
-template<typename type_t>
-MGPU_DEVICE type_t shfl_up(type_t x, int offset, int width = warp_size) { 
-  enum { num_words = div_up(sizeof(type_t), sizeof(int)) };
-  union {
-    int x[num_words];
-    type_t t;
-  } u;
-  u.t = x;
-
-  iterate<num_words>([&](int i) {
-    #ifdef USE_SHFL_SYNC
-    if (i < width) {
-      unsigned mask = __activemask();
-      u.x[i] = __shfl_up_sync(mask, u.x[i], offset);
-    }
-    #else
-    u.x[i] = __shfl_up(u.x[i], offset, width);
-    #endif
-  });
-  return u.t;
-}
-
-template<typename type_t>
-MGPU_DEVICE type_t shfl_down(type_t x, int offset, int width = warp_size) { 
-  enum { num_words = div_up(sizeof(type_t), sizeof(int)) };
-  union {
-    int x[num_words];
-    type_t t;
-  } u;
-  u.t = x;
-
-  iterate<num_words>([&](int i) {
-    #ifdef USE_SHFL_SYNC
-    if (i < width) {
-      unsigned mask = __activemask();
-      u.x[i] = __shfl_down_sync(mask, u.x[i], offset);
-    }
-    #else
-    u.x[i] = __shfl_down(u.x[i], offset, width);
-    #endif
-  });
-  return u.t;
-}
-
-template<typename type_t, typename op_t> 
-MGPU_DEVICE type_t shfl_up_op(type_t x, int offset, op_t op, 
-  int width = warp_size) {
-
-  type_t y = shfl_up(x, offset, width);
-  int lane = (width - 1) & threadIdx.x;
-  if(lane >= offset) x = op(x, y);
-  return x;
-}
-
-template<typename type_t, typename op_t> 
-MGPU_DEVICE type_t shfl_down_op(type_t x, int offset, op_t op, 
-  int width = warp_size) {
-
-  type_t y = shfl_down(x, offset, width);
-  int lane = (width - 1) & threadIdx.x;
-  if(lane < width - offset) x = op(x, y);
-  return x;
-}
-
-#ifdef USE_SHFL_SYNC
-#define SHFL_OP_MACRO(dir, is_up, ptx_type, r, c_type, ptx_op, c_op) \
-MGPU_DEVICE inline c_type shfl_##dir##_op(c_type x, int offset, \
-  c_op<c_type> op, int width = warp_size) { \
-  c_type result = x; \
-  int mask = (warp_size - width)<< 8 | (is_up ? 0 : (width - 1)); \
-  int lane = threadIdx.x & (warp_size - 1); \
-  if (lane < width) { \
-  unsigned threadmask = __activemask(); \
-  asm( \
-    "{.reg ."#ptx_type" r0;" \
-    ".reg .pred p;" \
-    "shfl.sync."#dir".b32 r0|p, %1, %2, %3, %4;" \
-    "@p "#ptx_op"."#ptx_type" r0, r0, %5;" \
-    "mov."#ptx_type" %0, r0; }" \
-    : "="#r(result) : #r(x), "r"(offset), "r"(mask), "r"(threadmask), #r(x)); \
-  } \
-  return result; \
-}
-#else
-#define SHFL_OP_MACRO(dir, is_up, ptx_type, r, c_type, ptx_op, c_op) \
-MGPU_DEVICE inline c_type shfl_##dir##_op(c_type x, int offset, \
-  c_op<c_type> op, int width = warp_size) { \
-  c_type result = c_type(); \
-  int mask = (warp_size - width)<< 8 | (is_up ? 0 : (width - 1)); \
-  asm( \
-    "{.reg ."#ptx_type" r0;" \
-    ".reg .pred p;" \
-    "shfl."#dir".b32 r0|p, %1, %2, %3;" \
-    "@p "#ptx_op"."#ptx_type" r0, r0, %4;" \
-    "mov."#ptx_type" %0, r0; }" \
-    : "="#r(result) : #r(x), "r"(offset), "r"(mask), #r(x)); \
-  return result; \
-}
-#endif
-
-SHFL_OP_MACRO(up, true, s32, r, int, add, plus_t)
-SHFL_OP_MACRO(up, true, s32, r, int, max, maximum_t)
-SHFL_OP_MACRO(up, true, s32, r, int, min, minimum_t)
-SHFL_OP_MACRO(down, false, s32, r, int, add, plus_t)
-SHFL_OP_MACRO(down, false, s32, r, int, max, maximum_t)
-SHFL_OP_MACRO(down, false, s32, r, int, min, minimum_t)
-
-SHFL_OP_MACRO(up, true, u32, r, unsigned, add, plus_t)
-SHFL_OP_MACRO(up, true, u32, r, unsigned, max, maximum_t)
-SHFL_OP_MACRO(up, true, u32, r, unsigned, min, minimum_t)
-SHFL_OP_MACRO(down, false, u32, r, unsigned, add, plus_t)
-SHFL_OP_MACRO(down, false, u32, r, unsigned, max, maximum_t)
-SHFL_OP_MACRO(down, false, u32, r, unsigned, min, minimum_t)
-
-SHFL_OP_MACRO(up, true, f32, f, float, add, plus_t)
-SHFL_OP_MACRO(up, true, f32, f, float, max, maximum_t)
-SHFL_OP_MACRO(up, true, f32, f, float, min, minimum_t)
-SHFL_OP_MACRO(down, false, f32, f, float, add, plus_t)
-SHFL_OP_MACRO(down, false, f32, f, float, max, maximum_t)
-SHFL_OP_MACRO(down, false, f32, f, float, min, minimum_t)
-
-#undef SHFL_OP_MACRO
-
-#ifdef USE_SHFL_SYNC
-#define SHFL_OP_64b_MACRO(dir, is_up, ptx_type, r, c_type, ptx_op, c_op) \
-MGPU_DEVICE inline c_type shfl_##dir##_op(c_type x, int offset, \
-  c_op<c_type> op, int width = warp_size) { \
-  c_type result = x; \
-  int mask = (warp_size - width)<< 8 | (is_up ? 0 : (width - 1)); \
-  int lane = threadIdx.x & (warp_size - 1); \
-  if (lane < width) { \
-  unsigned threadmask = __activemask(); \
-  asm( \
-    "{.reg ."#ptx_type" r0;" \
-    ".reg .u32 lo;" \
-    ".reg .u32 hi;" \
-    ".reg .pred p;" \
-    "mov.b64 {lo, hi}, %1;" \
-    "shfl.sync."#dir".b32 lo|p, lo, %2, %3, %4;" \
-    "shfl.sync."#dir".b32 hi  , hi, %2, %3, %4;" \
-    "mov.b64 r0, {lo, hi};" \
-    "@p "#ptx_op"."#ptx_type" r0, r0, %5;" \
-    "mov."#ptx_type" %0, r0; }" \
-    : "="#r(result) : #r(x), "r"(offset), "r"(mask), "r"(threadmask), #r(x) \
-  ); \
-  } \
-  return result; \
-}
-#else
-#define SHFL_OP_64b_MACRO(dir, is_up, ptx_type, r, c_type, ptx_op, c_op) \
-MGPU_DEVICE inline c_type shfl_##dir##_op(c_type x, int offset, \
-  c_op<c_type> op, int width = warp_size) { \
-  c_type result = c_type(); \
-  int mask = (warp_size - width)<< 8 | (is_up ? 0 : (width - 1)); \
-  asm( \
-    "{.reg ."#ptx_type" r0;" \
-    ".reg .u32 lo;" \
-    ".reg .u32 hi;" \
-    ".reg .pred p;" \
-    "mov.b64 {lo, hi}, %1;" \
-    "shfl."#dir".b32 lo|p, lo, %2, %3;" \
-    "shfl."#dir".b32 hi  , hi, %2, %3;" \
-    "mov.b64 r0, {lo, hi};" \
-    "@p "#ptx_op"."#ptx_type" r0, r0, %4;" \
-    "mov."#ptx_type" %0, r0; }" \
-    : "="#r(result) : #r(x), "r"(offset), "r"(mask), #r(x) \
-  ); \
-  return result; \
-}
-#endif
-
-SHFL_OP_64b_MACRO(up, true, s64, l, int64_t, add, plus_t)
-SHFL_OP_64b_MACRO(up, true, s64, l, int64_t, max, maximum_t)
-SHFL_OP_64b_MACRO(up, true, s64, l, int64_t, min, minimum_t)
-SHFL_OP_64b_MACRO(down, false, s64, l, int64_t, add, plus_t)
-SHFL_OP_64b_MACRO(down, false, s64, l, int64_t, max, maximum_t)
-SHFL_OP_64b_MACRO(down, false, s64, l, int64_t, min, minimum_t)
-
-SHFL_OP_64b_MACRO(up, true, u64, l, uint64_t, add, plus_t)
-SHFL_OP_64b_MACRO(up, true, u64, l, uint64_t, max, maximum_t)
-SHFL_OP_64b_MACRO(up, true, u64, l, uint64_t, min, minimum_t)
-SHFL_OP_64b_MACRO(down, false, u64, l, uint64_t, add, plus_t)
-SHFL_OP_64b_MACRO(down, false, u64, l, uint64_t, max, maximum_t)
-SHFL_OP_64b_MACRO(down, false, u64, l, uint64_t, min, minimum_t)
-
-SHFL_OP_64b_MACRO(up, true, f64, d, double, add, plus_t)
-SHFL_OP_64b_MACRO(up, true, f64, d, double, max, maximum_t)
-SHFL_OP_64b_MACRO(up, true, f64, d, double, min, minimum_t)
-SHFL_OP_64b_MACRO(down, false, f64, d, double, add, plus_t)
-SHFL_OP_64b_MACRO(down, false, f64, d, double, max, maximum_t)
-SHFL_OP_64b_MACRO(down, false, f64, d, double, min, minimum_t)
-
-#undef SHFL_OP_64b_MACRO
-
-#endif
-
-END_MGPU_NAMESPACE
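
// On CUDA 9+ the shfl wrappers above reduce to the *_sync intrinsics; a
// minimal warp-sum sketch with a full-warp mask (assumes all 32 lanes are
// active):
__device__ inline int warp_sum(int x) {
  for(int offset = 16; offset > 0; offset /= 2)
    x += __shfl_down_sync(0xffffffffu, x, offset);
  return x;   // lane 0 ends up holding the sum of all 32 lanes
}
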
diff --git a/src/util/cuda/moderngpu/kernel_bulkinsert.hxx b/src/util/cuda/moderngpu/kernel_bulkinsert.hxx
deleted file mode 100644
index c85e8221..00000000
--- a/src/util/cuda/moderngpu/kernel_bulkinsert.hxx
+++ /dev/null
@@ -1,18 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-#include "kernel_merge.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-// Insert the values of a into the stream of b, in front of the b positions
-// identified by a_insert.
-template<typename launch_t = empty_t, typename a_it, typename insert_it, 
-  typename b_it, typename c_it>
-void bulk_insert(a_it a, insert_it a_insert, int insert_size, b_it b, 
-  int source_size, c_it c, context_t& context) {
-
-  merge<launch_t>(a_insert, a, insert_size, counting_iterator_t<int>(0), b, 
-    source_size, discard_iterator_t<int>(), c, mgpu::less_t<int>(), context);
-}
-
-END_MGPU_NAMESPACE
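
// The contract of bulk_insert in serial form: a[i] is inserted before
// b[a_insert[i]], with a_insert sorted (host sketch, illustrative only):
#include <vector>

inline std::vector<int> bulk_insert_host(const std::vector<int>& a,
  const std::vector<int>& a_insert, const std::vector<int>& b) {

  std::vector<int> c;
  size_t ai = 0;
  for(size_t bi = 0; bi <= b.size(); ++bi) {
    while(ai < a.size() && a_insert[ai] == (int)bi) c.push_back(a[ai++]);
    if(bi < b.size()) c.push_back(b[bi]);
  }
  return c;
}
// bulk_insert_host({90, 99}, {1, 3}, {10, 20, 30}) == {10, 90, 20, 30, 99}
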
diff --git a/src/util/cuda/moderngpu/kernel_bulkremove.hxx b/src/util/cuda/moderngpu/kernel_bulkremove.hxx
deleted file mode 100644
index 497e9867..00000000
--- a/src/util/cuda/moderngpu/kernel_bulkremove.hxx
+++ /dev/null
@@ -1,91 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-#include "search.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-template<typename launch_arg_t = empty_t,
-  typename input_it, typename indices_it, typename output_it>
-void bulk_remove(input_it input, int count, indices_it indices, 
-  int num_indices, output_it output, context_t& context) {
-
-  typedef typename conditional_typedef_t<launch_arg_t, 
-    launch_box_t<
-      arch_20_cta<128, 15>,
-      arch_35_cta<128, 11>,
-      arch_52_cta<128, 15>
-    >
-  >::type_t launch_t;
-
-  typedef typename std::iterator_traits<input_it>::value_type type_t;
-
-  // Map the removal indices into tiles.
-  mem_t<int> partitions = binary_search_partitions<bounds_lower>(indices, 
-    count, num_indices, launch_t::nv(context), context);
-  const int* p_data = partitions.data();
-
-  auto k = [=]MGPU_DEVICE(int tid, int cta) {
-    typedef typename launch_t::sm_ptx params_t; 
-    enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
-    
-    __shared__ union {
-      int indices[nv + 1];
-    } shared;
-
-    range_t tile = get_tile(cta, nv, count);
-
-    // Search the begin and end iterators to load.
-    int begin = p_data[cta];
-    int end = p_data[cta + 1]; 
-    int b_count = end - begin;
-
-    int* a_shared = shared.indices;
-    int* b_shared = shared.indices + tile.count() - b_count;
-
-    // Store the indices to shared memory.
-    // TODO: MODIFY MEM_TO_SHARED TO UNCONDITIONALLY WRITE TO FULL SMEM.
-    mem_to_shared<nt, vt>(indices + begin, tid, b_count, b_shared, false);
-
-    // Binary search into the remove array to prepare a range for the thread.
-    merge_range_t range = {
-      // a range
-      vt * tid, 
-      tile.count(), 
-      
-      // b range
-      binary_search<bounds_lower>(b_shared, b_count, 
-        tile.begin + vt * tid, less_t<int>()),
-      b_count
-    };
-
-    // Emit all values that aren't removed.
-    iterate<vt>([&](int i) {
-      bool p = range.a_valid() && (!range.b_valid() || 
-        tile.begin + range.a_begin < b_shared[range.b_begin]);
-      if(p)
-        a_shared[range.a_begin - range.b_begin] = tile.begin + range.a_begin;
-      else 
-        ++range.b_begin;
-      ++range.a_begin;
-    });
-    __syncthreads();
-
-    // Pull the gather indices out of shared memory in strided order.
-    array_t<int, vt> gather = shared_to_reg_strided<nt, vt>(
-      shared.indices, tid);
-
-    // Gather the elements from input.
-    int num_move = tile.count() - b_count;
-    array_t<type_t, vt> values;
-    strided_iterate<nt, vt, 0>([&](int i, int j) {
-      values[i] = input[gather[i]];
-    }, tid, num_move);
-
-    // Stream to output.
-    reg_to_mem_strided<nt, vt>(values, tid, num_move, 
-      output + tile.begin - begin);
-  };
-  cta_transform<launch_t>(k, count, context);
-}
-
-END_MGPU_NAMESPACE
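
// The contract of bulk_remove in serial form: drop the elements of input
// whose positions appear in the sorted indices array (host sketch,
// illustrative only):
#include <vector>

inline std::vector<int> bulk_remove_host(const std::vector<int>& input,
  const std::vector<int>& indices) {

  std::vector<int> out;
  size_t r = 0;
  for(int i = 0; i < (int)input.size(); ++i) {
    if(r < indices.size() && indices[r] == i) { ++r; continue; }   // removed
    out.push_back(input[i]);
  }
  return out;
}
// bulk_remove_host({5, 6, 7, 8}, {1, 3}) == {5, 7}
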
diff --git a/src/util/cuda/moderngpu/kernel_compact.hxx b/src/util/cuda/moderngpu/kernel_compact.hxx
deleted file mode 100644
index 774b69a2..00000000
--- a/src/util/cuda/moderngpu/kernel_compact.hxx
+++ /dev/null
@@ -1,139 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "kernel_scan.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-template<typename launch_arg_t>
-struct stream_compact_t {
-
-  typedef typename conditional_typedef_t<launch_arg_t, 
-    launch_box_t<
-      arch_20_cta<128, 11>,
-      arch_35_cta<128,  7>,
-      arch_52_cta<128, 11>
-    >
-  >::type_t launch_t;
-
-  cta_dim_t cta_dim;
-  int num_ctas;
-  int count;
-  context_t& context;
-
-  mem_t<short> bits;
-  mem_t<int> cta_offsets;
-
-public:
-  stream_compact_t(int count_, context_t& context_) : context(context_) {
-    count = count_;
-    cta_dim = launch_t::cta_dim(context);
-    num_ctas = cta_dim.num_ctas(count);
-
-    bits = mem_t<short>(num_ctas * cta_dim.nt, context);
-    cta_offsets = mem_t<int>(num_ctas, context);
-  }
-
-  // upsweep of stream compaction. 
-  // func_t implements bool operator(int index);
-  // The return value is a flag indicating that we want to *keep* the data
-  // in the compacted stream.
-  template<typename func_t>
-  int upsweep(func_t f) {
-    short* bits_data = bits.data();
-    int* cta_offsets_data = cta_offsets.data();
-    int count = this->count;
-
-    auto upsweep_k = [=]MGPU_DEVICE(int tid, int cta) {
-      typedef typename launch_t::sm_ptx params_t;
-      enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
-      typedef cta_reduce_t<nt, int> reduce_t;
-      static_assert(vt <= 16, "mgpu::stream_compact_vt must be <= 16.");
-
-      __shared__ union {
-        typename reduce_t::storage_t reduce;
-      } shared;
-
-      range_t tile = get_tile(cta, nv, count);
-      short stream_bits = 0;
-      strided_iterate<nt, vt>([&](int i, int j) {
-        int index = tile.begin + j;
-        bool stream = f(index);
-        if(stream) stream_bits |= 1<< i;
-      }, tid, tile.count());
-
-      // Reduce the values and store to global memory.
-      int total_stream = reduce_t().reduce(tid, popc(stream_bits), 
-        shared.reduce, nt, plus_t<int>(), false);
-
-      bits_data[nt * cta + tid] = stream_bits;
-      if(!tid) cta_offsets_data[cta] = total_stream;
-    };
-    cta_launch<launch_t>(upsweep_k, num_ctas, context);
-
-    // Scan reductions.
-    mem_t<int> counts_host(1, context, memory_space_host);
-    scan_event(cta_offsets_data, num_ctas, cta_offsets_data, 
-      plus_t<int>(), counts_host.data(), context, context.event());
-    cudaEventSynchronize(context.event());
-
-    // Return the total number of elements to stream.
-    int stream_total = counts_host.data()[0];
-    return stream_total;
-  }
-
-  // downsweep of stream compaction.
-  // func_t implements void operator(int dest_index, int source_index).
-  template<typename func_t>
-  void downsweep(func_t f) {
-    const short* bits_data = bits.data();
-    const int* cta_offsets_data = cta_offsets.data();
-
-    auto downsweep_k = [=]MGPU_DEVICE(int tid, int cta) {
-      typedef typename launch_t::sm_ptx params_t;
-      enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
-      typedef cta_scan_t<nt, int> scan_t;
-      __shared__ union {
-        typename scan_t::storage_t scan;
-        short indices[nv];
-      } shared;
-
-      short stream_bits = bits_data[nt * cta + tid];
-      int cta_offset = cta_offsets_data[cta];
-
-      // For each set stream_bits bit, set shared.indices to 1.
-      iterate<vt>([&](int i) {
-        shared.indices[nt * i + tid] = 0 != ((1<< i) & stream_bits);
-      });
-      __syncthreads();
-
-      // Load out the values and scan. Compact into shared.indices the 
-      // CTA-local indices of each streaming work-item.
-      array_t<short, vt> flags = shared_to_reg_thread<nt, vt>(
-        shared.indices, tid);
-      scan_result_t<int> scan = scan_t().scan(tid, reduce(flags), 
-        shared.scan);
-      iterate<vt>([&](int i) {
-        if(flags[i]) shared.indices[scan.scan++] = (short)(vt * tid + i);
-      });
-      __syncthreads();
-
-      // Call the user-supplied callback with destination and source indices.
-      for(int i = tid; i < scan.reduction; i += nt) {
-        int source_index = nv * cta + shared.indices[i];
-        int dest_index = cta_offset + i;
-        f(dest_index, source_index);
-      }
-      __syncthreads();
-    };
-    cta_launch<launch_t>(downsweep_k, num_ctas, context);
-  }
-};
-
-template<typename launch_arg_t = empty_t>
-stream_compact_t<launch_arg_t> 
-transform_compact(int count, context_t& context) {
-  return stream_compact_t<launch_arg_t>(count, context);
-}
-
-END_MGPU_NAMESPACE
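
// CUB folds this two-phase upsweep/downsweep into a single call with
// DeviceSelect::If. A minimal sketch with a hypothetical predicate (not this
// patch's call sites):
#include <cub/device/device_select.cuh>
#include <cuda_runtime.h>

struct is_positive_t {
  __device__ bool operator()(int x) const { return x > 0; }
};

inline void compact_positive(const int* d_in, int* d_out, int* d_num_selected,
  int count) {

  void* d_temp = nullptr;
  size_t temp_bytes = 0;

  cub::DeviceSelect::If(d_temp, temp_bytes, d_in, d_out, d_num_selected,
    count, is_positive_t());
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceSelect::If(d_temp, temp_bytes, d_in, d_out, d_num_selected,
    count, is_positive_t());
  cudaFree(d_temp);
}
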
diff --git a/src/util/cuda/moderngpu/kernel_intervalmove.hxx b/src/util/cuda/moderngpu/kernel_intervalmove.hxx
deleted file mode 100644
index 8a4cad20..00000000
--- a/src/util/cuda/moderngpu/kernel_intervalmove.hxx
+++ /dev/null
@@ -1,67 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-#include "kernel_load_balance.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-template<typename launch_arg_t = empty_t, typename input_it, 
-  typename segments_it, typename output_it>
-void interval_expand(input_it input, int count, segments_it segments,
-  int num_segments, output_it output, context_t& context) {
-
-  typedef typename std::iterator_traits<input_it>::value_type type_t;
-  transform_lbs<launch_arg_t>(
-    []MGPU_DEVICE(int index, int seg, int rank, tuple<type_t> desc,
-      output_it output) {
-      output[index] = get<0>(desc);
-    }, 
-    count, segments, num_segments, make_tuple(input), context, output
-  );
-}
-
-template<typename launch_arg_t = empty_t, typename input_it, 
-  typename segments_it, typename gather_it, typename output_it>
-void interval_gather(input_it input, int count, segments_it segments,
-  int num_segments, gather_it gather, output_it output, context_t& context) {
-
-  transform_lbs<launch_arg_t>(
-    []MGPU_DEVICE(int index, int seg, int rank, tuple<int> desc, 
-      input_it input, output_it output) {
-      output[index] = input[get<0>(desc) + rank];
-    }, 
-    count, segments, num_segments, make_tuple(gather), context, input, output
-  );
-}
-
-template<typename launch_arg_t = empty_t, typename input_it, 
-  typename segments_it, typename scatter_it, typename output_it>
-void interval_scatter(input_it input, int count, segments_it segments,
-  int num_segments, scatter_it scatter, output_it output, context_t& context) {
-
-  transform_lbs<launch_arg_t>(
-    []MGPU_DEVICE(int index, int seg, int rank, tuple<int> desc, 
-      input_it input, output_it output) {
-      output[get<0>(desc) + rank] = input[index];
-    }, 
-    count, segments, num_segments, make_tuple(scatter), context, input, output
-  );
-}
-
-template<typename launch_arg_t = empty_t, 
-  typename input_it, typename segments_it, typename scatter_it,
-  typename gather_it, typename output_it>
-void interval_move(input_it input, int count, segments_it segments,
-  int num_segments, scatter_it scatter, gather_it gather, output_it output, 
-  context_t& context) {
-
-  transform_lbs<launch_arg_t>(
-    []MGPU_DEVICE(int index, int seg, int rank, tuple<int, int> desc,
-      input_it input, output_it output) {
-      output[get<0>(desc) + rank] = input[get<1>(desc) + rank];
-    }, 
-    count, segments, num_segments, make_tuple(scatter, gather), context,
-    input, output
-  );
-}
-
-END_MGPU_NAMESPACE
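
// interval_expand in serial form: replicate input[seg] across segment seg,
// where segments holds exclusive-scanned segment offsets (host sketch,
// illustrative only):
#include <vector>

inline std::vector<int> interval_expand_host(const std::vector<int>& input,
  const std::vector<int>& segments, int count) {

  std::vector<int> out(count);
  for(size_t seg = 0; seg < segments.size(); ++seg) {
    int end = (seg + 1 < segments.size()) ? segments[seg + 1] : count;
    for(int i = segments[seg]; i < end; ++i) out[i] = input[seg];
  }
  return out;
}
// interval_expand_host({7, 8, 9}, {0, 2, 3}, 5) == {7, 7, 8, 9, 9}
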
diff --git a/src/util/cuda/moderngpu/kernel_join.hxx b/src/util/cuda/moderngpu/kernel_join.hxx
deleted file mode 100644
index 48ca685d..00000000
--- a/src/util/cuda/moderngpu/kernel_join.hxx
+++ /dev/null
@@ -1,50 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-#include "kernel_sortedsearch.hxx"
-#include "kernel_scan.hxx"
-#include "kernel_load_balance.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-template<typename launch_arg_t = empty_t, 
-  typename a_it, typename b_it, typename comp_t>
-mem_t<int2> inner_join(a_it a, int a_count, b_it b, int b_count, 
-  comp_t comp, context_t& context) {
-
-  // Compute lower and upper bounds of a into b.
-  mem_t<int> lower(a_count, context);
-  mem_t<int> upper(a_count, context);
-  sorted_search<bounds_lower, launch_arg_t>(a, a_count, b, b_count, 
-    lower.data(), comp, context);
-  sorted_search<bounds_upper, launch_arg_t>(a, a_count, b, b_count, 
-    upper.data(), comp, context);
-
-  // Compute output ranges by scanning upper - lower. Retrieve the reduction
-  // of the scan, which specifies the size of the output array to allocate.
-  mem_t<int> scanned_sizes(a_count, context);
-  const int* lower_data = lower.data();
-  const int* upper_data = upper.data();
-
-  mem_t<int> count(1, context);
-  transform_scan<int>([=]MGPU_DEVICE(int index) {
-    return upper_data[index] - lower_data[index];
-  }, a_count, scanned_sizes.data(), plus_t<int>(), count.data(), context);
-
-  // Allocate an int2 output array and use load-balancing search to compute
-  // the join.
-  int join_count = from_mem(count)[0];
-  mem_t<int2> output(join_count, context);
-  int2* output_data = output.data();
-
-  // Use load-balancing search on the segments. The output is a pair with
-  // a_index = seg and b_index = lower_data[seg] + rank.
-  auto k = [=]MGPU_DEVICE(int index, int seg, int rank, tuple<int> lower) {
-    output_data[index] = make_int2(seg, get<0>(lower) + rank);
-  };
-  transform_lbs<launch_arg_t>(k, join_count, scanned_sizes.data(), a_count,
-    make_tuple(lower_data), context);
-
-  return output;
-}
-
-END_MGPU_NAMESPACE
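
// inner_join's output in serial form: every (a_index, b_index) pair with
// matching keys, ordered by a_index then b_index (quadratic host sketch for
// the semantics only; the kernel computes it with sorted searches and
// load-balancing search):
#include <utility>
#include <vector>

inline std::vector<std::pair<int, int>> inner_join_host(
  const std::vector<int>& a, const std::vector<int>& b) {

  std::vector<std::pair<int, int>> out;
  for(int i = 0; i < (int)a.size(); ++i)
    for(int j = 0; j < (int)b.size(); ++j)
      if(a[i] == b[j]) out.push_back({ i, j });
  return out;
}
// inner_join_host({1, 2, 2}, {2, 3}) == {{1, 0}, {2, 0}}
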
diff --git a/src/util/cuda/moderngpu/kernel_load_balance.hxx b/src/util/cuda/moderngpu/kernel_load_balance.hxx
deleted file mode 100644
index 7a0e8459..00000000
--- a/src/util/cuda/moderngpu/kernel_load_balance.hxx
+++ /dev/null
@@ -1,88 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-#include "cta_load_balance.hxx"
-#include "search.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-template<typename launch_arg_t = empty_t, typename func_t, 
-  typename segments_it, typename pointers_t, typename... args_t>
-void transform_lbs(func_t f, int count, segments_it segments, 
-  int num_segments, pointers_t caching_iterators, context_t& context,
-  args_t... args) {
-
-  typedef typename conditional_typedef_t<launch_arg_t, 
-    launch_box_t<
-      arch_20_cta<128, 11, 9>,
-      arch_35_cta<128,  7, 5>,
-      arch_52_cta<128, 11, 9>
-    >
-  >::type_t launch_t;
-
-  typedef typename std::iterator_traits<segments_it>::value_type int_t;
-  typedef tuple_iterator_value_t<pointers_t> value_t;
-
-  mem_t<int_t> mp = load_balance_partitions(count, segments, num_segments,
-    launch_t::nv(context), context);
-  const int_t* mp_data = mp.data();
-
-  auto k = [=]MGPU_DEVICE(int tid, int cta, args_t... args) {
-
-    typedef typename launch_t::sm_ptx params_t;
-    enum { nt = params_t::nt, vt = params_t::vt, vt0 = params_t::vt0 };
-    typedef cta_load_balance_t<nt, vt> load_balance_t;
-    typedef detail::cached_segment_load_t<nt, pointers_t> cached_load_t;
-
-    __shared__ union {
-      typename load_balance_t::storage_t lbs;
-      typename cached_load_t::storage_t cached;
-    } shared;
-
-    // Compute the load-balancing search and materialize (index, seg, rank)
-    // arrays.
-    auto lbs = load_balance_t().load_balance(count, segments, num_segments,
-      tid, cta, mp_data, shared.lbs);
-
-    // Load from the cached iterators. Use the placement range, not the 
-    // merge-path range for situating the segments.
-    array_t<value_t, vt> cached_values = cached_load_t::template load<vt0>(
-      tid, lbs.merge_range.a_count(), lbs.placement.range.b_range(), 
-      lbs.segments, shared.cached, caching_iterators);
-
-    // Call the user-supplied functor f.
-    strided_iterate<nt, vt, vt0>([=](int i, int j) {
-      int index = lbs.merge_range.a_begin + j;
-      int seg = lbs.segments[i];
-      int rank = lbs.ranks[i];
-
-      f(index, seg, rank, cached_values[i], args...);
-    }, tid, lbs.merge_range.a_count());
-  };
-  cta_transform<launch_t>(k, count + num_segments, context, args...);
-}
-
-// load-balancing search without caching.
-template<typename launch_arg_t = empty_t, typename func_t, 
-  typename segments_it, typename... args_t>
-void transform_lbs(func_t f, int count, segments_it segments, 
-  int num_segments, context_t& context, args_t... args) {
-
-  transform_lbs<launch_arg_t>(
-    [=]MGPU_DEVICE(int index, int seg, int rank, tuple<>, args_t... args) {
-      f(index, seg, rank, args...);    // drop the cached values.
-    },
-    count, segments, num_segments, tuple<>(), context, args...
-  );
-}
-
-template<typename launch_arg_t = empty_t, typename segments_it,
-  typename output_it>
-void load_balance_search(int count, segments_it segments, 
-  int num_segments, output_it output, context_t& context) {
-
-  transform_lbs<launch_arg_t>([=]MGPU_DEVICE(int index, int seg, int rank) {
-    output[index] = seg;
-  }, count, segments, num_segments, context);
-}
-
-END_MGPU_NAMESPACE
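
// load_balance_search in serial form: map each work-item back to the segment
// that owns it, given exclusive-scanned segment offsets (host sketch,
// illustrative only):
#include <vector>

inline std::vector<int> lbs_host(int count, const std::vector<int>& segments) {
  std::vector<int> out(count);
  int seg = 0;
  for(int index = 0; index < count; ++index) {
    while(seg + 1 < (int)segments.size() && segments[seg + 1] <= index) ++seg;
    out[index] = seg;   // rank within the segment is index - segments[seg]
  }
  return out;
}
// lbs_host(5, {0, 2, 2, 3}) == {0, 0, 2, 3, 3}: segment 1 is empty and
// contributes no work-items.
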
diff --git a/src/util/cuda/moderngpu/kernel_merge.hxx b/src/util/cuda/moderngpu/kernel_merge.hxx
deleted file mode 100644
index 7e5cac27..00000000
--- a/src/util/cuda/moderngpu/kernel_merge.hxx
+++ /dev/null
@@ -1,92 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-#include "cta_merge.hxx"
-#include "search.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-// Key-value merge.
-template<typename launch_arg_t = empty_t,
-  typename a_keys_it, typename a_vals_it, 
-  typename b_keys_it, typename b_vals_it,
-  typename c_keys_it, typename c_vals_it, 
-  typename comp_t>
-void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count, 
-  b_keys_it b_keys, b_vals_it b_vals, int b_count,
-  c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context) {
-
-  typedef typename conditional_typedef_t<launch_arg_t, 
-    launch_box_t<
-      arch_20_cta<128, 15>,
-      arch_35_cta<128, 11>,
-      arch_52_cta<128, 15>
-    >
-  >::type_t launch_t;
-
-  typedef typename std::iterator_traits<a_keys_it>::value_type type_t;
-  typedef typename std::iterator_traits<a_vals_it>::value_type val_t;
-  enum { has_values = !std::is_same<val_t, empty_t>::value };
-
-  mem_t<int> partitions = merge_path_partitions<bounds_lower>(a_keys, a_count, 
-    b_keys, b_count, launch_t::nv(context), comp, context);
-  int* mp_data = partitions.data();
-
-  auto k = [=] MGPU_DEVICE (int tid, int cta) {
-    typedef typename launch_t::sm_ptx params_t;
-    enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
-
-    __shared__ union {
-      type_t keys[nv + 1];
-      int indices[nv];
-    } shared;
-
-    // Load the range for this CTA and merge the values into register.
-    int mp0 = mp_data[cta + 0];
-    int mp1 = mp_data[cta + 1];
-    merge_range_t range = compute_merge_range(a_count, b_count, cta, nv, 
-      mp0, mp1);
-
-// Workaround: every attempt to debug this on clang failed, but removing this check makes clang builds crash.
-#ifdef __clang__
-
-    if (range.b_end > b_count)
-    {
-      return;
-    }
-
-#endif
-
-    merge_pair_t<type_t, vt> merge = cta_merge_from_mem<bounds_lower, nt, vt>(
-      a_keys, b_keys, range, tid, comp, shared.keys);
-
-    int dest_offset = nv * cta;
-    reg_to_mem_thread<nt>(merge.keys, tid, range.total(), c_keys + dest_offset,
-      shared.keys);
-
-    if(has_values) {
-      // Transpose the indices from thread order to strided order.
-      array_t<int, vt> indices = reg_thread_to_strided<nt>(merge.indices, tid, 
-        shared.indices);
-
-      // Gather the input values and merge into the output values.
-      transfer_two_streams_strided<nt>(a_vals + range.a_begin, range.a_count(),
-        b_vals + range.b_begin, range.b_count(), indices, tid, 
-        c_vals + dest_offset);
-    }
-  };
-  cta_transform<launch_t>(k, a_count + b_count, context);
-}
-
-// Key-only merge.
-template<typename launch_t = empty_t,
-  typename a_keys_it, typename b_keys_it, typename c_keys_it,
-  typename comp_t>
-void merge(a_keys_it a_keys, int a_count, b_keys_it b_keys, int b_count,
-  c_keys_it c_keys, comp_t comp, context_t& context) {
-
-  merge<launch_t>(a_keys, (const empty_t*)nullptr, a_count, b_keys, 
-    (const empty_t*)nullptr, b_count, c_keys, (empty_t*)nullptr, comp,
-    context);
-}
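-
-// Illustrative usage sketch (assumes standard_context_t from context.hxx and
-// less_t from the mgpu operator headers; not part of the original header):
-//
-//   standard_context_t context;
-//   // a_keys and b_keys must each already be sorted under the comparator.
-//   merge(a_keys, a_count, b_keys, b_count, c_keys, less_t<int>(), context);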
-
-END_MGPU_NAMESPACE
diff --git a/src/util/cuda/moderngpu/kernel_mergesort.hxx b/src/util/cuda/moderngpu/kernel_mergesort.hxx
deleted file mode 100644
index 6d3f9459..00000000
--- a/src/util/cuda/moderngpu/kernel_mergesort.hxx
+++ /dev/null
@@ -1,150 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "transform.hxx"
-#include "kernel_merge.hxx"
-#include "cta_mergesort.hxx"
-#include "intrinsics.hxx"
-
-BEGIN_MGPU_NAMESPACE
-  
-template<typename keys_it, typename comp_t>
-mem_t<int> merge_sort_partitions(keys_it keys, int count, int coop, 
-  int spacing, comp_t comp, context_t& context) {
-
-  int num_partitions = div_up(count, spacing) + 1;
-  auto k = [=]MGPU_DEVICE(int index) {
-    merge_range_t range = compute_mergesort_range(count, index, coop, spacing);
-    int diag = min(spacing * index, count) - range.a_begin;
-    return merge_path<bounds_lower>(keys + range.a_begin, range.a_count(), 
-      keys + range.b_begin, range.b_count(), diag, comp);
-  };
-
-  return fill_function<int>(k, num_partitions, context);
-}
-
-// Key-value mergesort.
-template<typename launch_arg_t = empty_t, typename key_t, typename val_t,
-  typename comp_t>
-void mergesort(key_t* keys_input, val_t* vals_input, int count,
-  comp_t comp, context_t& context) {
-
-  enum { has_values = !std::is_same<val_t, empty_t>::value };
-
-  typedef typename conditional_typedef_t<launch_arg_t, 
-    launch_box_t<
-      arch_20_cta<128, 17>,
-      arch_35_cta<128, 11>,
-      arch_52_cta<128, 15>
-    >
-  >::type_t launch_t;
-
-  int nv = launch_t::nv(context);
-  int num_ctas = div_up(count, nv);
-  int num_passes = find_log2(num_ctas, true);
-
-  mem_t<key_t> keys_temp(num_passes ? count : 0, context);
-  key_t* keys_output = keys_temp.data();
-
-  mem_t<val_t> vals_temp(has_values && num_passes ? count : 0, context);
-  val_t* vals_output = vals_temp.data();
-
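-  // Pick the blocksort destination by pass parity: with an odd number of
-  // merge passes the blocksort writes to the temporaries, so the ping-pong
-  // below finishes with the sorted data back in the caller's arrays.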
-  key_t* keys_blocksort = (1 & num_passes) ? keys_output : keys_input;
-  val_t* vals_blocksort = (1 & num_passes) ? vals_output : vals_input;
-
-  auto k = [=] MGPU_DEVICE(int tid, int cta) {
-    typedef typename launch_t::sm_ptx params_t;
-    enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
-    typedef cta_sort_t<nt, vt, key_t, val_t> sort_t;
-
-    __shared__ union {
-      typename sort_t::storage_t sort;
-      key_t keys[nv];
-      val_t vals[nv];
-    } shared;
-
-    range_t tile = get_tile(cta, nv, count);
-
-    // Load the keys and values.
-    kv_array_t<key_t, val_t, vt> unsorted;
-    unsorted.keys = mem_to_reg_thread<nt, vt>(keys_input + tile.begin, tid, 
-      tile.count(), shared.keys);
-    if(has_values)
-      unsorted.vals = mem_to_reg_thread<nt, vt>(vals_input + tile.begin, tid,
-        tile.count(), shared.vals);
-
-    // Blocksort.
-    kv_array_t<key_t, val_t, vt> sorted = sort_t().block_sort(unsorted,
-      tid, tile.count(), comp, shared.sort);
-
-    // Store the keys and values.
-    reg_to_mem_thread<nt, vt>(sorted.keys, tid, tile.count(), 
-      keys_blocksort + tile.begin, shared.keys);
-    if(has_values)
-      reg_to_mem_thread<nt, vt>(sorted.vals, tid, tile.count(), 
-        vals_blocksort + tile.begin, shared.vals);
-  };
-
-  cta_transform<launch_t>(k, count, context);
-
-  if(1 & num_passes) {
-    std::swap(keys_input, keys_output);
-    std::swap(vals_input, vals_output);
-  }
-
-  for(int pass = 0; pass < num_passes; ++pass) {
-    int coop = 2<< pass;
-    mem_t<int> partitions = merge_sort_partitions(keys_input, count, coop,
-      nv, comp, context);
-    int* mp_data = partitions.data();
-
-    auto k = [=] MGPU_DEVICE(int tid, int cta) {
-      typedef typename launch_t::sm_ptx params_t;
-      enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
-
-      __shared__ union {
-        key_t keys[nv + 1];
-        int indices[nv];
-      } shared;
-
-      range_t tile = get_tile(cta, nv, count);
-
-      // Load the range for this CTA and merge the values into register.
-      merge_range_t range = compute_mergesort_range(count, cta, coop, nv, 
-        mp_data[cta + 0], mp_data[cta + 1]);
-
-      merge_pair_t<key_t, vt> merge = cta_merge_from_mem<bounds_lower, nt, vt>(
-        keys_input, keys_input, range, tid, comp, shared.keys);
-
-      // Store merged values back out.
-      reg_to_mem_thread<nt>(merge.keys, tid, tile.count(), 
-        keys_output + tile.begin, shared.keys);
-
-      if(has_values) {
-        // Transpose the indices from thread order to strided order.
-        array_t<int, vt> indices = reg_thread_to_strided<nt>(merge.indices,
-          tid, shared.indices);
-
-        // Gather the input values and merge into the output values.
-        transfer_two_streams_strided<nt>(vals_input + range.a_begin, 
-          range.a_count(), vals_input + range.b_begin, range.b_count(),
-          indices, tid, vals_output + tile.begin);
-      }
-    };
-    cta_transform<launch_t>(k, count, context);
-
-    std::swap(keys_input, keys_output);
-    std::swap(vals_input, vals_output);
-  }
-}
-
-// Key-only mergesort
-template<typename launch_arg_t = empty_t, typename key_t, typename comp_t>
-void mergesort(key_t* keys_input, int count, comp_t comp, 
-  context_t& context) {
-
-  mergesort<launch_arg_t>(keys_input, (empty_t*)nullptr, count, comp, 
-    context);
-}
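-
-// Illustrative usage sketch (assumes standard_context_t, fill_random and
-// less_t from the other mgpu headers; not part of the original header):
-//
-//   standard_context_t context;
-//   mem_t<int> data = fill_random(0, 100000, count, false, context);
-//   mergesort(data.data(), count, less_t<int>(), context);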
-
-END_MGPU_NAMESPACE
diff --git a/src/util/cuda/moderngpu/kernel_reduce.hxx b/src/util/cuda/moderngpu/kernel_reduce.hxx
deleted file mode 100644
index 28112218..00000000
--- a/src/util/cuda/moderngpu/kernel_reduce.hxx
+++ /dev/null
@@ -1,70 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "cta_reduce.hxx"
-#include "memory.hxx"
-#include "transform.hxx"
-#include "operators.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-template<typename launch_arg_t = empty_t, typename input_it, 
-  typename output_it, typename op_t>
-void reduce(input_it input, int count, output_it reduction, op_t op, 
-  context_t& context) {
-
-  typedef typename conditional_typedef_t<launch_arg_t, 
-    launch_params_t<128, 8>
-  >::type_t launch_t;
-
-  typedef typename std::iterator_traits<input_it>::value_type type_t;
-
-  int num_ctas = launch_t::cta_dim(context).num_ctas(count);
-  mem_t<type_t> partials(num_ctas, context);
-  type_t* partials_data = partials.data();
-
-  auto k = [=] MGPU_DEVICE(int tid, int cta) {
-    typedef typename launch_t::sm_ptx params_t;
-    enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
-    typedef cta_reduce_t<nt, type_t> reduce_t;
-    __shared__ typename reduce_t::storage_t shared_reduce;
-
-    // Load the data for the first tile for each cta.
-    range_t tile = get_tile(cta, nv, count);
-    array_t<type_t, vt> x = mem_to_reg_strided<nt, vt>(input + tile.begin, 
-      tid, tile.count());
-
-    // Reduce the multiple values per thread into a scalar.
-    type_t scalar;
-    strided_iterate<nt, vt>([&](int i, int j) {
-      scalar = i ? op(scalar, x[i]) : x[0];
-    }, tid, tile.count());
-
-    // Reduce to a scalar per CTA.
-    scalar = reduce_t().reduce(tid, scalar, shared_reduce, 
-      min(tile.count(), (int)nt), op, false);
-
-    if(!tid) {
-      if(1 == num_ctas) *reduction = scalar;
-      else partials_data[cta] = scalar;
-    }
-  };
-  cta_launch<launch_t>(k, num_ctas, context);
-
-  // Recursively call reduce until there's just one scalar.
-  if(num_ctas > 1)
-    reduce<launch_params_t<512, 4> >(partials_data, num_ctas, reduction, op, 
-      context);
-}
-
-template<typename launch_arg_t = empty_t, typename func_t, 
-  typename output_it, typename op_t>
-void transform_reduce(func_t f, int count, output_it reduction, op_t op, 
-  context_t& context) {
-
-  typedef typename std::iterator_traits<output_it>::value_type type_t;
-  reduce<launch_arg_t>(make_load_iterator<type_t>(f), count, reduction, op, 
-    context);
-}
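-
-// Illustrative example (not part of the original header): a sum of squares
-// over i in [0, count), reduced into a single device-side int:
-//
-//   mem_t<int> result(1, context);
-//   transform_reduce([]MGPU_DEVICE(int i) { return i * i; }, count,
-//     result.data(), plus_t<int>(), context);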
-
-END_MGPU_NAMESPACE
diff --git a/src/util/cuda/moderngpu/kernel_scan.hxx b/src/util/cuda/moderngpu/kernel_scan.hxx
deleted file mode 100644
index b5f30859..00000000
--- a/src/util/cuda/moderngpu/kernel_scan.hxx
+++ /dev/null
@@ -1,198 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "cta_reduce.hxx"
-#include "cta_scan.hxx"
-#include "memory.hxx"
-#include "operators.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-template<scan_type_t scan_type = scan_type_exc, 
-  typename launch_arg_t = empty_t, typename input_it, 
-  typename output_it, typename op_t, typename reduction_it>
-void scan_event(input_it input, int count, output_it output, op_t op, 
-  reduction_it reduction, context_t& context, cudaEvent_t event) {
-
-  typedef typename conditional_typedef_t<launch_arg_t, 
-    launch_box_t<
-      arch_20_cta<128, 11>,
-      arch_35_cta<128, 7>,
-      arch_52_cta<128, 11>
-    >
-  >::type_t launch_t;
-
-  typedef typename std::iterator_traits<input_it>::value_type type_t;
-
-  int num_ctas = launch_t::cta_dim(context).num_ctas(count);
-
-  if(num_ctas > 8) {
-    mem_t<type_t> partials(num_ctas, context);
-    type_t* partials_data = partials.data();
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Upsweep phase. Reduce each tile to a scalar and store to partials.
-
-    auto upsweep_k = [=] MGPU_DEVICE(int tid, int cta) {
-      typedef typename launch_t::sm_ptx params_t;
-      enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
-      typedef cta_reduce_t<nt, type_t> reduce_t;
-
-      __shared__ union {
-        typename reduce_t::storage_t reduce;
-      } shared;
-
-      // Load the tile's data into register.
-      range_t tile = get_tile(cta, nv, count);
-      array_t<type_t, vt> x = mem_to_reg_strided<nt, vt>(input + tile.begin,
-        tid, tile.count());
-
-      // Reduce the thread's values into a scalar.
-      type_t scalar = type_t();
-      strided_iterate<nt, vt>([&](int i, int j) {
-        scalar = i ? op(scalar, x[i]) : x[0];
-      }, tid, tile.count());
-
-      // Reduce across all threads.
-      type_t all_reduce = reduce_t().reduce(tid, scalar, shared.reduce, 
-        tile.count(), op);
-
-      // Store the final reduction to the partials.
-      if(!tid)
-        partials_data[cta] = all_reduce;
-    };
-    cta_transform<launch_t>(upsweep_k, count, context);
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Spine phase. Recursively call scan on the CTA partials.
-
-    scan_event<scan_type_exc>(partials_data, num_ctas, partials_data,
-      op, reduction, context, event);
-
-    // Record the event. This lets the caller wait on just the reduction 
-    // part of the operation. It's useful when writing the reduction to
-    // host-side page-locked memory; the caller can read out the value more
-    // quickly to allocate memory and launch the next kernel.
-    if(event)
-      cudaEventRecord(event, context.stream());
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Downsweep phase. Perform an intra-tile scan and add the scan of the 
-    // partials as carry-in.
-
-    auto downsweep_k = [=] MGPU_DEVICE(int tid, int cta) {
-      typedef typename launch_t::sm_ptx params_t;
-      enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
-      typedef cta_scan_t<nt, type_t> scan_t;
-
-      __shared__ union {
-        typename scan_t::storage_t scan;
-        type_t values[nv];
-      } shared;
-
-      // Load a tile to register in thread order.
-      range_t tile = get_tile(cta, nv, count);
-      array_t<type_t, vt> x = mem_to_reg_thread<nt, vt>(input + tile.begin, 
-        tid, tile.count(), shared.values);
-
-      // Scan the array with carry-in from the partials.
-      array_t<type_t, vt> y = scan_t().scan(tid, x, shared.scan, 
-        partials_data[cta], cta > 0, tile.count(), op, type_t(), 
-        scan_type).scan;
-
-      // Store the scanned values to the output.
-      reg_to_mem_thread<nt, vt>(y, tid, tile.count(), output + tile.begin, 
-        shared.values);    
-    };
-    cta_transform<launch_t>(downsweep_k, count, context);
-  
-  } else {
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Small input specialization. This is the non-recursive branch.
-
-    typedef launch_params_t<512, 3> spine_params_t;
-    auto spine_k = [=] MGPU_DEVICE(int tid, int cta) {
-     
-      enum { nt = spine_params_t::nt, vt = spine_params_t::vt, nv = nt * vt };
-      typedef cta_scan_t<nt, type_t> scan_t;
-
-      __shared__ union {
-        typename scan_t::storage_t scan;
-        type_t values[nv];
-      } shared;
-
-      type_t carry_in = type_t();
-      for(int cur = 0; cur < count; cur += nv) {
-        // Cooperatively load values into register.
-        int count2 = min<int>(count - cur, nv);
-        array_t<type_t, vt> x = mem_to_reg_thread<nt, vt>(input + cur, 
-          tid, count2, shared.values);
-
-        scan_result_t<type_t, vt> result = scan_t().scan(tid, x, shared.scan,
-          carry_in, cur > 0, count2, op, type_t(), scan_type);
-
-        // Store the scanned values back to global memory.
-        reg_to_mem_thread<nt, vt>(result.scan, tid, count2, 
-          output + cur, shared.values);
-        
-        // Roll the reduction into carry_in.
-        carry_in = result.reduction;
-      }
-
-      // Store the carry-out to the reduction pointer. This may be a
-      // discard_iterator_t if no reduction is wanted.
-      if(!tid)
-        *reduction = carry_in;
-    };
-    cta_launch<spine_params_t>(spine_k, 1, context);
-
-    // Record the event. This lets the caller wait on just the reduction 
-    // part of the operation. It's useful when writing the reduction to
-    // host-side page-locked memory; the caller can read out the value more
-    // quickly to allocate memory and launch the next kernel.
-    if(event)
-      cudaEventRecord(event, context.stream());
-  }
-}
-
-template<scan_type_t scan_type = scan_type_exc, 
-  typename launch_arg_t = empty_t, typename input_it, 
-  typename output_it, typename op_t, typename reduction_it>
-void scan(input_it input, int count, output_it output, op_t op, 
-  reduction_it reduction, context_t& context) {
-  return scan_event<scan_type, launch_arg_t>(input, count, output, op, 
-    reduction, context, 0);
-}
-
-template<scan_type_t scan_type = scan_type_exc, 
-  typename launch_arg_t = empty_t, 
-  typename input_it, typename output_it>
-void scan(input_it input, int count, output_it output, context_t& context) {
-
-  typedef typename std::iterator_traits<input_it>::value_type type_t;
-  scan<scan_type, launch_arg_t>(input, count, output, plus_t<type_t>(),
-    discard_iterator_t<type_t>(), context);
-}
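-
-// Illustrative example (not part of the original header): an exclusive scan
-// of input = { 3, 1, 4, 1, 5 } under plus_t<int> writes
-//   output = { 0, 3, 4, 8, 9 }
-// while scan_type_inc would instead write { 3, 4, 8, 9, 14 }.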
-
-template<typename type_t, scan_type_t scan_type = scan_type_exc, 
-  typename launch_arg_t = empty_t, typename func_t, typename output_it,
-  typename op_t, typename reduction_it>
-void transform_scan_event(func_t f, int count, output_it output, op_t op,
-  reduction_it reduction, context_t& context, cudaEvent_t event) {
-
-  scan_event<scan_type, launch_arg_t>(make_load_iterator<type_t>(f),
-    count, output, op, reduction, context, event);
-}
-
-template<typename type_t, scan_type_t scan_type = scan_type_exc, 
-  typename launch_arg_t = empty_t, typename func_t, typename output_it,
-  typename op_t, typename reduction_it>
-void transform_scan(func_t f, int count, output_it output, op_t op,
-  reduction_it reduction, context_t& context) {
-
-  transform_scan_event<type_t, scan_type, launch_arg_t>(f, count, output, op,
-    reduction, context, 0);
-}
-
-END_MGPU_NAMESPACE
diff --git a/src/util/cuda/moderngpu/kernel_segreduce.hxx b/src/util/cuda/moderngpu/kernel_segreduce.hxx
deleted file mode 100644
index 185c34ef..00000000
--- a/src/util/cuda/moderngpu/kernel_segreduce.hxx
+++ /dev/null
@@ -1,406 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-#include "search.hxx"
-#include "cta_load_balance.hxx"
-#include "cta_segscan.hxx"
-#include "transform.hxx"
-#include "memory.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-namespace detail {
-
-////////////////////////////////////////////////////////////////////////////////
-// cta_segreduce_t is common intra-CTA segmented reduction code for
-// these kernels. Should be cleaned up and moved to cta_segreduce.hxx.
-
-template<int nt, int vt, typename type_t>
-struct cta_segreduce_t {
-  typedef cta_segscan_t<nt, type_t> segscan_t;
-  
-  union storage_t {
-    typename segscan_t::storage_t segscan;
-    type_t values[nt * vt + 1];
-  };
-
-  // Values must be stored in storage.values on entry.
-  template<typename op_t, typename output_it>
-  MGPU_DEVICE void segreduce(merge_range_t merge_range, 
-    lbs_placement_t placement, array_t<bool, vt + 1> p, int tid, 
-    int cta, type_t init, op_t op, output_it output, 
-    type_t* carry_out_values, int* carry_out_codes, storage_t& storage) {
-
-    int cur_item = placement.a_index;
-    int begin_segment = placement.b_index;
-    int cur_segment = begin_segment;
-    bool carry_in = false;
-
-    const type_t* a_shared = storage.values - merge_range.a_begin;
-    type_t x[vt];
-    int segments[vt + 1];
-    iterate<vt>([&](int i) {
-      if(p[i]) {
-        // This is a data node, so accumulate and advance the data ID.
-        x[i] = a_shared[cur_item++];
-        if(carry_in) x[i] = op(x[i - 1], x[i]);
-        carry_in = true;
-      } else {
-        // This is a segment node, so advance the segment ID.
-        x[i] = init;
-        ++cur_segment;
-        carry_in = false;
-      }
-      segments[i] = cur_segment;
-    });
-    // Always flush at the end of the last thread.
-    bool overwrite = (nt - 1 == tid) && (!p[vt - 1] && p[vt]);
-    if(nt - 1 == tid) p[vt] = false;
-    if(!p[vt]) ++cur_segment;
-    segments[vt] = cur_segment;
-    overwrite = __syncthreads_or(overwrite);
-
-    // Get the segment ID for the next item. This lets us find an end flag
-    // for the last value in this thread.
-    bool has_head_flag = begin_segment < segments[vt - 1];
-    bool has_carry_out = p[vt - 1];
-
-    // Compute the carry-in for each thread.
-    segscan_result_t<type_t> result = segscan_t().segscan(tid, has_head_flag,
-      has_carry_out, x[vt - 1], storage.segscan, init, op);
-
-    // Add the carry-in back into each value and recompute the reductions.
-    type_t* x_shared = storage.values - placement.range.b_begin;
-    carry_in = result.has_carry_in && p[0];
-    iterate<vt>([&](int i) {
-      if(segments[i] < segments[i + 1]) {
-        // We've hit the end of this segment. Store the reduction to shared
-        // memory.
-        if(carry_in) x[i] = op(result.scan, x[i]);
-        x_shared[segments[i]] = x[i];
-        carry_in = false;
-      }
-    });
-    __syncthreads();
-
-    // Store the reductions for segments which begin in this tile. 
-    for(int i = merge_range.b_begin + tid; i < merge_range.b_end; i += nt)
-      output[i] = x_shared[i];
-
-    // Store the partial reduction for the segment which begins in the 
-    // preceding tile, if there is one.
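-    // The carry-out code packs the segment index into the upper bits and
-    // the overwrite flag into bit 0, so segreduce_fixup can decode both.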
-    if(!tid) {
-      if(segments[0] == merge_range.b_begin) segments[0] = -1;
-      int code = (segments[0]<< 1) | (int)overwrite;
-      carry_out_values[cta] = (segments[0] != -1) ?
-        x_shared[segments[0]] : 
-        init;
-      carry_out_codes[cta] = code;
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// Adds the carry-out for each segreduce CTA into the outputs.
-
-template<typename output_it, typename type_t, typename op_t>
-void segreduce_fixup(output_it output, const type_t* values,
-  const int* codes, int count, op_t op, type_t init,
-  context_t& context) {
-
-  enum { nt = 512 };
-  int num_ctas = div_up(count, nt);
-
-  mem_t<type_t> carry_out(num_ctas, context);
-  mem_t<int> codes_out(num_ctas, context);
-  type_t* carry_out_data = carry_out.data();
-  int* codes_data = codes_out.data();
-
-  auto k_fixup = [=]MGPU_DEVICE(int tid, int cta) {
-    typedef cta_segscan_t<nt, type_t> segscan_t;
-    __shared__ struct {
-      bool head_flags[nt];
-      typename segscan_t::storage_t segscan;
-    } shared;
-
-    range_t tile = get_tile(cta, nt, count);
-    int gid = tile.begin + tid;
-
-    ////////////////////////////////////////////////////////////////////////////
-    // As in the outer segmented reduce kernel, update the reductions for all
-    // segments that *start* in this CTA. That is, the first carry-out code
-    // for a segment must be mapped into this CTA to actually apply the 
-    // accumulation. This CTA will return a partial reduction for the segment
-    // that overlaps this CTA but starts in a preceding CTA.
-
-    // We don't need to worry about storing new overwrite bits as this kernel
-    // will always add carry-in values to empty segments.
-
-    int code0 = (gid - 1 >= 0 && gid - 1 < count) ? codes[gid - 1] : -1;
-    int code1 = (gid < count) ? codes[gid] : -1;
-    int code2 = (gid + 1 < count) ? codes[gid + 1] : -1;
-    type_t value = (gid < count) ? values[gid] : init;
-
-    int seg0 = code0>> 1;
-    int seg1 = code1>> 1;
-    int seg2 = code2>> 1;
-    bool has_head_flag = seg0 != seg1 || -1 == seg1;
-    bool has_carry_out = -1 != seg1 && seg1 == seg2;
-    bool has_end_flag = seg1 != seg2;
-
-    // Put the head flag in shared memory, because the last thread 
-    // participating in a reduction in the CTA needs to check the head flag
-    // for the first thread in the reduction.
-    shared.head_flags[tid] = has_head_flag;
-
-    segscan_result_t<type_t> result = segscan_t().segscan(tid, has_head_flag,
-      has_carry_out, value, shared.segscan, init, op);
-
-    bool carry_out_written = false;
-    if(-1 != seg1 && (has_end_flag || nt - 1 == tid)) {
-      // This is a valid reduction.
-      if(result.has_carry_in) 
-        value = op(value, result.scan);
-
-      if(0 == result.left_lane && !shared.head_flags[result.left_lane]) {
-        carry_out_data[cta] = value;
-        codes_data[cta] = seg1<< 1;
-        carry_out_written = true;
-      } else {
-        int left_code = codes[tile.begin + result.left_lane - 1];
-        if(0 == (1 & left_code))     // Add in the value already stored.
-          value = op(value, output[seg1]);
-        output[seg1] = value;
-      }
-    }
-
-    carry_out_written = __syncthreads_or(carry_out_written);
-    if(!carry_out_written && !tid)
-      codes_data[cta] = -1<< 1;
-  };
-  cta_launch<nt>(k_fixup, num_ctas, context);
-
-  if(num_ctas > 1)
-    segreduce_fixup(output, carry_out_data, codes_data, 
-      num_ctas, op, init, context);
-}
-
-} // namespace detail
-
-////////////////////////////////////////////////////////////////////////////////
-// Segmented reduction with loading from an input iterator. This does not
-// require explicit materialization of the load-balancing search.
-
-template<typename launch_arg_t = empty_t, typename input_it,
-  typename segments_it, typename output_it, typename op_t, typename type_t>
-void segreduce(input_it input, int count, segments_it segments, 
-  int num_segments, output_it output, op_t op, type_t init, 
-  context_t& context) {
-
-  typedef typename conditional_typedef_t<launch_arg_t, 
-    launch_box_t<
-      arch_20_cta<128, 11, 8>,
-      arch_35_cta<128,  7, 5>,
-      arch_52_cta<128, 11, 8>
-    >
-  >::type_t launch_t;
-
-  cta_dim_t cta_dim = launch_t::cta_dim(context);
-  int num_ctas = cta_dim.num_ctas(count + num_segments);
-
-  mem_t<type_t> carry_out(num_ctas, context);
-  mem_t<int> codes(num_ctas, context);
-  type_t* carry_out_data = carry_out.data();
-  int* codes_data = codes.data();
-
-  mem_t<int> mp = load_balance_partitions(count, segments, num_segments,
-    cta_dim.nv(), context);
-  const int* mp_data = mp.data();
-
-  auto k_reduce = [=]MGPU_DEVICE(int tid, int cta) {
-    typedef typename launch_t::sm_ptx params_t;
-    enum { nt = params_t::nt, vt = params_t::vt, vt0 = params_t::vt0 };
-    typedef detail::cta_segreduce_t<nt, vt, type_t> segreduce_t;
-
-    __shared__ union {
-      typename segreduce_t::storage_t segreduce;
-      type_t values[nt * vt + 1];
-      type_t indices[nt * vt + 2];
-    } shared;
-
-    merge_range_t merge_range = compute_merge_range(count, num_segments, 
-      cta, nt * vt, mp_data[cta], mp_data[cta + 1]);
-
-    // Cooperatively load values from input into shared.
-    mem_to_shared<nt, vt, vt0>(input + merge_range.a_begin, tid, 
-      merge_range.a_count(), shared.segreduce.values);
-
-    // Load segment data into the B region of shared. Search for the starting
-    // index of each thread for a merge.
-    int* b_shared = sizeof(type_t) > sizeof(int) ?
-      (int*)(shared.segreduce.values + merge_range.a_count()) :
-      ((int*)shared.segreduce.values + merge_range.a_count());
-    lbs_placement_t placement = cta_load_balance_place<nt, vt>(tid, 
-      merge_range, count, segments, num_segments, b_shared);
-
-    // Adjust the pointer so that dereferencing at the segment ID returns the
-    // offset of that segment.
-    b_shared -= placement.range.b_begin;
-    int cur_item = placement.a_index;
-    int cur_segment = placement.b_index;
-    array_t<bool, vt + 1> merge_bits;
-    iterate<vt + 1>([&](int i) {
-      bool p = cur_item < b_shared[cur_segment + 1];
-      if(p) ++cur_item;
-      else ++cur_segment;
-      merge_bits[i] = p;
-    });
-
-    // Compute the segmented reduction.
-    segreduce_t().segreduce(merge_range, placement, merge_bits, tid, cta, 
-      init, op, output, carry_out_data, codes_data, shared.segreduce);
-
-  };
-  cta_launch<launch_t>(k_reduce, num_ctas, context);
-
-  if(num_ctas > 1)
-    detail::segreduce_fixup(output, carry_out_data, codes_data, num_ctas,
-      op, init, context);
-}
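-
-// Illustrative example (not part of the original header): with
-// input = { 1, 2, 3, 4, 5 }, segment start offsets segments = { 0, 2, 2 }
-// (num_segments = 3), op = plus_t<int> and init = 0, segreduce writes
-//   output = { 3, 0, 12 }
-// The middle segment is empty, so it receives init.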
-
-////////////////////////////////////////////////////////////////////////////////
-
-template<typename launch_arg_t = empty_t, typename func_t,
-  typename segments_it, typename output_it, typename op_t, typename type_t>
-void transform_segreduce(func_t f, int count, segments_it segments, 
-  int num_segments, output_it output, op_t op, type_t init, 
-  context_t& context) {
-
-  segreduce<launch_arg_t>(make_load_iterator<type_t>(f), count, segments, 
-    num_segments, output, op, init, context);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// spmv - sparse matrix * vector.
-
-template<typename launch_arg_t = empty_t, typename matrix_it,
-  typename columns_it, typename vector_it, typename segments_it, 
-  typename output_it>
-void spmv(matrix_it matrix, columns_it columns, vector_it vector,
-  int count, segments_it segments, int num_segments, output_it output,
-  context_t& context) { 
-
-  typedef typename std::iterator_traits<matrix_it>::value_type type_t;
-  
-  transform_segreduce<launch_arg_t>([=]MGPU_DEVICE(int index) {
-    return matrix[index] * ldg(vector + columns[index]);    // sparse m * v.
-  }, count, segments, num_segments, output, plus_t<type_t>(), 
-    (type_t)0, context);
-}
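-
-// Illustrative note (not part of the original header): this is CSR spmv.
-// `matrix` holds the non-zero values, `columns` their column indices, and
-// `segments` the row-start offsets (num_segments = row count), so each
-// output[row] is the dot product of that sparse row with `vector`.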
-
-////////////////////////////////////////////////////////////////////////////////
-// lbs_segreduce
-
-template<typename launch_arg_t = empty_t, 
-  typename func_t, typename segments_it, typename pointers_t, 
-  typename output_it, typename op_t, typename type_t, typename... args_t>
-void lbs_segreduce(func_t f, int count, segments_it segments,
-  int num_segments, pointers_t caching_iterators, output_it output, op_t op,
-  type_t init, context_t& context, args_t... args) {
-
-  typedef typename conditional_typedef_t<launch_arg_t, 
-    launch_box_t<
-      arch_20_cta<128, 11, 8>,
-      arch_35_cta<128,  7, 5>,
-      arch_52_cta<128, 11, 8>
-    >
-  >::type_t launch_t;
-
-  typedef tuple_iterator_value_t<pointers_t> value_t;
-
-  cta_dim_t cta_dim = launch_t::cta_dim(context);
-  int num_ctas = cta_dim.num_ctas(count + num_segments);
-
-  mem_t<type_t> carry_out(num_ctas, context);
-  mem_t<int> codes(num_ctas, context);
-  type_t* carry_out_data = carry_out.data();
-  int* codes_data = codes.data();
-
-  mem_t<int> mp = load_balance_partitions(count, segments, num_segments,
-    cta_dim.nv(), context);
-  const int* mp_data = mp.data();
-
-  auto k_reduce = [=]MGPU_DEVICE(int tid, int cta, args_t... args) {
-    typedef typename launch_t::sm_ptx params_t;
-    enum { nt = params_t::nt, vt = params_t::vt, vt0 = params_t::vt0 };
-    typedef cta_load_balance_t<nt, vt> load_balance_t;
-    typedef detail::cached_segment_load_t<nt, pointers_t> cached_load_t;
-    typedef detail::cta_segreduce_t<nt, vt, type_t> segreduce_t;
-
-    __shared__ union {
-      typename load_balance_t::storage_t lbs;
-      typename cached_load_t::storage_t cached;
-      typename segreduce_t::storage_t segreduce;
-      type_t values[nt * vt + 1];
-    } shared;
-
-    // Compute the load-balancing search and materialize (index, seg, rank)
-    // arrays.
-    auto lbs = load_balance_t().load_balance(count, segments, num_segments,
-      tid, cta, mp_data, shared.lbs);
-
-    // Load from the cached iterators. Use the placement range, not the
-    // merge-path range, for situating the segments.
-    array_t<value_t, vt> cached_values = cached_load_t::template load<vt0>(
-      tid, lbs.merge_range.a_count(), lbs.placement.range.b_range(), 
-      lbs.segments, shared.cached, caching_iterators);
-
-    // Call the user-supplied functor f.
-    array_t<type_t, vt> strided_values;
-    strided_iterate<nt, vt, vt0>([&](int i, int j) {
-      int index = lbs.merge_range.a_begin + j;
-      int seg = lbs.segments[i];
-      int rank = lbs.ranks[i];
-
-      strided_values[i] = f(index, seg, rank, cached_values[i], args...);
-    }, tid, lbs.merge_range.a_count());
-
-    // Store the values back to shared memory for segmented reduction.
-    reg_to_shared_strided<nt, vt>(strided_values, tid, 
-      shared.segreduce.values);
-
-    // Split the flags.
-    array_t<bool, vt + 1> merge_bits;
-    iterate<vt + 1>([&](int i) {
-      merge_bits[i] = 0 != ((1<< i) & lbs.merge_flags);
-    });
-
-    // Compute the segmented reduction.
-    segreduce_t().segreduce(lbs.merge_range, lbs.placement, merge_bits,
-      tid, cta, init, op, output, carry_out_data, codes_data, 
-      shared.segreduce);
-  };
-  cta_launch<launch_t>(k_reduce, num_ctas, context, args...);
-
-  if(num_ctas > 1)
-    detail::segreduce_fixup(output, carry_out_data, codes_data, num_ctas,
-      op, init, context);
-}
-
-// lbs_segreduce with no caching iterators.
-template<typename launch_arg_t = empty_t, 
-  typename func_t, typename segments_it, typename output_it, typename op_t,
-  typename type_t, typename... args_t>
-void lbs_segreduce(func_t f, int count, segments_it segments,
-  int num_segments, output_it output, op_t op, type_t init, 
-  context_t& context, args_t... args) {
-
-  lbs_segreduce<launch_arg_t>(
-    [=]MGPU_DEVICE(int index, int seg, int rank, tuple<>, args_t... args) {
-      return f(index, seg, rank, args...);
-    },
-    count, segments, num_segments, tuple<>(), output, op, init, context,
-    args...
-  );
-}
-
-END_MGPU_NAMESPACE
diff --git a/src/util/cuda/moderngpu/kernel_segsort.hxx b/src/util/cuda/moderngpu/kernel_segsort.hxx
deleted file mode 100644
index 217a45ad..00000000
--- a/src/util/cuda/moderngpu/kernel_segsort.hxx
+++ /dev/null
@@ -1,444 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "search.hxx"
-#include "cta_segsort.hxx"
-#include "cta_scan.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-namespace detail {
-
-template<typename launch_arg_t, typename key_t, typename val_t, 
-  typename comp_t>
-struct segsort_t {
-  enum { has_values = !std::is_same<val_t, empty_t>::value };
-  typedef typename conditional_typedef_t<launch_arg_t, 
-    launch_box_t<
-      arch_20_cta<128, 15>,
-      arch_35_cta<128, 11>,
-      arch_52_cta<128, 15>
-    >
-  >::type_t launch_t;
-
-  context_t& context;
-  comp_t comp;
-  cta_dim_t cta_dim;
-  int count, nv, num_ctas, num_passes;
-
-  mem_t<key_t> keys_temp;
-  mem_t<val_t> vals_temp;
-
-  key_t* keys_source, *keys_dest, *keys_blocksort;
-  val_t* vals_source, *vals_dest, *vals_blocksort;
-
-  mem_t<range_t> merge_ranges;
-  mem_t<merge_range_t> merge_list;
-  mem_t<int> compressed_ranges, copy_list, copy_status;
-  mem_t<int2> op_counters;
-
-  segsort_t(key_t* keys, val_t* vals, int count_, comp_t comp_, 
-    context_t& context_) : count(count_), comp(comp_), context(context_) { 
-
-    nv = launch_t::nv(context);
-    num_ctas = div_up(count, nv);
-    num_passes = find_log2(num_ctas, true);
-    
-    int capacity = num_ctas;                 // log(num_ctas) per pass.
-    for(int i = 0; i < num_passes; ++i)
-      capacity += div_up(num_ctas, 1<< i);
-
-    if(num_passes              ) keys_temp = mem_t<key_t>(count, context);
-    if(num_passes && has_values) vals_temp = mem_t<val_t>(count, context);
-
-    keys_source = keys;
-    vals_source = vals;
-    keys_dest = keys_temp.data();
-    vals_dest = vals_temp.data();
-
-    // The blocksort passes outputs to these arrays.
-    keys_blocksort = (1 & num_passes) ? keys_dest : keys_source;
-    vals_blocksort = (1 & num_passes) ? vals_dest : vals_source;
-
-    // Allocate space for temporary variables.
-    merge_ranges = mem_t<range_t>(capacity, context);
-    merge_list = mem_t<merge_range_t>(num_ctas, context);
-    compressed_ranges = mem_t<int>(num_ctas, context);
-    copy_list = mem_t<int>(num_ctas, context);
-    copy_status = mem_t<int>(num_ctas, context);
-    op_counters = fill<int2>(int2(), num_passes, context);
-  }
-
-  template<bool sort_indices = false, typename keys_it, typename vals_it, 
-    typename segments_it>
-  void blocksort_segments(keys_it keys, vals_it vals, segments_it segments, 
-    int num_segments) {
-
-    // Distribute the segment descriptors to different CTAs.
-    mem_t<int> partitions = binary_search_partitions<bounds_lower>(segments, 
-      count, num_segments, nv, context);
-    const int* mp_data = partitions.data();
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Block sort the input. The position of the first and last segment 
-    // descriptors are stored to merge_ranges.
-
-    comp_t comp = this->comp;
-    int count = this->count;
-    key_t* keys_blocksort = this->keys_blocksort;
-    val_t* vals_blocksort = this->vals_blocksort;
-    int* compressed_ranges_data = compressed_ranges.data();
-
-    auto blocksort_k = [=] MGPU_DEVICE(int tid, int cta) {
-      typedef typename launch_t::sm_ptx params_t;
-      enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
-      typedef cta_load_head_flags<nt, vt> load_head_flags_t;
-      typedef cta_segsort_t<nt, vt, key_t, val_t> sort_t;
-
-      __shared__ union {
-        typename load_head_flags_t::storage_t load_head_flags;
-        typename sort_t::storage_t sort;
-        key_t keys[nv + 1];
-        val_t vals[nv];
-      } shared;
-
-      // Load the partitions for the segment descriptors and extract head 
-      // flags for each key.
-      int p[2] = { mp_data[cta], mp_data[cta + 1] };
-      int head_flags = load_head_flags_t().load(segments, p, tid, cta, 
-        count, shared.load_head_flags);
-
-      // Load the keys and values.
-      range_t tile = get_tile(cta, nv, count);
-
-      kv_array_t<key_t, val_t, vt> unsorted;
-      unsorted.keys = mem_to_reg_thread<nt, vt>(keys + tile.begin, tid, 
-        tile.count(), shared.keys);
-      if(sort_indices) {
-        // If we're sorting indices, load from the counting_iterator_t directly
-        // without staging through shared memory.
-        iterate<vt>([&](int i) {
-          unsorted.vals[i] = vals[tile.begin + vt * tid + i];
-        });
-      } else if(has_values) {
-        // If we're storing actual values, stage through shared memory.
-        unsorted.vals = mem_to_reg_thread<nt, vt>(vals + tile.begin, tid,
-          tile.count(), shared.vals);
-      }
-
-      // Blocksort.
-      range_t active { };
-      kv_array_t<key_t, val_t, vt> sorted = sort_t().block_sort(unsorted,
-        tid, tile.count(), head_flags, active, comp, shared.sort);
-
-      // Store the keys and values.
-      reg_to_mem_thread<nt, vt>(sorted.keys, tid, tile.count(), 
-        keys_blocksort + tile.begin, shared.keys);
-      if(has_values)
-        reg_to_mem_thread<nt, vt>(sorted.vals, tid, tile.count(), 
-          vals_blocksort + tile.begin, shared.vals);
-
-      // Store the active range for the entire CTA. These are used by the 
-      // segmented partitioning kernels.
-      if(!tid)
-        compressed_ranges_data[cta] = bfi(active.end, active.begin, 16, 16);
-    };
-    cta_transform<launch_t>(blocksort_k, count, context);
-
-    if(1 & num_passes) {
-      std::swap(this->keys_source, this->keys_dest);
-      std::swap(this->vals_source, this->vals_dest);
-    }
-  }
-
-  void merge_passes() {
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Execute a partitioning and a merge for each mergesort pass.
-
-    comp_t comp = this->comp;
-    int num_ranges = num_ctas;
-    int num_partitions = num_ctas + 1;
-    int count = this->count;
-    int nv = this->nv;
-
-    key_t* keys_source = this->keys_source;
-    val_t* vals_source = this->vals_source;
-    key_t* keys_dest = this->keys_dest;
-    val_t* vals_dest = this->vals_dest;
-
-    range_t* source_ranges = merge_ranges.data();
-    range_t* dest_ranges = merge_ranges.data();
-
-    const int* compressed_ranges_data = compressed_ranges.data();
-    int* copy_status_data = copy_status.data();
-    int* copy_list_data = copy_list.data();
-    merge_range_t* merge_list_data = merge_list.data();
-    int2* op_counters_data = op_counters.data();
-
-    for(int pass = 0; pass < num_passes; ++pass) {
-      int coop = 2<< pass;
-
-      //////////////////////////////////////////////////////////////////////////
-      // Partition the data within its segmented mergesort list.
-
-      enum { nt = 64 };
-      int num_partition_ctas = div_up(num_partitions, nt - 1);
-
-      auto partition_k = [=] MGPU_DEVICE(int tid, int cta) {
-        typedef cta_scan_t<nt, int> scan_t;
-        __shared__ union {
-          typename scan_t::storage_t scan;
-          int partitions[nt + 1];
-          struct { int merge_offset, copy_offset; };
-        } shared;
-
-        int partition = (nt - 1) * cta + tid;
-        int first = nv * partition;
-        int count2 = min(nv, count - first);
-
-        int mp0 = 0;
-        bool active = (tid < nt - 1) && (partition < num_partitions - 1);
-        int range_index = partition>> pass;
-
-        if(partition < num_partitions) {
-
-          merge_range_t range = compute_mergesort_range(count, partition, 
-            coop, nv);
-          int diag = min(nv * partition - range.a_begin, range.total());
-
-          int indices[2] = { 
-            min(num_ranges - 1, ~1 & range_index), 
-            min(num_ranges - 1, 1 | range_index) 
-          };
-          range_t ranges[2];
-
-          if(pass > 0) {
-            ranges[0] = source_ranges[indices[0]];
-            ranges[1] = source_ranges[indices[1]];
-          } else {
-            iterate<2>([&](int i) {
-              int compressed = compressed_ranges_data[indices[i]];
-              int first = nv * indices[i];
-
-              ranges[i] = range_t { 0x0000ffff & compressed, compressed>> 16 };
-              if(nv != ranges[i].begin) ranges[i].begin += first;
-              else ranges[i].begin = count;
-              if(-1 != ranges[i].end) ranges[i].end += first;
-            });
-          }
-
-          range_t inner = { 
-            ranges[0].end, 
-            max(range.b_begin, ranges[1].begin) 
-          };
-          range_t outer = { 
-            min(ranges[0].begin, ranges[1].begin),
-            max(ranges[0].end, ranges[1].end)
-          };
-
-          // Segmented merge path on inner.
-          mp0 = segmented_merge_path(keys_source, range, inner, diag, comp);
-
-          // Store outer merge range.
-          if(active && 0 == diag)
-            dest_ranges[range_index / 2] = outer;
-        }
-        shared.partitions[tid] = mp0;
-        __syncthreads();
-
-        int mp1 = shared.partitions[tid + 1];
-        __syncthreads();
-
-        // Update the merge range to include partitioning.
-        merge_range_t range = compute_mergesort_range(count, partition, coop, 
-          nv, mp0, mp1);
-
-        // Merge if the source interval does not exactly cover the destination
-        // interval. Otherwise copy or skip.
-        range_t interval = (1 & range_index) ? 
-          range.b_range() : range.a_range();
-        bool merge_op = false;
-        bool copy_op = false;
-
-        // Create a segsort job.
-        if(active) {
-          merge_op = (first != interval.begin) || (interval.count() != count2);
-          copy_op = !merge_op && (!pass || !copy_status_data[partition]);
-
-          // Use the b_end component to store the index of the destination tile.
-          // The actual b_end can be inferred from a_count and the length of 
-          // the input array.
-          range.b_end = partition;
-        }
-
-        // Scan the counts of merges and copies.
-        scan_result_t<int> merge_scan = scan_t().scan(tid, (int)merge_op, 
-          shared.scan);
-        scan_result_t<int> copy_scan = scan_t().scan(tid, (int)copy_op, 
-          shared.scan);
-
-        // Increment the operation counters by the totals.
-        if(!tid) {
-          shared.merge_offset = atomicAdd(&op_counters_data[pass].x, 
-            merge_scan.reduction);
-          shared.copy_offset = atomicAdd(&op_counters_data[pass].y, 
-            copy_scan.reduction);
-        }
-        __syncthreads();
-
-        if(active) {
-          copy_status_data[partition] = !merge_op;
-          if(merge_op)
-            merge_list_data[shared.merge_offset + merge_scan.scan] = range;
-          if(copy_op)
-            copy_list_data[shared.copy_offset + copy_scan.scan] = partition;
-        }
-      };
-      cta_launch<nt>(partition_k, num_partition_ctas, context);
-
-      source_ranges = dest_ranges;
-      num_ranges = div_up(num_ranges, 2);
-      dest_ranges += num_ranges;
-
-      //////////////////////////////////////////////////////////////////////////
-      // Merge or copy unsorted tiles.
-
-      auto merge_k = [=] MGPU_DEVICE(int tid, int cta) {
-        typedef typename launch_t::sm_ptx params_t;
-        enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
-
-        __shared__ union {
-          key_t keys[nv + 1];
-          int indices[nv];
-        } shared;
-
-        merge_range_t range = merge_list_data[cta];
-
-        int tile = range.b_end;
-        int first = nv * tile;
-        int count2 = min((int)nv, count - first);
-        range.b_end = range.b_begin + (count2 - range.a_count());
-
-        int compressed_range = compressed_ranges_data[tile];
-        range_t active = {
-          0x0000ffff & compressed_range,
-          compressed_range>> 16
-        };
-        load_two_streams_shared<nt, vt>(keys_source + range.a_begin, 
-          range.a_count(), keys_source + range.b_begin, range.b_count(),
-          tid, shared.keys);
-
-        // Run a merge path search to find the starting point for each thread
-        // to merge. If the entire warp fits into the already-sorted segments,
-        // we can skip sorting it and leave its keys in shared memory.
-        int list_parity = 1 & (tile>> pass);
-        if(list_parity) active = range_t { 0, active.begin };
-        else active = range_t { active.end, nv };
-
-        int warp_offset = vt * (~(warp_size - 1) & tid);
-        bool sort_warp = list_parity ?
-          (warp_offset < active.end) : 
-          (warp_offset + vt * warp_size >= active.begin);
-   
-        merge_pair_t<key_t, vt> merge;
-        merge_range_t local_range = range.to_local();
-        if(sort_warp) {
-          int diag = vt * tid;
-          int mp = segmented_merge_path(shared.keys, local_range,
-            active, diag, comp);
-
-          merge_range_t partitioned = local_range.partition(mp, diag);
-          merge = segmented_serial_merge<vt>(shared.keys, partitioned,
-            active, comp, false);
-        } else {
-          iterate<vt>([&](int i) {
-            merge.indices[i] = vt * tid + i;
-          });
-        }
-        __syncthreads();
-
-        // Store keys to global memory.
-        if(sort_warp)
-          reg_to_shared_thread<nt, vt>(merge.keys, tid, shared.keys, false);
-        __syncthreads();
-
-        shared_to_mem<nt, vt>(shared.keys, tid, count2, keys_dest + first);
-
-        if(has_values) {
-          // Transpose the indices from thread order to strided order.
-          array_t<int, vt> indices = reg_thread_to_strided<nt>(merge.indices,
-            tid, shared.indices);
-
-          // Gather the input values and merge into the output values.
-          transfer_two_streams_strided<nt>(vals_source + range.a_begin, 
-            range.a_count(), vals_source + range.b_begin, range.b_count(), 
-            indices, tid, vals_dest + first);
-        }
-      };
-      cta_launch<launch_t>(merge_k, &op_counters_data[pass].x, context);
-
-      auto copy_k = [=] MGPU_DEVICE(int tid, int cta) {
-        typedef typename launch_t::sm_ptx params_t;
-        enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
-
-        int tile = copy_list_data[cta];
-        int first = nv * tile;
-        int count2 = min((int)nv, count - first);
-
-        mem_to_mem<nt, vt>(keys_source + first, tid, count2, 
-          keys_dest + first);
-
-        if(has_values)
-          mem_to_mem<nt, vt>(vals_source + first, tid, count2, 
-            vals_dest + first);
-      };
-      cta_launch<launch_t>(copy_k, &op_counters_data[pass].y, context);
-
-      std::swap(keys_source, keys_dest);
-      std::swap(vals_source, vals_dest);
-    }    
-  }
-};
-
-} // namespace detail
-
-// Key-value mergesort.
-template<typename launch_arg_t = empty_t, typename key_t, typename val_t,
-  typename seg_it, typename comp_t>
-void segmented_sort(key_t* keys, val_t* vals, int count, seg_it segments, 
-  int num_segments, comp_t comp, context_t& context) {
-
-  detail::segsort_t<launch_arg_t, key_t, val_t, comp_t> 
-    segsort(keys, vals, count, comp, context);
-
-  segsort.blocksort_segments(keys, vals, segments, num_segments);
-  segsort.merge_passes();
-}
-
-// Key-value mergesort. Automatically generate indices to sort as values.
-template<typename launch_arg_t = empty_t, typename key_t, typename seg_it, 
-  typename comp_t>
-void segmented_sort_indices(key_t* keys, int* indices, int count, 
-  seg_it segments, int num_segments, comp_t comp, context_t& context) {
-
-  detail::segsort_t<launch_arg_t, key_t, int, comp_t> 
-    segsort(keys, indices, count, comp, context);
-
-  segsort.template blocksort_segments<true>(keys, counting_iterator_t<int>(), 
-    segments, num_segments);
-  segsort.merge_passes();
-}
-
-// Key-only segmented sort
-template<typename launch_arg_t = empty_t, typename key_t, typename seg_it, 
-  typename comp_t>
-void segmented_sort(key_t* keys, int count, seg_it segments, 
-  int num_segments, comp_t comp, context_t& context) {
-
-  segmented_sort<launch_arg_t>(keys, (empty_t*)nullptr, count,
-    segments, num_segments, comp, context);
-}
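-
-// Illustrative usage sketch (not part of the original header): sort keys
-// independently inside each segment, where `segments` holds the begin
-// offset of every segment:
-//
-//   segmented_sort(keys.data(), count, segments.data(), num_segments,
-//     less_t<int>(), context);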
-
-
-END_MGPU_NAMESPACE
diff --git a/src/util/cuda/moderngpu/kernel_sortedsearch.hxx b/src/util/cuda/moderngpu/kernel_sortedsearch.hxx
deleted file mode 100644
index c614175f..00000000
--- a/src/util/cuda/moderngpu/kernel_sortedsearch.hxx
+++ /dev/null
@@ -1,64 +0,0 @@
-#pragma once
-#include "cta_merge.hxx"
-#include "search.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-template<bounds_t bounds, typename launch_arg_t = empty_t,
-  typename needles_it, typename haystack_it, typename indices_it,
-  typename comp_it>
-void sorted_search(needles_it needles, int num_needles, haystack_it haystack,
-  int num_haystack, indices_it indices, comp_it comp, context_t& context) {
-
-  typedef typename conditional_typedef_t<launch_arg_t, 
-    launch_box_t<
-      arch_20_cta<128, 15>,
-      arch_35_cta<128, 11>,
-      arch_52_cta<128, 15>
-    >
-  >::type_t launch_t;
-
-  typedef typename std::iterator_traits<needles_it>::value_type type_t;
-
-  // Partition the needles and haystacks into tiles.
-  mem_t<int> partitions = merge_path_partitions<bounds>(needles, num_needles,
-    haystack, num_haystack, launch_t::nv(context), comp, context);
-  const int* mp_data = partitions.data();
-
-  auto k = [=]MGPU_DEVICE(int tid, int cta) {
-    typedef typename launch_t::sm_ptx params_t;
-    enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
-    
-    __shared__ union {
-      type_t keys[nv + 1];
-      int indices[nv];
-    } shared;
-
-    // Load the range for this CTA and merge the values into register.
-    int mp0 = mp_data[cta + 0];
-    int mp1 = mp_data[cta + 1];
-    merge_range_t range = compute_merge_range(num_needles, num_haystack, cta,
-      nv, mp0, mp1);
-
-    // Merge the needle and haystack values.
-    merge_pair_t<type_t, vt> merge = cta_merge_from_mem<bounds, nt, vt>(
-      needles, haystack, range, tid, comp, shared.keys);
-
-    // Store the needle indices to shared memory.
-    iterate<vt>([&](int i) {
-      if(merge.indices[i] < range.a_count()) {
-        int needle = merge.indices[i];
-        int haystack = range.b_begin + vt * tid + i - needle;
-        shared.indices[needle] = haystack;
-      }
-    });
-    __syncthreads();
-
-    shared_to_mem<nt, vt>(shared.indices, tid, range.a_count(), 
-      indices + range.a_begin);
-  };
-
-  cta_transform<launch_t>(k, num_needles + num_haystack, context);
-}
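-
-// Illustrative example (not part of the original header): with bounds_lower,
-// needles = { 1, 3, 5 } and haystack = { 2, 3, 4 } yield
-//   indices = { 0, 1, 3 }
-// i.e. the lower-bound insertion point of every needle, computed by one
-// cooperative merge instead of a binary search per needle.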
-
-END_MGPU_NAMESPACE
diff --git a/src/util/cuda/moderngpu/kernel_workcreate.hxx b/src/util/cuda/moderngpu/kernel_workcreate.hxx
deleted file mode 100644
index 69494c90..00000000
--- a/src/util/cuda/moderngpu/kernel_workcreate.hxx
+++ /dev/null
@@ -1,272 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "search.hxx"
-#include "cta_load_balance.hxx"
-#include "kernel_scan.hxx"
-#include "tuple.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-// experimental feature
-namespace expt {
-
-template<typename launch_arg_t, typename segments_it>
-struct workcreate_t {
-  typedef typename conditional_typedef_t<launch_arg_t, 
-    launch_box_t<
-      arch_20_cta<128, 11, 8>,
-      arch_35_cta<128,  7, 5>,
-      arch_52_cta<128, 11, 8>
-    >
-  >::type_t launch_t;
-
-  segments_it segments;
-  int num_segments;
-  int count;
-  context_t& context;
-
-  cta_dim_t cta_dim;
-  int num_ctas;
-
-  mem_t<int> mp;
-  mem_t<short> bits;
-  mem_t<int2> cta_offsets;
-  int2 cta_total;
-
-  struct add_int2_t {
-    MGPU_HOST_DEVICE int2 operator()(int2 a, int2 b) const {
-      return make_int2(a.x + b.x, a.y + b.y);
-    }
-  };
-
-public:
-
-  struct count_t {
-    int count;
-    int num_segments;
-  };
-
-  workcreate_t(int count_, segments_it segments_, int num_segments_,
-    context_t& context_) : 
-    count(count_), segments(segments_), num_segments(num_segments_),
-    context(context_) {
-
-    // Compute the number of CTAs.
-    cta_dim = launch_t::cta_dim(context);
-    num_ctas = cta_dim.num_ctas(count + num_segments);
-
-    mp = load_balance_partitions(count, segments, num_segments, cta_dim.nv(), 
-      context);
-
-    bits = mem_t<short>(num_ctas * cta_dim.nt, context);
-
-    cta_offsets = mem_t<int2>(num_ctas, context);
-  }
-
-  // f(int index, int seg, int rank, tuple<...> desc) returns the number
-  // of work-items to create.
-  template<typename func_t, typename pointers_t>
-  count_t upsweep(func_t f, pointers_t caching_iterators) {
-    
-    const int* mp_data = mp.data();
-    short* bits_data = bits.data();
-    int2* counts_data = cta_offsets.data();
-    int count = this->count;
-    auto segments = this->segments;
-    int num_segments = this->num_segments;
-
-    typedef tuple_iterator_value_t<pointers_t> value_t;
-    auto upsweep_k = [=]MGPU_DEVICE(int tid, int cta) {
-      typedef typename launch_t::sm_ptx params_t;
-      enum { nt = params_t::nt, vt = params_t::vt, vt0 = params_t::vt0 };
-      typedef cta_reduce_t<nt, int2> reduce_t;
-      typedef cta_load_balance_t<nt, vt> load_balance_t;
-      typedef detail::cached_segment_load_t<nt, pointers_t> cached_load_t;
-
-      static_assert(vt <= 16, "mgpu::workcreate_t vt must be <= 16.");
-
-      __shared__ union {
-        typename reduce_t::storage_t reduce;
-        typename load_balance_t::storage_t lbs;
-        typename cached_load_t::storage_t cached;
-      } shared;
-
-      // Compute the load-balancing search and materialize (index, seg, rank) 
-      // arrays.
-      auto lbs = load_balance_t().load_balance(count, segments, num_segments,
-        tid, cta, mp_data, shared.lbs);
-
-      // Call the user-supplied functor f.
-      short segment_bits = 0;
-      int work_items = 0;
-
-      // Load from the cached iterators. Use the placement range, not the
-      // merge-path range, for situating the segments.
-      array_t<value_t, vt> cached_values = cached_load_t::template load<vt0>(
-        tid, lbs.merge_range.a_count(), lbs.placement.range.b_range(), 
-        lbs.segments, shared.cached, caching_iterators);
-      
-      strided_iterate<nt, vt, vt0>([&](int i, int j) {
-        int index = lbs.merge_range.a_begin + j;
-        int seg = lbs.segments[i];
-        int rank = lbs.ranks[i];
-
-        int work_count = f(index, seg, rank, cached_values[i]);
-
-        if(work_count > 0) segment_bits |= 1<< i;
-        work_items += work_count;
-      }, tid, lbs.merge_range.a_count());
-
-      // Store the worker bits for this thread.
-      bits_data[nt * cta + tid] = segment_bits;
-
-      // Scan the segment and work-item counts.
-      int2 reduction = reduce_t().reduce(tid, 
-        make_int2(popc(segment_bits), work_items), shared.reduce,
-        nt, add_int2_t(), false);
-      if(!tid) counts_data[cta] = reduction;
-    };
-    cta_launch<launch_t>(upsweep_k, num_ctas, context);
-
-    // Scan the partial reductions.
-    mem_t<int2> counts_host(1, context, memory_space_host);
-    scan_event(counts_data, num_ctas, counts_data, add_int2_t(),
-      counts_host.data(), context, context.event());
-    cudaEventSynchronize(context.event());  
-
-    cta_total = counts_host.data()[0];
-    return count_t { cta_total.y, cta_total.x };
-  }
-
-  // upsweep without caching iterators.
-  template<typename func_t>
-  count_t upsweep(func_t f) {
-    return upsweep(
-      [=]MGPU_DEVICE(int index, int seg, int rank, tuple<>) {
-        return f(index, seg, rank);
-      }, 
-      tuple<>()
-    );
-  }
-
-  // f(int dest_seg, int index, int source_seg, int rank, tuple<...> desc)
-  // returns the number of work-items to create.
-  template<typename func_t, typename pointers_t, typename... args_t>
-  mem_t<int> downsweep(func_t f, pointers_t caching_iterators, args_t... args) {
-    // Input
-    const int* mp_data = mp.data();
-    const short* bits_data = bits.data();
-    const int2* counts_data = cta_offsets.data();
-    int count = this->count;
-    auto segments = this->segments;
-    int num_segments = this->num_segments;
-
-    // Output.
-    int num_dest_segments = cta_total.x;
-    mem_t<int> segments_result(num_dest_segments, context);
-    int* segments_output = segments_result.data();
-
-   // typedef tuple_iterator_value_t<pointers_t> value_t;
-   // typedef tuple<int> value_t;
-    auto downsweep_k = [=]MGPU_DEVICE(int tid, int cta, args_t... args) {
-      typedef typename launch_t::sm_ptx params_t;
-      enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
-      typedef cta_scan_t<nt, int> scan_t;
-      
-      // Note that this is a struct rather than the typical union. All
-      // three shared allocations must remain valid during the callbacks
-      // into f.
-      __shared__ struct {
-        int indices[nv + 2];
-        short targets[nv];
-        typename scan_t::storage_t scan;
-      } shared;
-
-      // Decode the bits signifying work creation and compact them.
-      int segment_bits = bits_data[nt * cta + tid];
-      strided_iterate<nt, vt>([&](int i, int j) {
-        int work_create = 0 != ((1<< i) & segment_bits);
-        shared.indices[j] = work_create;
-      }, tid);
-      __syncthreads();
-
-      // Do a parallel scan of the work-create flags. Compact the indices
-      // of the work-creating items into shared.targets.
-      array_t<int, vt> flags = shared_to_reg_thread<nt, vt>(
-        shared.indices, tid);
-      scan_result_t<int> scan = scan_t().scan(tid, reduce(flags), shared.scan);
-      iterate<vt>([&](int i) {
-        if(flags[i]) shared.targets[scan.scan++] = (short)(vt * tid + i);
-      });
-      
-      // Use load-balancing search to fill shared memory with the segment of
-      // each in-range work-item.
-      lbs_fill_t fill = cta_load_balance_fill<nt, vt>(count, segments,
-        num_segments, tid, cta, mp_data, shared.indices);
-      const int* a_shared = shared.indices;
-      const int* b_shared = shared.indices + fill.b_offset;
-
-      int num_items = scan.reduction;
-      int segments_dest = counts_data[cta].x;
-      int work_item_dest = counts_data[cta].y;
-
-      int num_rounds = div_up(num_items, nt);
-      for(int i = 0; i < num_rounds; ++i) {
-        int j = i * nt + tid;
-        int dest_seg = segments_dest + j;
-        int work_count = 0;
-        if(j < num_items) {
-          // Lookup the segment info.
-          int cta_index = shared.targets[j];
-          int seg = a_shared[cta_index];
-          int seg_begin = b_shared[seg];
-          int index = fill.range.a_begin + cta_index;
-          int rank = index - seg_begin;
-
-          // Invoke the callback and get the work-item count.
-          tuple<int> cached = load(caching_iterators, seg);
-          work_count = f(dest_seg, index, seg, rank, cached, args...);
-        }
-
-        // Scan the work-counts.
-        scan_result_t<int> work_scan = scan_t().scan(tid, work_count,
-          shared.scan);
-
-        // Stream the segments-descriptor array.
-        if(j < num_items)
-          segments_output[dest_seg] = work_item_dest + work_scan.scan;
-        work_item_dest += work_scan.reduction;
-      }
-    };
-    cta_launch<launch_t>(downsweep_k, num_ctas, context, args...);
-
-    return segments_result;     
-  }
-
-  template<typename func_t, typename... args_t>
-  mem_t<int> downsweep(func_t f, args_t... args) {
-    return downsweep(
-      [=]MGPU_DEVICE(int dest_seg, int index, int seg, int rank, tuple<>,
-        args_t... args) {
-        return f(dest_seg, index, seg, rank, args...);
-      },
-      tuple<>(), args...
-    );
-  }
-};
-
-// Use lbs_workcreate to construct an expt::workcreate_t instance. Then call
-// upsweep and downsweep, providing an appropriate lambda function.
-template<typename launch_arg_t = empty_t, typename segments_it>
-workcreate_t<launch_arg_t, segments_it>
-lbs_workcreate(int count, segments_it segments, int num_segments,
-  context_t& context) {
-  return workcreate_t<launch_arg_t, segments_it> {
-    count, segments, num_segments, context
-  };
-}
-
-} // namespace expt
-
-END_MGPU_NAMESPACE
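
For context on the API removed above: lbs_workcreate drove a two-pass pattern, upsweep to count the work-items each (index, seg, rank) triple creates, then downsweep to stream the new segments-descriptor array. A minimal sketch of that usage, not code from this repository; the segment data and the work rule (two new items per segment head) are made up for illustration, and it assumes the headers this patch deletes:

    #include "kernel_workcreate.hxx"
    #include <vector>

    void workcreate_sketch(mgpu::context_t& context) {
      using namespace mgpu;
      // 100 source work-items spread over 4 segments (hypothetical data).
      mem_t<int> segments = to_mem(std::vector<int>{0, 25, 50, 75}, context);

      auto wc = expt::lbs_workcreate(100, segments.data(), 4, context);

      // Upsweep: count what each (index, seg, rank) will create.
      // total.count is the new work-item count, total.num_segments the
      // number of new segments.
      auto total = wc.upsweep([]MGPU_DEVICE(int index, int seg, int rank) {
        return (0 == rank) ? 2 : 0;   // two new items per segment head
      });

      // Downsweep: must report the same counts; returns the new
      // segments-descriptor array.
      mem_t<int> new_segments = wc.downsweep(
        []MGPU_DEVICE(int dest_seg, int index, int seg, int rank) {
          return 2;
        });
    }
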
diff --git a/src/util/cuda/moderngpu/launch_box.hxx b/src/util/cuda/moderngpu/launch_box.hxx
deleted file mode 100644
index ad449c64..00000000
--- a/src/util/cuda/moderngpu/launch_box.hxx
+++ /dev/null
@@ -1,93 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "context.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-// Specializable launch parameters.
-struct launch_box_default_t {
-  typedef launch_cta_t<0, 0, 0> sm_00;
-  typedef empty_t sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53,
-    sm_60, sm_61, sm_62, sm_70, sm_75;
-
-  template<typename new_base_t>
-  using rebind = launch_box_default_t;
-};
-
-template<typename... params_v>
-struct launch_box_t : inherit_t<params_v..., launch_box_default_t> { 
-  typedef inherit_t<params_v..., launch_box_default_t> base_t; 
-
-  typedef typename conditional_typedef_t<
-    typename base_t::sm_20, typename base_t::sm_00
-  >::type_t sm_20;
-
-#define INHERIT_LAUNCH_PARAMS(new_ver, old_ver) \
-  typedef typename conditional_typedef_t< \
-    typename base_t::sm_##new_ver, sm_##old_ver \
-  >::type_t sm_##new_ver;
-  
-  INHERIT_LAUNCH_PARAMS(21, 20)
-  INHERIT_LAUNCH_PARAMS(30, 21)
-  INHERIT_LAUNCH_PARAMS(32, 30)
-  INHERIT_LAUNCH_PARAMS(35, 30)
-  INHERIT_LAUNCH_PARAMS(37, 35)
-  INHERIT_LAUNCH_PARAMS(50, 35)
-  INHERIT_LAUNCH_PARAMS(52, 50)
-  INHERIT_LAUNCH_PARAMS(53, 50)
-  INHERIT_LAUNCH_PARAMS(60, 53)
-  INHERIT_LAUNCH_PARAMS(61, 60)
-  INHERIT_LAUNCH_PARAMS(62, 60)
-  INHERIT_LAUNCH_PARAMS(70, 62)
-  INHERIT_LAUNCH_PARAMS(75, 70)
-
-  // Overwrite the params defined for sm_00 so that the host-side compiler
-  // has all expected symbols available to it.
-  typedef sm_75 sm_00;
-  typedef MGPU_LAUNCH_PARAMS(launch_box_t) sm_ptx;
-
-  static cta_dim_t cta_dim(int ptx_version) {
-    // Ptx version from cudaFuncGetAttributes.
-    if     (ptx_version == 75) return cta_dim_t { sm_75::nt, sm_75::vt };
-    else if(ptx_version >= 70) return cta_dim_t { sm_70::nt, sm_70::vt };
-    else if(ptx_version == 62) return cta_dim_t { sm_62::nt, sm_62::vt };
-    else if(ptx_version >= 61) return cta_dim_t { sm_61::nt, sm_61::vt };
-    else if(ptx_version >= 60) return cta_dim_t { sm_60::nt, sm_60::vt };
-    else if(ptx_version == 53) return cta_dim_t { sm_53::nt, sm_53::vt };
-    else if(ptx_version >= 52) return cta_dim_t { sm_52::nt, sm_52::vt };
-    else if(ptx_version >= 50) return cta_dim_t { sm_50::nt, sm_50::vt };
-    else if(ptx_version == 37) return cta_dim_t { sm_37::nt, sm_37::vt };
-    else if(ptx_version >= 35) return cta_dim_t { sm_35::nt, sm_35::vt };
-    else if(ptx_version == 32) return cta_dim_t { sm_32::nt, sm_32::vt };
-    else if(ptx_version >= 30) return cta_dim_t { sm_30::nt, sm_30::vt };
-    else if(ptx_version >= 21) return cta_dim_t { sm_21::nt, sm_21::vt };
-    else if(ptx_version >= 20) return cta_dim_t { sm_20::nt, sm_20::vt };
-    else return cta_dim_t { -1, 0 };
-  }
-
-  static cta_dim_t cta_dim(const context_t& context) {
-    return cta_dim(context.ptx_version());
-  }
-
-  static int nv(const context_t& context) {
-    return cta_dim(context.ptx_version()).nv();
-  }
-};
-
-
-template<typename launch_box, typename func_t, typename... args_t>
-int occupancy(func_t f, const context_t& context, args_t... args) {
-  int num_blocks;
-  int nt = launch_box::cta_dim(context).nt;
-  cudaError_t result = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks, 
-    &launch_box_cta_k<launch_box, func_t, args_t...>, 
-    nt,
-    (size_t)0
-  );
-  if(cudaSuccess != result) throw cuda_exception_t(result);
-  return context.props().multiProcessorCount * num_blocks;
-}
-
-END_MGPU_NAMESPACE
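
A launch box was consumed by listing per-architecture (nt, vt) tunings and letting the INHERIT_LAUNCH_PARAMS chain above fill in the gaps. A sketch with made-up tuning values; the typedef and kernel body are illustrative only:

    #include "transform.hxx"

    // sm_20 through sm_32 resolve to 128x7; sm_35 and every later
    // architecture inherit 128x11 through the chain above.
    typedef mgpu::launch_box_t<
      mgpu::arch_20_cta<128, 7>,
      mgpu::arch_35_cta<128, 11>
    > launch_t;

    void launch_box_sketch(int count, mgpu::context_t& context) {
      mgpu::cta_transform<launch_t>([]MGPU_DEVICE(int tid, int cta) {
        // per-CTA work goes here
      }, count, context);
    }
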
diff --git a/src/util/cuda/moderngpu/launch_params.hxx b/src/util/cuda/moderngpu/launch_params.hxx
deleted file mode 100644
index 9dc32b1d..00000000
--- a/src/util/cuda/moderngpu/launch_params.hxx
+++ /dev/null
@@ -1,152 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "meta.hxx"
-#include "tuple.hxx"
-
-#ifdef __CUDA_ARCH__
-#if   __CUDA_ARCH__ == 750
-  #define MGPU_SM_TAG sm_75
-#elif __CUDA_ARCH__ >= 700
-  #define MGPU_SM_TAG sm_70
-#elif __CUDA_ARCH__ == 620
-  #define MGPU_SM_TAG sm_62
-#elif __CUDA_ARCH__ >= 610
-  #define MGPU_SM_TAG sm_61
-#elif __CUDA_ARCH__ >= 600
-  #define MGPU_SM_TAG sm_60
-#elif __CUDA_ARCH__ == 530
-  #define MGPU_SM_TAG sm_53
-#elif __CUDA_ARCH__ >= 520
-  #define MGPU_SM_TAG sm_52
-#elif __CUDA_ARCH__ >= 500
-  #define MGPU_SM_TAG sm_50
-#elif __CUDA_ARCH__ == 370
-  #define MGPU_SM_TAG sm_37
-#elif __CUDA_ARCH__ >= 350
-  #define MGPU_SM_TAG sm_35
-#elif __CUDA_ARCH__ == 320
-  #define MGPU_SM_TAG sm_32
-#elif __CUDA_ARCH__ >= 300
-  #define MGPU_SM_TAG sm_30
-#elif __CUDA_ARCH__ >= 210
-  #define MGPU_SM_TAG sm_21
-#elif __CUDA_ARCH__ >= 200
-  #define MGPU_SM_TAG sm_20
-#else
-  #error "Modern GPU v3 does not support builds for sm_1.x"
-#endif
-#else // __CUDA_ARCH__
-  #define MGPU_SM_TAG sm_00
-#endif
-
-#define MGPU_LAUNCH_PARAMS(launch_box) \
-  typename launch_box::MGPU_SM_TAG
-#define MGPU_LAUNCH_BOUNDS(launch_box) \
-  __launch_bounds__(launch_box::sm_ptx::nt, launch_box::sm_ptx::occ) 
-
-BEGIN_MGPU_NAMESPACE
-
-struct MGPU_ALIGN(8) cta_dim_t {
-  int nt, vt;
-  int nv() const { return nt * vt; }
-  int num_ctas(int count) const {
-    return div_up(count, nv());
-  }
-};
-
-namespace detail {
-
-// Due to a bug in the compiler we need to expand make_restrict() before
-// branching on cta < num_ctas.
-template<typename func_t, typename... args_t>
-MGPU_DEVICE void restrict_forward(func_t f, int tid, int cta, int num_ctas,
-  args_t... args) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
-  if(cta < num_ctas) 
-#endif 
-    f(tid, cta, args...);
-}
-
-}
-
-// Generic thread cta kernel.
-template<typename launch_box, typename func_t, typename... args_t>
-__global__ MGPU_LAUNCH_BOUNDS(launch_box)
-void launch_box_cta_k(func_t f, int num_ctas, args_t... args) {
-  // Taking threadIdx.x modulo nt (a mask by (nt - 1) when nt is a power of
-  // two) may help strength reduction because the compiler now knows the
-  // range of tid: [0, nt).
-  typedef typename launch_box::sm_ptx params_t;
-  int tid = (int)(threadIdx.x % (unsigned)params_t::nt);
-  int cta = blockIdx.x;
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
-  cta += gridDim.x * blockIdx.y;
-#endif
-
-  detail::restrict_forward(f, tid, cta, num_ctas, make_restrict(args)...);
-}
-
-// Dummy kernel for retrieving PTX version.
-template<int dummy_arg>
-__global__ void dummy_k() { }
-
-template<int nt_, int vt_ = 1, int vt0_ = vt_, int occ_= 0>
-struct launch_cta_t {
-  enum { nt = nt_, vt = vt_, vt0 = vt0_, occ = occ_ };
-};
-
-#define DEF_ARCH_STRUCT(ver)                                                  \
-  template<typename params_t, typename base_t = empty_t>                      \
-  struct arch_##ver : base_t {                                                \
-    typedef params_t sm_##ver;                                                \
-                                                                              \
-    template<typename new_base_t>                                             \
-    using rebind = arch_##ver<params_t, new_base_t>;                          \
-  };                                                                          \
-                                                                              \
-  template<int nt, int vt = 1, int vt0 = vt, int occ = 0>                     \
-  using arch_##ver##_cta = arch_##ver<launch_cta_t<nt, vt, vt0, occ> >;
-
-DEF_ARCH_STRUCT(20)
-DEF_ARCH_STRUCT(21)
-DEF_ARCH_STRUCT(30)
-DEF_ARCH_STRUCT(32)
-DEF_ARCH_STRUCT(35)
-DEF_ARCH_STRUCT(37)
-DEF_ARCH_STRUCT(50)
-DEF_ARCH_STRUCT(52)
-DEF_ARCH_STRUCT(53)
-DEF_ARCH_STRUCT(60)
-DEF_ARCH_STRUCT(61)
-DEF_ARCH_STRUCT(62)
-DEF_ARCH_STRUCT(70)
-DEF_ARCH_STRUCT(75)
-
-#undef DEF_ARCH_STRUCT
-
-struct context_t;
-
-// Non-specializable launch parameters.
-template<int nt, int vt, int vt0 = vt, int occ = 0>
-struct launch_params_t : launch_cta_t<nt, vt, vt0, occ> {
-  typedef launch_params_t sm_ptx;
-
-  static cta_dim_t cta_dim() {
-    return cta_dim_t { nt, vt };
-  }
-
-  static cta_dim_t cta_dim(int) {
-    return cta_dim();
-  }
-
-  static cta_dim_t cta_dim(const context_t& context) {
-    return cta_dim();
-  }
-
-  static int nv(const context_t& context) {
-    return cta_dim().nv();
-  }
-};
-
-END_MGPU_NAMESPACE
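
The sm_ptx typedef is what device code branched on: launch_box_t resolves it through MGPU_SM_TAG to the parameters compiled for the active __CUDA_ARCH__, while the non-specializable launch_params_t is its own sm_ptx. A sketch of the latter, with hypothetical names:

    #include "transform.hxx"

    typedef mgpu::launch_params_t<256, 8> launch_t;

    void launch_params_sketch(int num_ctas, mgpu::context_t& context) {
      mgpu::cta_launch<launch_t>([]MGPU_DEVICE(int tid, int cta) {
        // Same (nt, vt) on every architecture for launch_params_t.
        enum { nt = launch_t::sm_ptx::nt, vt = launch_t::sm_ptx::vt };
        (void)nt; (void)vt;
      }, num_ctas, context);
    }
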
diff --git a/src/util/cuda/moderngpu/loadstore.hxx b/src/util/cuda/moderngpu/loadstore.hxx
deleted file mode 100644
index 836c2a34..00000000
--- a/src/util/cuda/moderngpu/loadstore.hxx
+++ /dev/null
@@ -1,188 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "types.hxx"
-#include "intrinsics.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-////////////////////////////////////////////////////////////////////////////////
-// reg<->shared
-
-template<int nt, int vt, typename type_t, int shared_size>
-MGPU_DEVICE void reg_to_shared_thread(array_t<type_t, vt> x, int tid,
-  type_t (&shared)[shared_size], bool sync = true) {
-
-  static_assert(shared_size >= nt * vt,
-    "reg_to_shared_thread must have at least nt * vt storage");
-
-  thread_iterate<vt>([&](int i, int j) { 
-    shared[j] = x[i]; 
-  }, tid);
-  if(sync) __syncthreads();
-}
-
-template<int nt, int vt, typename type_t, int shared_size>
-MGPU_DEVICE array_t<type_t, vt> shared_to_reg_thread(
-  const type_t (&shared)[shared_size], int tid, bool sync = true) {
-
-  static_assert(shared_size >= nt * vt,
-    "shared_to_reg_thread must have at least nt * vt storage");
-
-  array_t<type_t, vt> x;
-  thread_iterate<vt>([&](int i, int j) { 
-    x[i] = shared[j];
-  }, tid);
-  if(sync) __syncthreads();
-  return x;
-}
-
-template<int nt, int vt, typename type_t, int shared_size>
-MGPU_DEVICE void reg_to_shared_strided(array_t<type_t, vt> x, int tid,
-  type_t (&shared)[shared_size], bool sync = true) {
-
-  static_assert(shared_size >= nt * vt,
-    "reg_to_shared_strided must have at least nt * vt storage");
-
-  strided_iterate<nt, vt>([&](int i, int j) { shared[j] = x[i]; }, tid);
-  if(sync) __syncthreads();
-}
-
-template<int nt, int vt, typename type_t, int shared_size>
-MGPU_DEVICE array_t<type_t, vt> shared_to_reg_strided(
-  const type_t (&shared)[shared_size], int tid, bool sync = true) {
-
-  static_assert(shared_size >= nt * vt,
-    "shared_to_reg_strided must have at least nt * vt storage");
-
-  array_t<type_t, vt> x;
-  strided_iterate<nt, vt>([&](int i, int j) { x[i] = shared[j]; }, tid);
-  if(sync) __syncthreads();
-  return x;
-}
-
-template<int nt, int vt, typename type_t, int shared_size>
-MGPU_DEVICE array_t<type_t, vt> shared_gather(const type_t(&data)[shared_size],
-  array_t<int, vt> indices, bool sync = true) {
-
-  static_assert(shared_size >= nt * vt,
-    "shared_gather must have at least nt * vt storage");
-
-  array_t<type_t, vt> x;
-  iterate<vt>([&](int i) { x[i] = data[indices[i]]; });
-  if(sync) __syncthreads();
-  return x;
-}
-
-template<int nt, int vt, typename type_t, int shared_size>
-MGPU_DEVICE array_t<type_t, vt> thread_to_strided(array_t<type_t, vt> x, 
-  int tid, type_t (&shared)[shared_size]) {
-
-  reg_to_shared_thread<nt, vt>(x, tid, shared);
-  return shared_to_reg_strided<nt, vt>(shared, tid);
-}
-
-
-
-////////////////////////////////////////////////////////////////////////////////
-// reg<->memory
-
-template<int nt, int vt, int vt0 = vt, typename type_t, typename it_t>
-MGPU_DEVICE void reg_to_mem_strided(array_t<type_t, vt> x, int tid, 
-  int count, it_t mem) {
-
-  strided_iterate<nt, vt, vt0>([=](int i, int j) { 
-    mem[j] = x[i]; 
-  }, tid, count);
-}
-
-template<int nt, int vt, int vt0 = vt, typename it_t>
-MGPU_DEVICE array_t<typename std::iterator_traits<it_t>::value_type, vt> 
-mem_to_reg_strided(it_t mem, int tid, int count) {
-  typedef typename std::iterator_traits<it_t>::value_type type_t;
-  array_t<type_t, vt> x;
-  strided_iterate<nt, vt, vt0>([&](int i, int j) { 
-    x[i] = mem[j]; 
-  }, tid, count);
-  return x;
-}
-
-template<int nt, int vt, int vt0 = vt, typename type_t, typename it_t, 
-  int shared_size>
-MGPU_DEVICE void reg_to_mem_thread(array_t<type_t, vt> x, int tid,
-  int count, it_t mem, type_t (&shared)[shared_size]) {
-
-  reg_to_shared_thread<nt>(x, tid, shared);
-  array_t<type_t, vt> y = shared_to_reg_strided<nt, vt>(shared, tid);
-  reg_to_mem_strided<nt, vt, vt0>(y, tid, count, mem);
-}
-
-template<int nt, int vt, int vt0 = vt, typename type_t, typename it_t, 
-  int shared_size>
-MGPU_DEVICE array_t<type_t, vt> mem_to_reg_thread(it_t mem, int tid,
-  int count, type_t (&shared)[shared_size]) {
-
-  array_t<type_t, vt> x = mem_to_reg_strided<nt, vt, vt0>(mem, tid, count);
-  reg_to_shared_strided<nt, vt>(x, tid, shared);
-  array_t<type_t, vt> y = shared_to_reg_thread<nt, vt>(shared, tid);
-  return y;
-}
-
-template<int nt, int vt, int vt0 = vt, typename input_it, typename output_it>
-MGPU_DEVICE void mem_to_mem(input_it input, int tid, int count,
-  output_it output) {
-  typedef typename std::iterator_traits<input_it>::value_type type_t;
-  type_t x[vt];
-
-  strided_iterate<nt, vt, vt0>([&](int i, int j) {
-    x[i] = input[j];
-  }, tid, count);
-  strided_iterate<nt, vt, vt0>([&](int i, int j) {
-    output[j] = x[i];
-  }, tid, count);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// memory<->memory
-
-template<int nt, int vt, int vt0 = vt, typename type_t, typename it_t>
-MGPU_DEVICE void mem_to_shared(it_t mem, int tid, int count, type_t* shared, 
-  bool sync = true) {
-
-  array_t<type_t, vt> x = mem_to_reg_strided<nt, vt, vt0>(mem, tid, count);
-  strided_iterate<nt, vt, vt0>([&](int i, int j) {
-    shared[j] = x[i];
-  }, tid, count);
-  if(sync) __syncthreads();
-}
-
-template<int nt, int vt, typename type_t, typename it_t>
-MGPU_DEVICE void shared_to_mem(const type_t* shared, int tid, int count,
-  it_t mem, bool sync = true) {
-
-  strided_iterate<nt, vt>([&](int i, int j) { 
-    mem[j] = shared[j]; 
-  }, tid, count);
-  if(sync) __syncthreads();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// reg<->reg
-
-template<int nt, int vt, typename type_t, int shared_size>
-MGPU_DEVICE array_t<type_t, vt> reg_thread_to_strided(array_t<type_t, vt> x,
-  int tid, type_t (&shared)[shared_size]) {
-
-  reg_to_shared_thread<nt>(x, tid, shared);
-  return shared_to_reg_strided<nt, vt>(shared, tid);
-}
-
-template<int nt, int vt, typename type_t, int shared_size>
-MGPU_DEVICE array_t<type_t, vt> reg_strided_to_thread(array_t<type_t, vt> x,
-  int tid, type_t (&shared)[shared_size]) {
-
-  reg_to_shared_strided<nt>(x, tid, shared);
-  return shared_to_reg_thread<nt, vt>(shared, tid);
-}
-
-END_MGPU_NAMESPACE
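
The helpers above implement the standard moderngpu register round trip: strided loads for coalescing, a shared-memory transpose into thread order, work on vt consecutive values per thread, then the reverse transpose on the way out. A device-side sketch, assuming array_t from types.hxx:

    #include "loadstore.hxx"

    template<int nt, int vt, typename type_t>
    MGPU_DEVICE void tile_roundtrip_sketch(const type_t* input,
      type_t* output, int tid, int count) {

      __shared__ type_t shared[nt * vt];

      // Coalesced strided load, then transpose to thread order.
      mgpu::array_t<type_t, vt> x =
        mgpu::mem_to_reg_thread<nt, vt>(input, tid, count, shared);

      // ... operate on the vt consecutive values this thread owns ...

      // Transpose back to strided order and store with coalescing.
      mgpu::reg_to_mem_thread<nt, vt>(x, tid, count, output, shared);
    }
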
diff --git a/src/util/cuda/moderngpu/memory.hxx b/src/util/cuda/moderngpu/memory.hxx
deleted file mode 100644
index ecde69e9..00000000
--- a/src/util/cuda/moderngpu/memory.hxx
+++ /dev/null
@@ -1,131 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "transform.hxx"
-#include "context.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-////////////////////////////////////////////////////////////////////////////////
-// Memory functions on raw pointers.
-
-template<typename type_t>
-cudaError_t htoh(type_t* dest, const type_t* source, size_t count) {
-  if(count) 
-    memcpy(dest, source, sizeof(type_t) * count);
-  return cudaSuccess;
-}
-
-template<typename type_t>
-cudaError_t dtoh(type_t* dest, const type_t* source, size_t count) {
-  cudaError_t result = count ? 
-    cudaMemcpy(dest, source, sizeof(type_t) * count,
-      cudaMemcpyDeviceToHost) :
-    cudaSuccess;
-  return result;
-}
-
-template<typename type_t>
-cudaError_t htod(type_t* dest, const type_t* source, size_t count) {
-  cudaError_t result = count ?
-    cudaMemcpy(dest, source, sizeof(type_t) * count,
-      cudaMemcpyHostToDevice) :
-    cudaSuccess;
-  return result;
-}
-
-template<typename type_t>
-cudaError_t dtod(type_t* dest, const type_t* source, size_t count) {
-  cudaError_t result = count ?
-    cudaMemcpy(dest, source, sizeof(type_t) * count,
-      cudaMemcpyDeviceToDevice) :
-    cudaSuccess;
-  return result;
-}
-
-template<typename type_t>
-cudaError_t dtoh(std::vector<type_t>& dest, const type_t* source, 
-  size_t count) {
-  dest.resize(count);
-  return dtoh(dest.data(), source, count);
-}
-
-template<typename type_t>
-cudaError_t htod(type_t* dest, const std::vector<type_t>& source) {
-  return htod(dest, source.data(), source.size());
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Memory functions on mem_t.
-
-template<typename type_t>
-mem_t<type_t> to_mem(const std::vector<type_t>& data, context_t& context) {
-  mem_t<type_t> mem(data.size(), context);
-  cudaError_t result = htod(mem.data(), data);
-  if(cudaSuccess != result) throw cuda_exception_t(result);
-  return mem;
-}
-
-template<typename type_t>
-std::vector<type_t> from_mem(const mem_t<type_t>& mem) {
-  std::vector<type_t> host;
-  cudaError_t result = dtoh(host, mem.data(), mem.size());
-  if(cudaSuccess != result) throw cuda_exception_t(result);
-  return host;
-}
-
-template<typename type_t, typename func_t>
-mem_t<type_t> fill_function(func_t f, size_t count, context_t& context) {
-  mem_t<type_t> mem(count, context);
-  type_t* p = mem.data();
-  transform([=]MGPU_DEVICE(int index) {
-    p[index] = f(index);
-  }, count, context);
-  return mem;
-}
-
-template<typename type_t>
-mem_t<type_t> fill(type_t value, size_t count, context_t& context) {
-  // We'd prefer to call fill_function and pass a lambda that returns value,
-  // but that can create tokens that are too long for VS2013.
-  mem_t<type_t> mem(count, context);
-  type_t* p = mem.data();
-  transform([=]MGPU_DEVICE(int index) {
-    p[index] = value;
-  }, count, context);
-  return mem;
-}
-
-template<typename it_t>
-auto copy_to_mem(it_t input, size_t count, context_t& context) -> 
-  mem_t<typename std::iterator_traits<it_t>::value_type> {
-  
-  typedef typename std::iterator_traits<it_t>::value_type type_t;
-  mem_t<type_t> mem(count, context);
-  type_t* p = mem.data();
-  transform([=]MGPU_DEVICE(int index) {
-    p[index] = input[index];
-  }, count, context);
-  return mem;
-}
-
-inline std::mt19937& get_mt19937() {
-  static std::mt19937 mt19937;
-  return mt19937;
-}
-
-inline mem_t<int> fill_random(int a, int b, size_t count, bool sorted, 
-  context_t& context) {
-
-  std::uniform_int_distribution<int> d(a, b);
-  std::vector<int> data(count);
-
-  for(int& i : data)
-    i = d(get_mt19937());
-  if(sorted) 
-    std::sort(data.begin(), data.end());
-
-  return to_mem(data, context);
-}
-
-END_MGPU_NAMESPACE
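
These wrappers covered the common host/device round trip. A sketch, function name hypothetical:

    #include "memory.hxx"
    #include <vector>

    void memory_sketch(mgpu::context_t& context) {
      // Host -> device: 1000 random ints in [0, 99].
      mgpu::mem_t<int> data = mgpu::fill_random(0, 99, 1000, false, context);

      // Fill device memory through a device lambda.
      mgpu::mem_t<int> squares = mgpu::fill_function<int>(
        []MGPU_DEVICE(int index) { return index * index; }, 1000, context);

      // Device -> host.
      std::vector<int> host = mgpu::from_mem(squares);
    }
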
diff --git a/src/util/cuda/moderngpu/meta.hxx b/src/util/cuda/moderngpu/meta.hxx
deleted file mode 100644
index 369c303e..00000000
--- a/src/util/cuda/moderngpu/meta.hxx
+++ /dev/null
@@ -1,249 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include <typeinfo>
-#include <type_traits>
-#include <iterator>
-#include <cassert>
-#include <cfloat>
-#include <cstdint>
-
-#ifdef __CUDACC__
-
-#ifndef MGPU_HOST_DEVICE
-  #define MGPU_HOST_DEVICE __forceinline__ __device__ __host__
-#endif
-
-#ifndef MGPU_DEVICE
-  #define MGPU_DEVICE __device__
-#endif
-
-// Currently NVCC does not support __device__ __host__ tags on lambdas that
-// are captured on the host and executed on the device. There is no good reason
-// for this, as you can __device__ __host__ tag functor operators and use
-// them in the same way. So for now, tag your functors with MGPU_LAMBDA. This
-// means they are only supported in device code, but when a future version of
-// CUDA lifts this restriction MGPU_LAMBDA will be redefined to __device__
-// __host__.
-#ifndef MGPU_LAMBDA
-  #define MGPU_LAMBDA __device__
-#endif
-
-#else // #ifndef __CUDACC__
-
-#define MGPU_HOST_DEVICE
-
-#endif // #ifdef __CUDACC__
-
-#ifndef PRAGMA_UNROLL
-#if defined(__CUDA_ARCH__) && !defined(__clang__)
-  #define PRAGMA_UNROLL _Pragma("unroll")
-#else
-  #define PRAGMA_UNROLL
-#endif
-#endif
-
-#define BEGIN_MGPU_NAMESPACE namespace mgpu {
-#define END_MGPU_NAMESPACE }
-
-BEGIN_MGPU_NAMESPACE
-
-template< bool B, class T = void >
-using enable_if_t = typename std::enable_if<B,T>::type;
-
-enum { warp_size = 32 };
-
-#if defined(_MSC_VER) && _MSC_VER <= 1800      // VS 2013 is terrible.
-
-#define is_pow2(x) (0 == ((x) & ((x) - 1)))
-#define div_up(x, y) (((x) + (y) - 1) / (y))
-
-namespace details {
-template<int i, bool recurse = (i > 1)>
-struct s_log2_t {
-  enum { value = s_log2_t<i / 2>::value + 1 };
-};
-template<int i> struct s_log2_t<i, false> {
-  enum { value = 0 };
-};
-} // namespace details
-
-#define s_log2(x) details::s_log2_t<x>::value
-
-#else
-
-MGPU_HOST_DEVICE constexpr bool is_pow2(int x) {
-  return 0 == (x & (x - 1));
-}
-MGPU_HOST_DEVICE constexpr int div_up(int x, int y) {
-  return (x + y - 1) / y;
-}
-MGPU_HOST_DEVICE constexpr int64_t div_up(int64_t x, int64_t y) {
-  return (x + y - 1) / y;
-}
-MGPU_HOST_DEVICE constexpr size_t div_up(size_t x, size_t y) {
-  return (x + y - 1) / y;
-}
-MGPU_HOST_DEVICE constexpr int s_log2(int x, int p = 0) {
-  return x > 1 ? s_log2(x / 2) + 1 : p;
-}
-MGPU_HOST_DEVICE constexpr size_t s_log2(size_t x, size_t p = 0) {
-  return x > 1 ? s_log2(x / 2) + 1 : p;
-}
-
-#endif
-
-#ifdef _MSC_VER
-  #define MGPU_ALIGN(x) __declspec(align(x))
-#else
-  #define MGPU_ALIGN(x) __attribute__((aligned(x)))
-#endif
-
-// Apparently not defined by CUDA.
-template<typename real_t>
-MGPU_HOST_DEVICE constexpr real_t min(real_t a, real_t b) {
-  return (b < a) ? b : a;
-}
-template<typename real_t>
-MGPU_HOST_DEVICE constexpr real_t max(real_t a, real_t b) {
-  return (a < b) ? b : a;
-}
-
-struct empty_t { };
-
-template<typename... args_t>
-MGPU_HOST_DEVICE void swallow(args_t...) { }
-
-template<typename... base_v>
-struct inherit_t;
-
-template<typename base_t, typename... base_v>
-struct inherit_t<base_t, base_v...> : 
-  base_t::template rebind<inherit_t<base_v...> > { };
-
-template<typename base_t>
-struct inherit_t<base_t> : base_t { };
-
-////////////////////////////////////////////////////////////////////////////////
-// Conditional typedefs. 
-
-// Typedef type_a if type_a is not empty_t.
-// Otherwise typedef type_b.
-template<typename type_a, typename type_b>
-struct conditional_typedef_t {
-  typedef typename std::conditional<
-    !std::is_same<type_a, empty_t>::value, 
-    type_a, 
-    type_b
-  >::type type_t;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// Code to treat __restrict__ as a CV qualifier.
-
-template<typename arg_t>
-struct is_restrict {
-  enum { value = false };
-};
-template<typename arg_t>
-struct is_restrict<arg_t __restrict__> {
-  enum { value = true };
-};
-
-// Add __restrict__ only to pointers.
-template<typename arg_t>
-struct add_restrict {
-  typedef arg_t type;
-};
-template<typename arg_t>
-struct add_restrict<arg_t*> {
-  typedef arg_t* __restrict__ type;
-};
-
-template<typename arg_t>
-struct remove_restrict {
-  typedef arg_t type;
-};
-template<typename arg_t>
-struct remove_restrict<arg_t __restrict__> {
-  typedef arg_t type;
-};
-
-template<typename arg_t>
-MGPU_HOST_DEVICE typename add_restrict<arg_t>::type make_restrict(arg_t x) {
-  typename add_restrict<arg_t>::type y = x;
-  return y;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Template unrolled looping construct.
-
-template<int i, int count, bool valid = (i < count)>
-struct iterate_t {
-  #pragma nv_exec_check_disable
-  template<typename func_t>
-  MGPU_HOST_DEVICE static void eval(func_t f) {
-    f(i);
-    iterate_t<i + 1, count>::eval(f);
-  }
-};
-template<int i, int count>
-struct iterate_t<i, count, false> {
-  template<typename func_t>
-  MGPU_HOST_DEVICE static void eval(func_t f) { }
-};
-template<int begin, int end, typename func_t>
-MGPU_HOST_DEVICE void iterate(func_t f) {
-  iterate_t<begin, end>::eval(f);
-}
-template<int count, typename func_t>
-MGPU_HOST_DEVICE void iterate(func_t f) {
-  iterate<0, count>(f);
-}
-
-template<int count, typename type_t>
-MGPU_HOST_DEVICE type_t reduce(const type_t(&x)[count]) {
-  type_t y;
-  iterate<count>([&](int i) { y = i ? x[i] + y : x[i]; });
-  return y;
-}
-
-template<int count, typename type_t>
-MGPU_HOST_DEVICE void fill(type_t(&x)[count], type_t val) {
-  iterate<count>([&](int i) { x[i] = val; });
-}
-
-#ifdef __CUDACC__
-
-// Invoke unconditionally.
-template<int nt, int vt, typename func_t>
-MGPU_DEVICE void strided_iterate(func_t f, int tid) {
-  iterate<vt>([=](int i) { f(i, nt * i + tid); });
-}
-
-// Check range.
-template<int nt, int vt, int vt0 = vt, typename func_t>
-MGPU_DEVICE void strided_iterate(func_t f, int tid, int count) {
-  // Unroll the first vt0 elements of each thread.
-  if(vt0 > 1 && count >= nt * vt0) {
-    strided_iterate<nt, vt0>(f, tid);    // No checking
-  } else {
-    iterate<vt0>([=](int i) {
-      int j = nt * i + tid;
-      if(j < count) f(i, j);
-    });
-  }
-
-  iterate<vt0, vt>([=](int i) {
-    int j = nt * i + tid;
-    if(j < count) f(i, j);
-  });
-}
-template<int vt, typename func_t>
-MGPU_DEVICE void thread_iterate(func_t f, int tid) {
-  iterate<vt>([=](int i) { f(i, vt * tid + i); });
-}
-
-#endif // ifdef __CUDACC__
-
-END_MGPU_NAMESPACE
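
strided_iterate and thread_iterate generate the two index patterns everything in the library is built from. A sketch of the difference for nt = 4 and vt = 2, with a hypothetical shared buffer:

    #include "meta.hxx"

    // strided_iterate<4, 2>(f, tid) visits j = tid and j = tid + 4:
    // thread 0 touches items 0 and 4, so the CTA's accesses coalesce.
    // thread_iterate<2>(f, tid) visits j = 2 * tid and j = 2 * tid + 1:
    // thread 0 touches items 0 and 1, consecutive values per thread.
    MGPU_DEVICE void iterate_sketch(int tid, int* shared) {
      mgpu::strided_iterate<4, 2>([&](int i, int j) { shared[j] = j; }, tid);
      __syncthreads();
      mgpu::thread_iterate<2>([&](int i, int j) { shared[j] += 1; }, tid);
    }
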
diff --git a/src/util/cuda/moderngpu/operators.hxx b/src/util/cuda/moderngpu/operators.hxx
deleted file mode 100644
index 2178ce75..00000000
--- a/src/util/cuda/moderngpu/operators.hxx
+++ /dev/null
@@ -1,347 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-#include "meta.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-namespace detail {
-
-template<typename it_t, 
-  typename type_t = typename std::iterator_traits<it_t>::value_type, 
-  bool use_ldg = 
-    std::is_pointer<it_t>::value && 
-    std::is_arithmetic<type_t>::value
->
-struct ldg_load_t {
-  MGPU_HOST_DEVICE static type_t load(it_t it) {
-    return *it;
-  }
-};
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-
-template<typename it_t, typename type_t>
-struct ldg_load_t<it_t, type_t, true> {
-  MGPU_HOST_DEVICE static type_t load(it_t it) {
-    return __ldg(it);
-  }
-};
-
-#endif
-
-} // namespace detail
-
-template<typename it_t>
-MGPU_HOST_DEVICE typename std::iterator_traits<it_t>::value_type
-ldg(it_t it) {
-  return detail::ldg_load_t<it_t>::load(it);
-}
-
-template<typename real_t>
-MGPU_HOST_DEVICE real_t sq(real_t x) { return x * x; }
-
-template<typename type_t>
-MGPU_HOST_DEVICE void swap(type_t& a, type_t& b) {
-  type_t c = a; a = b; b = c;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Device-side comparison operators.
-
-template<typename type_t>
-struct less_t : public std::binary_function<type_t, type_t, bool> {
-  MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
-    return a < b;
-  }
-};
-template<typename type_t>
-struct less_equal_t : public std::binary_function<type_t, type_t, bool> {
-  MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
-    return a <= b;
-  }
-};
-template<typename type_t>
-struct greater_t : public std::binary_function<type_t, type_t, bool> {
-  MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
-    return a > b;
-  }
-};
-template<typename type_t>
-struct greater_equal_t : public std::binary_function<type_t, type_t, bool> {
-  MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
-    return a >= b;
-  }
-};
-template<typename type_t>
-struct equal_to_t : public std::binary_function<type_t, type_t, bool> {
-  MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
-    return a == b;
-  }
-};
-template<typename type_t>
-struct not_equal_to_t : public std::binary_function<type_t, type_t, bool> {
-  MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
-    return a != b;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// Device-side arithmetic operators.
-
-template<typename type_t>
-struct plus_t : public std::binary_function<type_t, type_t, type_t> {
-  MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
-    return a + b;
-  }
-};
-
-template<typename type_t>
-struct minus_t : public std::binary_function<type_t, type_t, type_t> {
-  MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
-    return a - b;
-  }
-};
-
-template<typename type_t>
-struct multiplies_t : public std::binary_function<type_t, type_t, type_t> {
-  MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
-    return a * b;
-  }
-};
-
-template<typename type_t>
-struct maximum_t  : public std::binary_function<type_t, type_t, type_t> {
-  MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
-    return max(a, b);
-  }
-};
-
-template<typename type_t>
-struct minimum_t  : public std::binary_function<type_t, type_t, type_t> {
-  MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
-    return min(a, b);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// iterator_t and const_iterator_t are base classes for customized iterators.
-
-template<typename outer_t, typename int_t, typename value_type>
-struct iterator_t : public std::iterator_traits<const value_type*> {
-
-  iterator_t() = default;
-  MGPU_HOST_DEVICE iterator_t(int_t i) : index(i) { }
-
-  MGPU_HOST_DEVICE outer_t operator+(int_t diff) const {
-    outer_t next = *static_cast<const outer_t*>(this);
-    next += diff;
-    return next;
-  }
-  MGPU_HOST_DEVICE outer_t operator-(int_t diff) const {
-    outer_t next = *static_cast<const outer_t*>(this);
-    next -= diff;
-    return next;
-  }
-  MGPU_HOST_DEVICE outer_t& operator+=(int_t diff) {
-    index += diff;
-    return *static_cast<outer_t*>(this);
-  }
-  MGPU_HOST_DEVICE outer_t& operator-=(int_t diff) {
-    index -= diff;
-    return *static_cast<outer_t*>(this);
-  }
-
-  int_t index;
-};
-
-template<typename outer_t, typename int_t, typename value_type>
-struct const_iterator_t : public iterator_t<outer_t, int_t, value_type> {
-  typedef iterator_t<outer_t, int_t, value_type> base_t;
-
-  const_iterator_t() = default;
-  MGPU_HOST_DEVICE const_iterator_t(int_t i) : base_t(i) { }
-
-  // operator[] and operator* are tagged MGPU_HOST_DEVICE. They were
-  // originally tagged as DEVICE-ONLY to ensure compatibility with lambda
-  // capture in CUDA 7.5, which does not support marking a lambda as
-  // __host__ __device__.
-  MGPU_HOST_DEVICE value_type operator[](int_t diff) const {
-    return static_cast<const outer_t&>(*this)(base_t::index + diff);
-  }
-  MGPU_HOST_DEVICE value_type operator*() const {
-    return (*this)[0];
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// discard_iterator_t is a store iterator that discards its input.
-
-template<typename value_type> 
-struct discard_iterator_t : 
-  iterator_t<discard_iterator_t<value_type>, int, value_type> {
-
-  struct assign_t {
-    MGPU_HOST_DEVICE value_type operator=(value_type v) { 
-      return value_type(); 
-    }
-  };
-
-  MGPU_HOST_DEVICE assign_t operator[](int index) const { 
-    return assign_t(); 
-  }
-  MGPU_HOST_DEVICE assign_t operator*() const { return assign_t(); }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// counting_iterator_t returns index.
-
-template<typename type_t, typename int_t = int>
-struct counting_iterator_t :
-  const_iterator_t<counting_iterator_t<type_t>, int_t, type_t> {
-
-  counting_iterator_t() = default;
-  MGPU_HOST_DEVICE counting_iterator_t(type_t i) : 
-    const_iterator_t<counting_iterator_t, int_t, type_t>(i) { }
-
-  MGPU_HOST_DEVICE type_t operator()(int_t index) const {
-    return (type_t)index;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// strided_iterator_t returns offset + index * stride.
-
-template<typename type_t, typename int_t = int>
-struct strided_iterator_t :
-  const_iterator_t<strided_iterator_t<type_t, int_t>, int_t, type_t> {
-
-  strided_iterator_t() = default;
-  MGPU_HOST_DEVICE strided_iterator_t(type_t offset_, type_t stride_) : 
-    const_iterator_t<strided_iterator_t, int_t, type_t>(0), 
-    offset(offset_), stride(stride_) { }
-
-  MGPU_HOST_DEVICE type_t operator()(int_t index) const {
-    return offset + index * stride;
-  }
-
-  type_t offset, stride;
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// constant_iterator_t returns the value it was initialized with.
-
-template<typename type_t>
-struct constant_iterator_t : 
-  const_iterator_t<constant_iterator_t<type_t>, int, type_t> {
-
-  type_t value;
-
-  MGPU_HOST_DEVICE constant_iterator_t(type_t value_) : value(value_) { }
-
-  MGPU_HOST_DEVICE type_t operator()(int index) const {
-    return value;
-  }
-};
-
-// These types only supported with nvcc until CUDA 8.0 allows host-device
-// lambdas and MGPU_LAMBDA is redefined to MGPU_HOST_DEVICE
-
-#ifdef __CUDACC__
-
-////////////////////////////////////////////////////////////////////////////////
-// lambda_iterator_t
-
-template<typename load_t, typename store_t, typename value_type, typename int_t>
-struct lambda_iterator_t : std::iterator_traits<const value_type*> {
-
-  load_t load;
-  store_t store;
-  int_t base;
-
-  lambda_iterator_t(load_t load_, store_t store_, int_t base_) :
-    load(load_), store(store_), base(base_) { }
-
-  struct assign_t {
-    load_t load;
-    store_t store;
-    int_t index;
-
-    MGPU_LAMBDA assign_t& operator=(value_type rhs) {
-      static_assert(!std::is_same<store_t, empty_t>::value, 
-        "load_iterator is being stored to.");
-      store(rhs, index);
-      return *this;
-    }
-    MGPU_LAMBDA operator value_type() const {
-      static_assert(!std::is_same<load_t, empty_t>::value,
-        "store_iterator is being loaded from.");
-      return load(index);
-    }
-  };
-
-  MGPU_LAMBDA assign_t operator[](int_t index) const {
-    return assign_t { load, store, base + index };
-  } 
-  MGPU_LAMBDA assign_t operator*() const {
-    return assign_t { load, store, base };
-  }
-
-  MGPU_HOST_DEVICE lambda_iterator_t operator+(int_t offset) const {
-    lambda_iterator_t cp = *this;
-    cp += offset;
-    return cp;
-  }
-
-  MGPU_HOST_DEVICE lambda_iterator_t& operator+=(int_t offset) {
-    base += offset;
-    return *this;
-  }
-
-  MGPU_HOST_DEVICE lambda_iterator_t operator-(int_t offset) const {
-    lambda_iterator_t cp = *this;
-    cp -= offset;
-    return cp;
-  }
-
-  MGPU_HOST_DEVICE lambda_iterator_t& operator-=(int_t offset) {
-    base -= offset;
-    return *this;
-  }
-};
-
-template<typename value_type>
-struct trivial_load_functor {
-  template<typename int_t>
-  MGPU_HOST_DEVICE value_type operator()(int_t index) const {
-    return value_type();
-  }
-};
-
-template<typename value_type>
-struct trivial_store_functor {
-  template<typename int_t>
-  MGPU_HOST_DEVICE void operator()(value_type v, int_t index) const { }
-};
-
-template<typename value_type, typename int_t = int, typename load_t, 
-  typename store_t>
-lambda_iterator_t<load_t, store_t, value_type, int_t> 
-  make_load_store_iterator(load_t load, store_t store, int_t base = 0) {
-  return lambda_iterator_t<load_t, store_t, value_type, int_t>(load, store, base);
-}
-
-template<typename value_type, typename int_t = int, typename load_t>
-lambda_iterator_t<load_t, empty_t, value_type, int_t>
-make_load_iterator(load_t load, int_t base = 0) {
-  return make_load_store_iterator<value_type>(load, empty_t(), base);
-}
-
-template<typename value_type, typename int_t = int, typename store_t>
-lambda_iterator_t<empty_t, store_t, value_type, int_t>
-make_store_iterator(store_t store, int_t base = 0) {
-  return make_load_store_iterator<value_type>(empty_t(), store, base);
-}
-
-#endif // #ifdef __CUDACC__
-
-END_MGPU_NAMESPACE
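
The fancy iterators above synthesize values on demand instead of reading memory; indexing them is a function call. A device-side sketch of the indexing behavior:

    #include "operators.hxx"

    MGPU_DEVICE int iterator_sketch() {
      mgpu::counting_iterator_t<int> count(5);      // count[i] == 5 + i
      mgpu::strided_iterator_t<int> strided(10, 3); // strided[i] == 10 + 3*i
      mgpu::constant_iterator_t<int> constant(7);   // constant[i] == 7
      return count[2] + strided[2] + constant[2];   // 7 + 16 + 7 == 30
    }
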
diff --git a/src/util/cuda/moderngpu/search.hxx b/src/util/cuda/moderngpu/search.hxx
deleted file mode 100644
index 2d8b6b40..00000000
--- a/src/util/cuda/moderngpu/search.hxx
+++ /dev/null
@@ -1,53 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "loadstore.hxx"
-#include "operators.hxx"
-#include "cta_search.hxx"
-#include "memory.hxx"
-#include "context.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-template<bounds_t bounds, typename a_keys_it, typename b_keys_it,
-  typename comp_t>
-mem_t<int> merge_path_partitions(a_keys_it a, int64_t a_count, b_keys_it b,
-  int64_t b_count, int64_t spacing, comp_t comp, context_t& context) {
-
-  typedef int int_t;
-  int num_partitions = (int)div_up(a_count + b_count, spacing) + 1;
-  mem_t<int_t> mem(num_partitions, context);
-  int_t* p = mem.data();
-  transform([=]MGPU_DEVICE(int index) {
-    int_t diag = (int_t)min(spacing * index, a_count + b_count);
-    p[index] = merge_path<bounds>(a, (int_t)a_count, b, (int_t)b_count,
-      diag, comp);
-  }, num_partitions, context);
-  return mem;
-}
-
-template<typename segments_it>
-auto load_balance_partitions(int64_t dest_count, segments_it segments, 
-  int num_segments, int spacing, context_t& context) -> 
-  mem_t<typename std::iterator_traits<segments_it>::value_type> {
-
-  typedef typename std::iterator_traits<segments_it>::value_type int_t;
-  return merge_path_partitions<bounds_upper>(counting_iterator_t<int_t>(0), 
-    dest_count, segments, num_segments, spacing, less_t<int_t>(), context);
-}
-
-template<bounds_t bounds, typename keys_it>
-mem_t<int> binary_search_partitions(keys_it keys, int count, int num_items,
-  int spacing, context_t& context) {
-
-  int num_partitions = div_up(count, spacing) + 1;
-  mem_t<int> mem(num_partitions, context);
-  int* p = mem.data();
-  transform([=]MGPU_DEVICE(int index) {
-    int key = min(spacing * index, count);
-    p[index] = binary_search<bounds>(keys, num_items, key, less_t<int>());
-  }, num_partitions, context);
-  return mem;
-}
-
-END_MGPU_NAMESPACE
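
merge_path_partitions was the host-side setup step for the merge-family kernels: one Merge Path split per nv-sized tile, plus a terminator, so each CTA can then process an independent tile. A sketch; the nv value is a made-up tuning, and bounds_lower is the tag defined alongside bounds_upper in the search headers:

    #include "search.hxx"

    void partitions_sketch(const int* a, int a_count, const int* b,
      int b_count, mgpu::context_t& context) {

      const int nv = 128 * 7;   // nt * vt of a hypothetical launch box
      mgpu::mem_t<int> partitions =
        mgpu::merge_path_partitions<mgpu::bounds_lower>(a, a_count,
          b, b_count, nv, mgpu::less_t<int>(), context);
      // CTA i then merges the a/b ranges delimited by partitions[i]
      // and partitions[i + 1].
    }
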
diff --git a/src/util/cuda/moderngpu/sort_networks.hxx b/src/util/cuda/moderngpu/sort_networks.hxx
deleted file mode 100644
index 38686edf..00000000
--- a/src/util/cuda/moderngpu/sort_networks.hxx
+++ /dev/null
@@ -1,57 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-#include "operators.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-////////////////////////////////////////////////////////////////////////////////
-// Odd-even transposition sorting network. Sorts keys and values in-place in
-// register.
-// http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort
-
-template<typename type_t, int vt, typename comp_t>
-MGPU_HOST_DEVICE array_t<type_t, vt> 
-odd_even_sort(array_t<type_t, vt> x, comp_t comp, int flags = 0) { 
-  iterate<vt>([&](int I) {
-    PRAGMA_UNROLL
-    for(int i = 1 & I; i < vt - 1; i += 2) {
-      if((0 == ((2<< i) & flags)) && comp(x[i + 1], x[i]))
-        swap(x[i], x[i + 1]);
-    }
-  });
-  return x;
-}
-
-template<typename key_t, typename val_t, int vt, typename comp_t>
-MGPU_HOST_DEVICE kv_array_t<key_t, val_t, vt> 
-odd_even_sort(kv_array_t<key_t, val_t, vt> x, comp_t comp, int flags = 0) { 
-  iterate<vt>([&](int I) {
-    PRAGMA_UNROLL
-    for(int i = 1 & I; i < vt - 1; i += 2) {
-      if((0 == ((2<< i) & flags)) && comp(x.keys[i + 1], x.keys[i])) {
-        swap(x.keys[i], x.keys[i + 1]);
-        swap(x.vals[i], x.vals[i + 1]);
-      }
-    }
-  });
-  return x;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// TODO: Batcher Odd-Even Mergesort network
-// Unstable but executes much faster than the transposition sort.
-// http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort
-#if 0
-template<int width, int low, int count>
-struct odd_even_mergesort_t {
-
-};
-
-template<typename key_t, typename val_t, int vt, typename comp_t>
-MGPU_HOST_DEVICE kv_array_t<key_t, val_t, vt> 
-odd_even_mergesort(kv_array_t<key_t, val_t, vt> x, int flags = 0) {
-  return kv_array_t<key_t, val_t, vt>();
-}
-#endif
-
-END_MGPU_NAMESPACE
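
odd_even_sort is the in-register base case the CTA-level sorts build on: each thread sorts its own vt keys in vt transposition passes without touching shared memory. A sketch, assuming array_t from types.hxx:

    #include "sort_networks.hxx"

    template<typename key_t, int vt>
    MGPU_DEVICE mgpu::array_t<key_t, vt> thread_sort_sketch(
      mgpu::array_t<key_t, vt> keys) {
      // vt passes of the transposition network, entirely in registers.
      return mgpu::odd_even_sort(keys, mgpu::less_t<key_t>());
    }
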
diff --git a/src/util/cuda/moderngpu/transform.hxx b/src/util/cuda/moderngpu/transform.hxx
deleted file mode 100644
index 99295a81..00000000
--- a/src/util/cuda/moderngpu/transform.hxx
+++ /dev/null
@@ -1,107 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include <random>
-#include <algorithm>
-#include <cuda.h>
-#include "launch_box.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-////////////////////////////////////////////////////////////////////////////////
-// Launch a grid given a number of CTAs.
-
-template<typename launch_box, typename func_t, typename... args_t>
-void cta_launch(func_t f, int num_ctas, context_t& context, args_t... args) { 
-  cta_dim_t cta = launch_box::cta_dim(context.ptx_version());
-  dim3 grid_dim(num_ctas);
-  if(context.ptx_version() < 30 && num_ctas > 65535)
-    grid_dim = dim3(256, div_up(num_ctas, 256));
-  
-  if(num_ctas)
-  {
-    launch_box_cta_k<launch_box, func_t>
-      <<<grid_dim, cta.nt,0,context.stream()>>>(f, num_ctas, args...);
-  }
-}
-
-template<int nt, int vt = 1, typename func_t, typename... args_t>
-void cta_launch(func_t f, int num_ctas, context_t& context, args_t... args) {
-  cta_launch<launch_params_t<nt, vt> >(f, num_ctas, context, args...);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Launch a grid given a number of work-items.
-
-template<typename launch_box, typename func_t, typename... args_t>
-void cta_transform(func_t f, int count, context_t& context, args_t... args) {
-  cta_dim_t cta = launch_box::cta_dim(context.ptx_version());
-  int num_ctas = div_up(count, cta.nv());
-  cta_launch<launch_box>(f, num_ctas, context, args...);
-}
-
-template<int nt, int vt = 1, typename func_t, typename... args_t>
-void cta_transform(func_t f, int count, context_t& context, args_t... args) {
-  cta_transform<launch_params_t<nt, vt> >(f, count, context, args...);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Launch persistent CTAs and loop through num_ctas values.
-
-template<typename launch_box, typename func_t, typename... args_t>
-void cta_launch(func_t f, const int* num_tiles, context_t& context, 
-  args_t... args) {
-
-  // Over-subscribe the device by a factor of 8.
-  // This reduces the penalty if we can't schedule all the CTAs to run 
-  // concurrently.
-  int num_ctas = 8 * occupancy<launch_box>(f, context);
-
-  auto k = [=] MGPU_DEVICE(int tid, int cta, args_t... args) {
-    int count = *num_tiles;
-    while(cta < count) {
-      f(tid, cta, args...);
-      cta += num_ctas;
-    }
-  };
-  cta_launch<launch_box>(k, num_ctas, context, args...);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Ordinary transform launch. This uses the standard launch box mechanism 
-// so we can query its occupancy and other things.
-
-namespace detail {
-
-template<typename launch_t>
-struct transform_f {
-  template<typename func_t, typename... args_t>
-  MGPU_DEVICE void operator()(int tid, int cta, func_t f, 
-    size_t count, args_t... args) {
-
-    typedef typename launch_t::sm_ptx params_t;
-    enum { nt = params_t::nt, vt = params_t::vt, vt0 = params_t::vt0 };
-
-    range_t range = get_tile(cta, nt * vt, count);
-
-    strided_iterate<nt, vt, vt0>([=](int i, int j) {
-      f(range.begin + j, args...);
-    }, tid, range.count());  
-  }
-};
-
-} // namespace detail
-
-template<typename launch_t, typename func_t, typename... args_t>
-void transform(func_t f, size_t count, context_t& context, args_t... args) {
-  cta_transform<launch_t>(detail::transform_f<launch_t>(), count, 
-    context, f, count, args...);
-}
-
-template<size_t nt = 128, int vt = 1, typename func_t, typename... args_t>
-void transform(func_t f, size_t count, context_t& context, args_t... args) {
-  transform<launch_params_t<nt, vt> >(f, count, context, args...);
-}
-
-END_MGPU_NAMESPACE
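
transform is the simplest entry point: one functor invocation per work-item, with the launch shape picked by the launch box (nt = 128, vt = 1 by default). A SAXPY-style sketch, names hypothetical:

    #include "transform.hxx"

    void saxpy_sketch(float a, const float* x, float* y, int count,
      mgpu::context_t& context) {
      mgpu::transform([=]MGPU_DEVICE(int index) {
        y[index] = a * x[index] + y[index];
      }, count, context);
    }
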
diff --git a/src/util/cuda/moderngpu/tuple.hxx b/src/util/cuda/moderngpu/tuple.hxx
deleted file mode 100644
index 2e381f52..00000000
--- a/src/util/cuda/moderngpu/tuple.hxx
+++ /dev/null
@@ -1,393 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "meta.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-template<typename type_t>
-using decay_t = typename std::decay<type_t>::type;
-
-/////////////////
-// index_sequence
-
-// Improved linear index_sequence from
-// http://talesofcpp.fusionfenix.com/post-22/true-story-efficient-packing
-template<size_t... int_s>
-struct index_sequence { 
-  enum { size = sizeof...(int_s) };
-};
-
-namespace detail {
-template<typename seq_t>
-struct _next;
-
-template<size_t... seq_i>
-struct _next<index_sequence<seq_i...> > {
-  // grow the sequence by one element.
-  typedef index_sequence<seq_i..., sizeof...(seq_i)> type;
-};
-
-template<size_t count>
-struct _make_index_sequence : 
-  _next<typename _make_index_sequence<count - 1>::type> { };
-
-template<> struct _make_index_sequence<0> {
-  typedef index_sequence<> type;
-};
-} // namespace detail
-
-template<size_t count>
-using make_index_sequence = 
-  typename detail::_make_index_sequence<count>::type;
-
-//////////
-// var_and
-
-template<bool... args_b>
-struct var_and;
-
-template<bool arg_a, bool... args_b> 
-struct var_and<arg_a, args_b...> {
-  enum { value = arg_a && var_and<args_b...>::value };
-};
-template<bool arg_a>
-struct var_and<arg_a> {
-  enum { value = arg_a };
-};
-template<>
-struct var_and<> {
-  enum { value = true };
-};
-
-//////////
-// var_or
-
-template<bool... args_b>
-struct var_or;
-
-template<bool arg_a, bool... args_b> 
-struct var_or<arg_a, args_b...> {
-  enum { value = arg_a || var_or<args_b...>::value };
-};
-template<bool arg_a>
-struct var_or<arg_a> {
-  enum { value = arg_a };
-};
-template<>
-struct var_or<> {
-  enum { value = false };
-};
-
-
-
-// Forward declare the tuple.
-template<typename... args_t>
-struct tuple;
-
-////////////////
-// tuple_element
-
-template<size_t i, typename tpl_t> 
-struct tuple_element;
-
-template<size_t i, typename arg_t, typename... args_t>
-struct tuple_element<i, tuple<arg_t, args_t...> > : 
-  tuple_element<i - 1, tuple<args_t...> > { };
-
-template<typename arg_t, typename... args_t>
-struct tuple_element<0, tuple<arg_t, args_t...> > {
-  typedef arg_t type;
-};
-
-template<size_t i, typename tpl_t>
-using tuple_element_t = typename tuple_element<i, tpl_t>::type;
-
-/////////////
-// tuple_size
-
-template<typename tpl_t>
-struct tuple_size;
-
-template<typename... args_t>
-struct tuple_size<tuple<args_t...> > {
-  enum { value = sizeof...(args_t) };
-};
-
-
-namespace detail {
-
-template<size_t i, typename arg_t, bool is_empty = std::is_empty<arg_t>::value>
-struct tuple_leaf {
-  arg_t x;
-
-  MGPU_HOST_DEVICE arg_t& get() { return x; }
-  MGPU_HOST_DEVICE const arg_t& get() const { return x; }
-
-  tuple_leaf() = default;
-  tuple_leaf(const tuple_leaf&) = default;
-
-  template<typename arg2_t,
-    typename = typename std::enable_if<
-      std::is_constructible<arg_t, arg2_t&&>::value
-    >::type
-  > MGPU_HOST_DEVICE 
-  tuple_leaf(arg2_t&& arg) : x(std::forward<arg2_t>(arg)) { }
-
-  template<typename arg2_t,
-    typename = typename std::enable_if<
-      std::is_constructible<arg_t, const arg2_t&>::value
-    >::type
-  > MGPU_HOST_DEVICE  
-  tuple_leaf(const arg2_t& arg) : x(arg) { }
-};
-
-template<size_t i, typename arg_t>
-struct tuple_leaf<i, arg_t, true> : arg_t { 
-  MGPU_HOST_DEVICE arg_t& get() { return *this; }
-  MGPU_HOST_DEVICE const arg_t& get() const { return *this; }
-
-  template<typename arg2_t,
-    typename = typename std::enable_if<
-      std::is_constructible<arg_t, const arg2_t&>::value
-    >::type
-  > MGPU_HOST_DEVICE 
-  tuple_leaf(const arg2_t& arg) : arg_t(arg) { }
-};
-
-template<size_t i, typename... args_t>
-struct tuple_impl;
-
-template<size_t i>
-struct tuple_impl<i> { };
-
-template<size_t i, typename arg_t, typename... args_t>
-struct tuple_impl<i, arg_t, args_t...> :
-  tuple_leaf<i, arg_t>,
-  tuple_impl<i + 1, args_t...> {
-
-  typedef tuple_leaf<i, arg_t> head_t;
-  typedef tuple_impl<i + 1, args_t...> tail_t;
-
-  MGPU_HOST_DEVICE  arg_t& head() { return head_t::get(); }
-  MGPU_HOST_DEVICE const arg_t& head() const { return head_t::get(); }
-
-  MGPU_HOST_DEVICE  tail_t& tail() { return *this; }
-  MGPU_HOST_DEVICE  const tail_t& tail() const { return *this; }
-
-  // Constructors.
-  tuple_impl() = default;
-  explicit tuple_impl(const tuple_impl&) = default;
-
-  template<typename... args2_t> MGPU_HOST_DEVICE 
-  explicit tuple_impl(const tuple_impl<i, args2_t...>& rhs) :
-    head_t(rhs.head()), tail_t(rhs.tail()) { }
-
-  template<typename... args2_t> MGPU_HOST_DEVICE  
-  explicit tuple_impl(tuple_impl<i, args2_t...>&& rhs) :
-    head_t(std::move(rhs.head())), 
-    tail_t(std::move(rhs.tail())) { }
-
-  template<typename arg2_t, typename... args2_t,
-    typename = typename std::enable_if<
-      sizeof...(args_t) == sizeof...(args2_t) &&
-      std::is_constructible<arg_t, arg2_t&&>::value &&
-      var_and<std::is_constructible<args_t, args2_t&&>::value...>::value
-    >::type
-  > MGPU_HOST_DEVICE 
-  tuple_impl(arg2_t&& arg, args2_t&&... args) :
-    head_t(std::forward<arg2_t>(arg)), 
-    tail_t(std::forward<args2_t>(args)...) { }
-
-  template<typename arg2_t, typename... args2_t,
-    typename = typename std::enable_if<
-      std::is_constructible<arg_t, const arg2_t&>::value &&
-      var_and<std::is_constructible<args_t, const args2_t&>::value...>::value
-    >::type
-  > MGPU_HOST_DEVICE 
-  tuple_impl(const arg2_t& arg, const args2_t&... args) :
-    head_t(arg), tail_t(args...) { }
-
-  // Assignment
-};
-
-template<size_t i, typename arg_t> MGPU_HOST_DEVICE 
-tuple_leaf<i, arg_t>& get_leaf(tuple_leaf<i, arg_t>& leaf) {
-  return leaf;
-}
-
-template<size_t i, typename arg_t> MGPU_HOST_DEVICE 
-const tuple_leaf<i, arg_t>& get_leaf(const tuple_leaf<i, arg_t>& leaf) {
-  return leaf;
-}
-
-} // namespace detail
-
-template<typename... args_t>
-struct tuple : detail::tuple_impl<0, args_t...> { 
-  typedef detail::tuple_impl<0, args_t...> impl_t;
-
-  tuple() = default;
-  tuple(const tuple&) = default;
-
-  template<typename... args2_t,
-    typename = typename std::enable_if<
-      sizeof...(args2_t) == sizeof...(args_t) &&
-      var_and<std::is_constructible<args_t, const args2_t&>::value...>::value
-    >::type
-  > MGPU_HOST_DEVICE 
-  tuple(const tuple<args2_t...>& rhs) : impl_t(rhs) { }
-  
-  template<typename... args2_t,
-    typename = typename std::enable_if<
-      sizeof...(args2_t) == sizeof...(args_t) &&
-      var_and<std::is_constructible<args_t, args2_t&&>::value...>::value
-    >::type
-  > MGPU_HOST_DEVICE 
-  tuple(args2_t&&... args) : impl_t(std::forward<args2_t>(args)...) { }
-
-  template<typename... args2_t,
-    typename = typename std::enable_if<
-      sizeof...(args2_t) == sizeof...(args_t) &&
-      var_and<std::is_constructible<args_t, const args2_t&>::value...>::value
-    >::type
-  > MGPU_HOST_DEVICE  
-  tuple(const args2_t&... args) : impl_t(args...) { }
-} __attribute__((aligned));
-
-namespace detail {
-
-template<size_t i, typename arg_t> MGPU_HOST_DEVICE 
-arg_t& _get(tuple_leaf<i, arg_t>& leaf) {
-  return leaf.get();
-}
-
-template<size_t i, typename arg_t> MGPU_HOST_DEVICE 
-const arg_t& _get(const tuple_leaf<i, arg_t>& leaf) {
-  return leaf.get();
-}
-
-}
-
-template<size_t i, typename... args_t> MGPU_HOST_DEVICE 
-tuple_element_t<i, tuple<args_t...> >&
-get(tuple<args_t...>& tpl) {
-  return detail::_get<i>(tpl);
-}
-
-template<size_t i, typename... args_t> MGPU_HOST_DEVICE 
-const tuple_element_t<i, tuple<args_t...> >&
-get(const tuple<args_t...>& tpl) {
-  return detail::_get<i>(tpl);
-}
-
-template<size_t i, typename... args_t> MGPU_HOST_DEVICE 
-typename std::add_rvalue_reference<
-  tuple_element_t<i, tuple<args_t...> >
->::type
-get(tuple<args_t...>&& tpl) {
-  return std::forward<tuple_element_t<i, tuple<args_t...> >&&>(get<i>(tpl));
-}
-
-template<typename... args_t> MGPU_HOST_DEVICE 
-tuple<decay_t<args_t>...> make_tuple(args_t&&... args) {
-  return tuple<decay_t<args_t>...>(std::forward<args_t>(args)...);
-}
-
-template<typename... args_t> MGPU_HOST_DEVICE
-tuple<args_t&&...> forward_as_tuple(args_t&&... args) {
-  return tuple<args_t&&...>(std::forward<args_t>(args)...);
-}
-
-////////////
-// tuple_cat
-
-namespace detail {
-
-template<typename tuple_t>
-struct _make_tuple {
-  typedef typename std::remove_cv<
-    typename std::remove_reference<tuple_t>::type
-  >::type type;
-};
-
-template<typename... tuples_t>
-struct _combine_type;
-
-template<typename... args_t>
-struct _combine_type<tuple<args_t...> > {
-  typedef tuple<args_t...> type;
-};
-
-template<typename... args1_t, typename... args2_t, typename... tuples_t>
-struct _combine_type<tuple<args1_t...>, tuple<args2_t...>, tuples_t...> {
-  typedef typename _combine_type<
-    tuple<args1_t..., args2_t...>,
-    tuples_t...
-  >::type type;
-};
-
-template<typename... tpls_t>
-struct _tuple_cat_ret {
-  typedef typename _combine_type<
-    typename _make_tuple<tpls_t>::type...
-  >::type type;
-};
-
-template<typename tpl1_t, typename seq1_t, typename tpl2_t, typename seq2_t>
-struct _tuple_cat;
-
-template<typename tpl1_t, size_t... seq1_i, typename tpl2_t, size_t... seq2_i>
-struct _tuple_cat<tpl1_t, index_sequence<seq1_i...>, 
-  tpl2_t, index_sequence<seq2_i...> > {
-
-  typedef typename _tuple_cat_ret<tpl1_t, tpl2_t>::type ret_t;
-
-  MGPU_HOST_DEVICE static ret_t cat(tpl1_t&& tpl1, tpl2_t&& tpl2) {
-    return make_tuple(
-      get<seq1_i>(std::forward<tpl1_t>(tpl1))...,
-      get<seq2_i>(std::forward<tpl2_t>(tpl2))...
-    );
-  }
-};
-
-} // namespace detail
-
-template<typename tpl1_t> MGPU_HOST_DEVICE
-typename detail::_tuple_cat_ret<tpl1_t>::type
-tuple_cat(tpl1_t&& tpl1) {
-  return std::forward<tpl1_t>(tpl1);
-}
-
-template<typename tpl1_t, typename tpl2_t, typename... tpls_t> MGPU_HOST_DEVICE
-typename detail::_tuple_cat_ret<tpl1_t, tpl2_t, tpls_t...>::type
-tuple_cat(tpl1_t&& tpl1, tpl2_t&& tpl2, tpls_t&&... tpls) {
-  typedef typename detail::_make_tuple<tpl1_t>::type tpl1_stripped;
-  typedef typename detail::_make_tuple<tpl2_t>::type tpl2_stripped;
-
-  enum { 
-    size1 = tuple_size<tpl1_stripped>::value, 
-    size2 = tuple_size<tpl2_stripped>::value
-  };
-
-  return tuple_cat(
-    detail::_tuple_cat<
-      tpl1_t, make_index_sequence<size1>, 
-      tpl2_t, make_index_sequence<size2> 
-    >::cat(
-      std::forward<tpl1_t>(tpl1), 
-      std::forward<tpl2_t>(tpl2)
-    ), 
-    std::forward<tpls_t>(tpls)...
-  );
-}
-
-///////
-// tie
-
-template<typename... args_t>
-MGPU_HOST_DEVICE tuple<args_t&...> tie(args_t&... args) {
-  return tuple<args_t&...>(args...);
-}
-
-END_MGPU_NAMESPACE
diff --git a/src/util/cuda/moderngpu/types.hxx b/src/util/cuda/moderngpu/types.hxx
deleted file mode 100644
index 5fa78592..00000000
--- a/src/util/cuda/moderngpu/types.hxx
+++ /dev/null
@@ -1,147 +0,0 @@
-// moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
-#pragma once
-
-#include "meta.hxx"
-#include "operators.hxx"
-
-BEGIN_MGPU_NAMESPACE
-
-struct cuda_exception_t : std::exception {
-  cudaError_t result;
-
-  cuda_exception_t(cudaError_t result_) : result(result_) { }
-  virtual const char* what() const noexcept { 
-    return cudaGetErrorString(result); 
-  }
-};
-
-
-template<typename type_t, int size>
-struct array_t {
-  type_t data[size];
-
-  MGPU_HOST_DEVICE type_t operator[](int i) const { return data[i]; }
-  MGPU_HOST_DEVICE type_t& operator[](int i) { return data[i]; }
-
-  array_t() = default;
-  array_t(const array_t&) = default;
-  array_t& operator=(const array_t&) = default;
-
-  // Fill the array with x.
-  MGPU_HOST_DEVICE array_t(type_t x) { 
-    iterate<size>([&](int i) { data[i] = x; });  
-  }
-};
-
-template<typename type_t>
-struct array_t<type_t, 0> { 
-  MGPU_HOST_DEVICE type_t operator[](int i) const { return type_t(); }
-  MGPU_HOST_DEVICE type_t& operator[](int i) { return *(type_t*)nullptr; }
-};
-
-// Reduce on components of array_t.
-template<typename type_t, int size, typename op_t = plus_t<type_t> >
-MGPU_HOST_DEVICE type_t reduce(array_t<type_t, size> x, op_t op = op_t()) {
-  type_t a;
-  iterate<size>([&](int i) {
-    a = i ? op(a, x[i]) : x[i];
-  });
-  return a;
-}
-
-// Call the operator component-wise on all components.
-template<typename type_t, int size, typename op_t>
-MGPU_HOST_DEVICE array_t<type_t, size> combine(array_t<type_t, size> x,
-  array_t<type_t, size> y, op_t op) {
-
-  array_t<type_t, size> z;
-  iterate<size>([&](int i) { z[i] = op(x[i], y[i]); });
-  return z;
-}
-
-template<typename type_t, int size>
-MGPU_HOST_DEVICE array_t<type_t, size> operator+(
-  array_t<type_t, size> a, array_t<type_t, size> b) {
-  return combine(a, b, plus_t<type_t>());
-}
-
-template<typename type_t, int size>
-MGPU_HOST_DEVICE array_t<type_t, size> operator-(
-  array_t<type_t, size> a, array_t<type_t, size> b) {
-  return combine(a, b, minus_t<type_t>());
-}
-
-
-template<typename key_t, typename val_t, int size>
-struct kv_array_t {
-  array_t<key_t, size> keys;
-  array_t<val_t, size> vals;
-};
-
-enum bounds_t { 
-  bounds_lower,
-  bounds_upper
-};
-
-struct MGPU_ALIGN(8) range_t {
-  int begin, end;
-  MGPU_HOST_DEVICE int size() const { return end - begin; }
-  MGPU_HOST_DEVICE int count() const { return size(); }
-  MGPU_HOST_DEVICE bool valid() const { return end > begin; }
-};
-
-MGPU_HOST_DEVICE range_t get_tile(int cta, int nv, int count) {
-  return range_t { nv * cta, min(count, nv * (cta + 1)) };
-}
-
-
-struct MGPU_ALIGN(16) merge_range_t {
-  int a_begin, a_end, b_begin, b_end;
-
-  MGPU_HOST_DEVICE int a_count() const { return a_end - a_begin; }
-  MGPU_HOST_DEVICE int b_count() const { return b_end - b_begin; }
-  MGPU_HOST_DEVICE int total() const { return a_count() + b_count(); }
-
-  MGPU_HOST_DEVICE range_t a_range() const { 
-    return range_t { a_begin, a_end };
-  }
-  MGPU_HOST_DEVICE range_t b_range() const {
-    return range_t { b_begin, b_end };
-  }
-
-  MGPU_HOST_DEVICE merge_range_t to_local() const {
-    return merge_range_t { 0, a_count(), a_count(), total() };
-  }
-  
-  // Partition from mp to the end.
-  MGPU_HOST_DEVICE merge_range_t partition(int mp0, int diag) const {
-    return merge_range_t { a_begin + mp0, a_end, b_begin + diag - mp0, b_end };
-  }
-
-  // Partition from mp0 to mp1.
-  MGPU_HOST_DEVICE merge_range_t partition(int mp0, int diag0,
-    int mp1, int diag1) const {
-    return merge_range_t { 
-      a_begin + mp0, 
-      a_begin + mp1,
-      b_begin + diag0 - mp0,
-      b_begin + diag1 - mp1
-    };
-  }
-
-  MGPU_HOST_DEVICE bool a_valid() const { 
-    return a_begin < a_end; 
-  }
-  MGPU_HOST_DEVICE bool b_valid() const {
-    return b_begin < b_end;
-  }
-};
-
-template<typename type_t, int size>
-struct merge_pair_t {
-  array_t<type_t, size> keys;
-  array_t<int, size> indices;
-};
-
-
-END_MGPU_NAMESPACE
diff --git a/src/util/cuda/moderngpu/util.hxx b/src/util/cuda/moderngpu/util.hxx
deleted file mode 100644
index 3675d7b9..00000000
--- a/src/util/cuda/moderngpu/util.hxx
+++ /dev/null
@@ -1,30 +0,0 @@
-#pragma once
-#include "types.hxx"
-#include <cstdarg>
-#include <string>
-
-BEGIN_MGPU_NAMESPACE
-
-namespace detail {
-
-inline std::string stringprintf(const char* format, ...) {
-  va_list args;
-  va_start(args, format);
-  int len = vsnprintf(0, 0, format, args);
-  va_end(args);
-
-  // allocate space.
-  std::string text;
-  text.resize(len);
-
-  va_start(args, format);
-  vsnprintf(&text[0], len + 1, format, args);
-  va_end(args);
-
-  return text;
-}
-
-} // namespace detail
-
-END_MGPU_NAMESPACE
-
diff --git a/src/util/cuda/ofp_context.hxx b/src/util/cuda/ofp_context.hxx
deleted file mode 100644
index 70c4ed9e..00000000
--- a/src/util/cuda/ofp_context.hxx
+++ /dev/null
@@ -1,322 +0,0 @@
-/*
- * ofp_context.hxx
- *
- *  Created on: Nov 15, 2018
- *      Author: i-bird
- */
-
-#ifndef OFP_CONTEXT_HXX_
-#define OFP_CONTEXT_HXX_
-
-#include <iostream>
-
-#ifdef CUDA_ON_CPU
-
-namespace mgpu
-{
-	enum gpu_context_opt
-	{
-		no_print_props,//!< no_print_props
-		print_props,   //!< print_props
-		dummy          //!< dummy
-	};
-
-	struct context_t {};
-
-	class ofp_context_t : public context_t
-	{
-		protected:
-
-			std::string _props;
-
-			openfpm::vector<aggregate<unsigned char>> tmem;
-
-			// Making this a template argument means we won't generate an instance
-			// of dummy_k for each translation unit.
-			template<int dummy_arg = 0>
-			void init(int dev_num, gpu_context_opt opt)
-			{}
-
-		public:
-
-			/*! \brief gpu context constructor
-				*
-				* \param opt options for this gpu context
-				*
-				*/
-			ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props , int dev_num = 0, int stream_ = 0)
-			{}
-
-			~ofp_context_t()
-			{}
-
-			virtual const std::string& props() const
-			{
-				return _props;
-			}
-
-			virtual int ptx_version() const
-			{
-				return 0;
-			}
-
-			virtual int stream() 
-			{
-				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl;
-				return 0; 
-			}
-
-			// Alloc GPU memory.
-			virtual void* alloc(size_t size, int space)
-			{
-				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented" << std::endl;
-				return NULL;
-			}
-
-			virtual void free(void* p, int space)
-			{
-				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented"  << std::endl;
-			}
-
-			virtual void synchronize()
-			{
-				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented"  << std::endl;
-			}
-
-			virtual int event()
-			{
-				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented"  << std::endl;
-				return 0;
-			}
-
-			virtual void timer_begin()
-			{
-				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented"  << std::endl;
-			}
-
-			virtual double timer_end()
-			{
-				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented"  << std::endl;
-				return 0.0;
-			}
-
-			virtual int getDevice()
-			{
-				std::cout << __FILE__ << ":" << __LINE__ << " Not implemented"  << std::endl;
-				return 0;
-			}
-	};
-
-}
-
-#else
-
-	#ifdef CUDA_GPU
-
-		#ifdef __NVCC__
-		#include "util/cuda/moderngpu/context.hxx"
-		#else
-		#include "util/cuda/moderngpu/context_reduced.hxx"
-		#endif
-
-		namespace mgpu
-		{
-			enum gpu_context_opt
-			{
-				no_print_props,//!< no_print_props
-				print_props,   //!< print_props
-				dummy          //!< dummy
-			};
-
-
-			////////////////////////////////////////////////////////////////////////////////
-			// standard_context_t is a trivial implementation of context_t. Users can
-			// derive this type to provide a custom allocator.
-
-			class ofp_context_t : public context_t
-			{
-				protected:
-					cudaDeviceProp _props;
-					int _ptx_version;
-					cudaStream_t _stream;
-
-					cudaEvent_t _timer[2];
-					cudaEvent_t _event;
-
-					openfpm::vector_gpu<aggregate<unsigned char>> tmem;
-					openfpm::vector_gpu<aggregate<unsigned char>> tmem2;
-					openfpm::vector_gpu<aggregate<unsigned char>> tmem3;
-
-					// Making this a template argument means we won't generate an instance
-					// of dummy_k for each translation unit.
-					template<int dummy_arg = 0>
-					void init(int dev_num, gpu_context_opt opt)
-					{
-						cudaFuncAttributes attr;
-						#ifdef __NVCC__
-						cudaError_t result = cudaFuncGetAttributes(&attr, (void *)dummy_k<0>);
-						if(cudaSuccess != result) throw cuda_exception_t(result);
-						_ptx_version = attr.ptxVersion;
-						#else
-						_ptx_version = 60;
-						//std::cout << __FILE__ << ":" << __LINE__ << " Warning initialization of GPU context has been done from a standard Cpp file, rather than a CUDA or HIP file" << std::endl;
-						#endif
-
-						int num_dev;
-						cudaGetDeviceCount(&num_dev);
-
-						if (num_dev == 0) {return;}
-
-						if (opt != gpu_context_opt::dummy)
-						{
-							cudaSetDevice(dev_num % num_dev);
-						}
-
-						int ord;
-						cudaGetDevice(&ord);
-						cudaGetDeviceProperties(&_props, ord);
-
-						cudaEventCreate(&_timer[0]);
-						cudaEventCreate(&_timer[1]);
-						cudaEventCreate(&_event);
-					}
-
-				public:
-
-
-					/*! \brief gpu context constructor
-					*
-					* \param opt options for this gpu context
-					*
-					*/
-					ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props , int dev_num = 0, cudaStream_t stream_ = 0)
-					:context_t(), _stream(stream_)
-					{
-						init(dev_num,opt);
-						if(opt == gpu_context_opt::print_props)
-						{
-							printf("%s\n", device_prop_string(_props).c_str());
-						}
-					}
-
-					~ofp_context_t()
-					{
-						cudaEventDestroy(_timer[0]);
-						cudaEventDestroy(_timer[1]);
-						cudaEventDestroy(_event);
-					}
-
-					virtual const cudaDeviceProp& props() const { return _props; }
-					virtual int ptx_version() const { return _ptx_version; }
-					virtual cudaStream_t stream() { return _stream; }
-
-					// Alloc GPU memory.
-					virtual void* alloc(size_t size, memory_space_t space)
-					{
-						void* p = nullptr;
-						if(size)
-						{
-							cudaError_t result = (memory_space_device == space) ?cudaMalloc(&p, size) : cudaMallocHost(&p, size);
-							if(cudaSuccess != result) throw cuda_exception_t(result);
-						}
-						return p;
-					}
-
-					virtual void free(void* p, memory_space_t space)
-					{
-						if(p)
-						{
-							cudaError_t result = (memory_space_device == space) ? cudaFree(p) : cudaFreeHost(p);
-							if(cudaSuccess != result) throw cuda_exception_t(result);
-						}
-					}
-
-					virtual void synchronize()
-					{
-						cudaError_t result = _stream ?
-						cudaStreamSynchronize(_stream) :
-						cudaDeviceSynchronize();
-						if(cudaSuccess != result) throw cuda_exception_t(result);
-					}
-
-					virtual cudaEvent_t event()
-					{
-						return _event;
-					}
-
-					virtual void timer_begin()
-					{
-						cudaEventRecord(_timer[0], _stream);
-					}
-
-					virtual double timer_end()
-					{
-						cudaEventRecord(_timer[1], _stream);
-						cudaEventSynchronize(_timer[1]);
-						float ms;
-						cudaEventElapsedTime(&ms, _timer[0], _timer[1]);
-						return ms / 1.0e3;
-					}
-
-					virtual int getDevice()
-					{
-						int dev = 0;
-
-						cudaGetDevice(&dev);
-
-						return dev;
-					}
-
-					virtual int getNDevice()
-					{
-						int num_dev;
-						cudaGetDeviceCount(&num_dev);
-
-						return num_dev;
-					}
-
-					openfpm::vector_gpu<aggregate<unsigned char>> & getTemporalCUB()
-					{
-						return tmem;
-					}
-
-					openfpm::vector_gpu<aggregate<unsigned char>> & getTemporalCUB2()
-					{
-						return tmem2;
-					}
-
-					openfpm::vector_gpu<aggregate<unsigned char>> & getTemporalCUB3()
-					{
-						return tmem3;
-					}
-			};
-
-		}
-
-	#else
-
-		namespace mgpu
-		{
-
-			enum gpu_context_opt
-			{
-				no_print_props,//!< no_print_props
-				print_props,   //!< print_props
-				dummy          //!< dummy
-			};
-
-			// Stub class for modern gpu
-
-			struct ofp_context_t
-			{
-				ofp_context_t(gpu_context_opt opt = gpu_context_opt::no_print_props , int dev_num = 0)
-				{}
-			};
-		}
-
-	#endif
-
-#endif
-
-
-#endif /* OFP_CONTEXT_HXX_ */
diff --git a/src/util/cuda/reduce_ofp.cuh b/src/util/cuda/reduce_ofp.cuh
index 6a4fb65f..53c1c741 100644
--- a/src/util/cuda/reduce_ofp.cuh
+++ b/src/util/cuda/reduce_ofp.cuh
@@ -11,32 +11,26 @@
 #ifdef __NVCC__
 
 #include "util/cuda_launch.hpp"
+#include "util/ofp_context.hpp"
 
 #if CUDART_VERSION >= 11000
-	#ifndef CUDA_ON_CPU 
 	// Here we have for sure CUDA >= 11
-	#ifdef __HIP__
-		#include "hipcub/hipcub.hpp"
-	#else
-		#include "cub/cub.cuh"
-	#endif
-	#ifndef REDUCE_WITH_CUB
-		#define REDUCE_WITH_CUB
-	#endif
+	#ifndef CUDA_ON_CPU
+		#ifdef __HIP__
+			#include "hipcub/hipcub.hpp"
+		#else
+			#include "cub/cub.cuh"
+		#endif
 	#endif
 #else
-	// Here we have old CUDA
 	#include "cub_old/cub.cuh"
-	//#include "util/cuda/moderngpu/kernel_reduce.hxx"
-	#define REDUCE_WITH_CUB
 #endif
 
-#include "util/cuda/ofp_context.hxx"
 
 namespace openfpm
 {
 	template<typename input_it, typename output_it, typename reduce_op>
-			void reduce(input_it input, int count, output_it output, reduce_op op, mgpu::ofp_context_t& context)
+			void reduce(input_it input, int count, output_it output, reduce_op op, gpu::ofp_context_t& context)
 	{
 #ifdef CUDA_ON_CPU
 
@@ -47,51 +41,30 @@ namespace openfpm
 	}
 
 #else
-	#ifdef REDUCE_WITH_CUB
 
-		#ifdef __HIP__
+	#ifdef __HIP__
 
-			void *d_temp_storage = NULL;
-			size_t temp_storage_bytes = 0;
-			hipcub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,input,
-																		output,
-																		count,
-																		op,
-																		false);
-
-			auto & temporal = context.getTemporalCUB();
-			temporal.resize(temp_storage_bytes);
-
-			// Run
-			hipcub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input,
-					output,
-					count,
-					op,
-					false);
-		#else
+		size_t temp_storage_bytes = 0;
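+		// CUB-style size query: with a NULL workspace pointer this call only
+		// fills temp_storage_bytes and performs no reduction.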
+		hipcub::DeviceReduce::Reduce(NULL,
+			temp_storage_bytes, input, output, count, op, false);
 
-			void *d_temp_storage = NULL;
-			size_t temp_storage_bytes = 0;
-			cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,input,
-																	output,
-																	count,
-																	op,
-																	false);
+		auto & temporal = context.getTemporalCUB();
+		temporal.resize(temp_storage_bytes);
 
-			auto & temporal = context.getTemporalCUB();
-			temporal.resize(temp_storage_bytes);
+		hipcub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(),
+			temp_storage_bytes, input, output, count, op, false);
+	#else
 
-			// Run
-			cub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input,
-				output,
-				count,
-				op,
-				false);
+		size_t temp_storage_bytes = 0;
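+		// Same two-phase protocol on the CUDA path; the workspace is served
+		// from the context's reusable temporal buffer instead of a fresh
+		// cudaMalloc on every reduction.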
+		cub::DeviceReduce::Reduce(NULL,
+			temp_storage_bytes, input, output, count, op, false);
 
-		#endif
+		auto & temporal = context.getTemporalCUB();
+		temporal.resize(temp_storage_bytes);
+
+		cub::DeviceReduce::Reduce(temporal.template getDeviceBuffer<0>(),
+			temp_storage_bytes, input, output, count, op, false);
 
-	#else
-			mgpu::reduce(input,count,output,op,context);
 	#endif
 #endif
 	}
diff --git a/src/util/cuda/scan_ofp.cuh b/src/util/cuda/scan_ofp.cuh
index f91d64a1..97648164 100644
--- a/src/util/cuda/scan_ofp.cuh
+++ b/src/util/cuda/scan_ofp.cuh
@@ -11,32 +11,26 @@
 #ifdef __NVCC__
 
 #include "util/cuda_launch.hpp"
+#include "util/ofp_context.hpp"
 
 #if CUDART_VERSION >= 11000
-	#ifndef CUDA_ON_CPU 
 	// Here we have for sure CUDA >= 11
-	#ifdef __HIP__
-		#include "hipcub/hipcub.hpp"
-	#else
-		#include "cub/cub.cuh"
-	#endif
-	#ifndef SCAN_WITH_CUB
-		#define SCAN_WITH_CUB
-	#endif
+	#ifndef CUDA_ON_CPU
+		#ifdef __HIP__
+			#include "hipcub/hipcub.hpp"
+		#else
+			#include "cub/cub.cuh"
+		#endif
 	#endif
 #else
-	// Here we have old CUDA
 	#include "cub_old/cub.cuh"
-	//#include "util/cuda/moderngpu/kernel_scan.hxx"
-	#define SCAN_WITH_CUB
 #endif
 
-#include "util/cuda/ofp_context.hxx"
 
 namespace openfpm
 {
 	template<typename input_it, typename output_it>
-			 void scan(input_it input, int count, output_it output, mgpu::ofp_context_t& context)
+			 void scan(input_it input, int count, output_it output, gpu::ofp_context_t& context)
 	{
 #ifdef CUDA_ON_CPU
 
@@ -52,46 +46,32 @@ namespace openfpm
 	}
 
 #else
-	#ifdef SCAN_WITH_CUB
-
-			#ifdef __HIP__
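+	// Guard: nothing to scan for an empty input.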
+	if (count == 0) return;
 
-				if (count == 0)	{return;}
-
-				void *d_temp_storage = NULL;
-				size_t temp_storage_bytes = 0;
-				hipcub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input,
-																			output,
-																			count);
+	#ifdef __HIP__
 
-				auto & temporal = context.getTemporalCUB();
-				temporal.resize(temp_storage_bytes);
+		size_t temp_storage_bytes = 0;
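+		// First call sizes the workspace (NULL pointer => size query only).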
+		hipcub::DeviceScan::ExclusiveSum(NULL,
+			temp_storage_bytes, input, output, count);
 
-				// Run
-				hipcub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input,
-						output,
-						count);
+		auto & temporal = context.getTemporalCUB();
+		temporal.resize(temp_storage_bytes);
 
-			#else
+		hipcub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(),
+			temp_storage_bytes, input, output, count);
 
-				void *d_temp_storage = NULL;
-				size_t temp_storage_bytes = 0;
-				cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,input,
-																			output,
-																			count);
+	#else
 
-				auto & temporal = context.getTemporalCUB();
-				temporal.resize(temp_storage_bytes);
+		size_t temp_storage_bytes = 0;
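+		// Same size-query-then-run protocol with CUB's DeviceScan.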
+		cub::DeviceScan::ExclusiveSum(NULL,
+			temp_storage_bytes, input, output, count);
 
-				// Run
-				cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(), temp_storage_bytes,input,
-						output,
-						count);
+		auto & temporal = context.getTemporalCUB();
+		temporal.resize(temp_storage_bytes);
 
-			#endif
+		cub::DeviceScan::ExclusiveSum(temporal.template getDeviceBuffer<0>(),
+			temp_storage_bytes, input, output, count);
 
-	#else
-			mgpu::scan(input,count,output,context);
 	#endif
 #endif
 	}
diff --git a/src/util/cuda/scan_sort_cuda_unit_tests.cu b/src/util/cuda/scan_sort_cuda_unit_tests.cu
index 7314e874..991f7b65 100644
--- a/src/util/cuda/scan_sort_cuda_unit_tests.cu
+++ b/src/util/cuda/scan_sort_cuda_unit_tests.cu
@@ -8,8 +8,6 @@
 #include "util/cuda_util.hpp"
 #include "Vector/map_vector.hpp"
 
-#define SORT_WITH_CUB
-
 #include "sort_ofp.cuh"
 #include "scan_ofp.cuh"
 #include "segreduce_ofp.cuh"
@@ -37,7 +35,7 @@ BOOST_AUTO_TEST_CASE( test_scan_cub_wrapper )
 
 	input.template hostToDevice<0>();
 
-	mgpu::ofp_context_t context;
+	gpu::ofp_context_t context;
 	openfpm::scan((unsigned int *)input.template getDeviceBuffer<0>(),input.size(),(unsigned int *)output.template getDeviceBuffer<0>(),context);
 
     output.template deviceToHost<0>();
@@ -78,11 +76,11 @@ BOOST_AUTO_TEST_CASE( test_sort_cub_wrapper )
 	input.template hostToDevice<0>();
 	input_id.template hostToDevice<0>();
 
-	mgpu::ofp_context_t context;
+	gpu::ofp_context_t context;
 
 	openfpm::sort((unsigned int *)input.template getDeviceBuffer<0>(),
 				  (unsigned int *)input_id.template getDeviceBuffer<0>(),
-			      input.size(),mgpu::template less_t<unsigned int>(),context);
+			      input.size(),gpu::template less_t<unsigned int>(),context);
 
 	input.template deviceToHost<0>();
 	input_id.template deviceToHost<0>();
@@ -94,7 +92,7 @@ BOOST_AUTO_TEST_CASE( test_sort_cub_wrapper )
 
 	openfpm::sort((unsigned int *)input.template getDeviceBuffer<0>(),
 				  (unsigned int *)input_id.template getDeviceBuffer<0>(),
-			      input.size(),mgpu::template greater_t<unsigned int>(),context);
+			      input.size(),gpu::template greater_t<unsigned int>(),context);
 
 	input.template deviceToHost<0>();
 	input_id.template deviceToHost<0>();
@@ -113,7 +111,7 @@ BOOST_AUTO_TEST_CASE( test_seg_reduce_wrapper )
 {
 	std::cout << "Test gpu segmented reduce" << "\n";
 
-	mgpu::ofp_context_t context;
+	gpu::ofp_context_t context;
 
 	int count = 130;
 
@@ -144,23 +142,25 @@ BOOST_AUTO_TEST_CASE( test_seg_reduce_wrapper )
 
 		base += c;
 	}
+	segment_offset.add();
+	segment_offset.template get<0>(segment_offset.size() - 1) = vgpu.size();
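+	// CUB's segmented primitives take begin/end offset iterators, so the
+	// offset vector carries one extra trailing entry equal to the total
+	// element count.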
 
 	vgpu.hostToDevice<0>();
 
 	segment_offset.hostToDevice<0>();
-	output.resize(segment_offset.size());
+	output.resize(segment_offset.size()-1);
 
 	openfpm::segreduce((int *)vgpu.template getDeviceBuffer<0>(), vgpu.size(),
-					(int *)segment_offset.template getDeviceBuffer<0>(), segment_offset.size(),
+					(int *)segment_offset.template getDeviceBuffer<0>(), segment_offset.size()-1,
 					(int *)output.template getDeviceBuffer<0>(),
-					mgpu::plus_t<int>(), init, context);
+					gpu::plus_t<int>(), init, context);
 
 
 	output.template deviceToHost<0>();
 
 	bool match = true;
 	size_t i = 0;
-	for ( ; i < segment_offset.size()-1 ; i++)
+	for ( ; i < segment_offset.size()-2 ; i++)
 	{
 		size_t red = 0;
 		for (size_t j = 0 ; j < segment_offset.template get<0>(i+1) - segment_offset.template get<0>(i)  ; j++)
@@ -173,7 +173,7 @@ BOOST_AUTO_TEST_CASE( test_seg_reduce_wrapper )
 	BOOST_REQUIRE_EQUAL(match,true);
 
 	size_t red2 = 0;
-	for (size_t j = 0 ; j < vgpu.size() - segment_offset.template get<0>(i)  ; j++)
+	for (size_t j = 0 ; j < segment_offset.template get<0>(i+1) - segment_offset.template get<0>(i)  ; j++)
 	{
 		red2 += vgpu.template get<0>(segment_offset.template get<0>(i) + j);
 	}
diff --git a/src/util/cuda/segreduce_ofp.cuh b/src/util/cuda/segreduce_ofp.cuh
index 9a0c3764..57eae7cd 100644
--- a/src/util/cuda/segreduce_ofp.cuh
+++ b/src/util/cuda/segreduce_ofp.cuh
@@ -10,64 +10,22 @@
  
  #ifdef __NVCC__
  
- #include "Vector/map_vector.hpp"
  #include "util/cuda_launch.hpp"
- #include "util/cuda/segreduce_ofp.cuh"
+ #include "util/ofp_context.hpp"
  
  #if CUDART_VERSION >= 11000
-     #ifndef CUDA_ON_CPU 
-     // Here we have for sure CUDA >= 11
-     #ifdef __HIP__
-        #undef __CUDACC__
-        #undef __CUDA__
-        #include <thrust/reduce.h>
-        #define __CUDACC__
-        #define __CUDA__
-     #else
-        #include "util/cuda/moderngpu/kernel_segreduce.hxx"
-     #endif
-     #endif
- #else
-    #include "util/cuda/moderngpu/kernel_segreduce.hxx"
- #endif
- #include "util/cuda/ofp_context.hxx"
+    // Here CUDA >= 11 is guaranteed, so CUB ships with the toolkit
+    #ifndef CUDA_ON_CPU
+        #ifdef __HIP__
+            #include "hipcub/hipcub.hpp"
+        #else
+            #include "cub/cub.cuh"
+        #endif
+    #endif
+#else
+    #include "cub_old/cub.cuh"
+#endif
  
-template<typename segments_it, typename keys_type, typename output_it, typename seg_type, typename type_t>
-__global__ void seg_to_keys(segments_it segs, keys_type keys, seg_type seg_out ,output_it output, int n_count, int num_segments,type_t init)
-{
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (tid >= num_segments)    {return;}
-
-    int s = segs[tid];
-    int s_p1 = (tid == num_segments -1)?n_count:segs[tid+1];
-
-    int n_ele = s_p1 - s;
-
-    seg_out.template get<1>(tid) = (s != s_p1);
-    output[tid] = init;
-
-    for (int j = 0 ; j < n_ele ; j++)
-    {
-        keys.template get<0>(s + j) = tid;
-    }
-}
-
-template<typename output_it, typename out_tmp_type ,typename segs_type>
-__global__ void realign_output(output_it out, out_tmp_type out_tmp, segs_type segs, int num_segments)
-{
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (tid >= num_segments)    {return;}
-
-    int t = segs.template get<2>(tid);
-    int to_copy = segs.template get<1>(tid);
-
-    auto op = out_tmp.template get<0>(t);
-
-    if (to_copy == 1)
-    {out[tid] = op;}
-}
 
  namespace openfpm
  {
@@ -75,7 +33,7 @@ __global__ void realign_output(output_it out, out_tmp_type out_tmp, segs_type se
              typename segments_it, typename output_it, typename op_t, typename type_t>
     void segreduce(input_it input, int count, segments_it segments,
                     int num_segments, output_it output, op_t op, type_t init,
-                    mgpu::ofp_context_t & context)
+                    gpu::ofp_context_t & context)
      {
  #ifdef CUDA_ON_CPU
  
@@ -106,47 +64,33 @@ __global__ void realign_output(output_it out, out_tmp_type out_tmp, segs_type se
         }
  
  #else
-
         #ifdef __HIP__
 
-            typedef typename std::remove_pointer<segments_it>::type index_type;
-            typedef typename std::remove_pointer<output_it>::type out_type;
-
-            openfpm::vector_gpu<aggregate<index_type>> keys;
-            keys.resize(count);
-
-            openfpm::vector_gpu<aggregate<index_type,index_type,index_type>> segs_out;
-            segs_out.resize(num_segments);
+            size_t temp_storage_bytes = 0;
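+            // segments holds num_segments + 1 offsets: segments is used as the
+            // begin-offset iterator and segments + 1 as the end-offset iterator.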
 
-            openfpm::vector_gpu<aggregate<out_type>> out_tmp;
-            out_tmp.resize(num_segments);
+            hipcub::DeviceSegmentedReduce::Reduce(NULL, temp_storage_bytes, input, output,
+                num_segments, segments, segments + 1, op, init);
 
-            grid_sm<1,void> g(num_segments);
+            auto & temporal = context.getTemporalCUB();
+            temporal.resize(temp_storage_bytes);
 
-            auto it = g.getGPUIterator();
+            hipcub::DeviceSegmentedReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes, input, output,
+                num_segments, segments, segments + 1, op, init);
 
-            CUDA_LAUNCH(seg_to_keys,it,segments,keys.toKernel(),segs_out.toKernel(),output,count,num_segments,init);
-
-            openfpm::scan((index_type *)segs_out.template getDeviceBuffer<1>(),num_segments,(index_type *)segs_out.template getDeviceBuffer<2>(),context);
+        #else
 
-            thrust::pair<index_type *,out_type *> new_end;
-            new_end = thrust::reduce_by_key(thrust::device, (segments_it)keys.template getDeviceBuffer<0>(),((segments_it)keys.template getDeviceBuffer<0>()) + count, 
-                                            input, 
-                                            (segments_it)segs_out.template getDeviceBuffer<0>(), 
-                                            (output_it)out_tmp.template getDeviceBuffer<0>(),
-                                            thrust::equal_to<int>(),
-                                            op);
+            size_t temp_storage_bytes = 0;
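+            // CUB path: same size-query-then-run protocol and offset convention.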
 
-            // .. Not so easy to emulate a segmented reduce we have to track the zeros segments and realign the output
+            cub::DeviceSegmentedReduce::Reduce(NULL, temp_storage_bytes, input, output,
+                num_segments, segments, segments + 1, op, init);
 
-            CUDA_LAUNCH(realign_output,it,output,out_tmp.toKernel(),segs_out.toKernel(),num_segments);
+            auto & temporal = context.getTemporalCUB();
+            temporal.resize(temp_storage_bytes);
 
-        #else
-
-            mgpu::segreduce(input,count,segments,num_segments,output,op,init,context);
+            cub::DeviceSegmentedReduce::Reduce(temporal.template getDeviceBuffer<0>(), temp_storage_bytes, input, output,
+                num_segments, segments, segments + 1, op, init);
 
         #endif
-
  #endif
      }
  }
diff --git a/src/util/cuda/sort_ofp.cuh b/src/util/cuda/sort_ofp.cuh
index 52deb9d6..d689668b 100644
--- a/src/util/cuda/sort_ofp.cuh
+++ b/src/util/cuda/sort_ofp.cuh
@@ -12,27 +12,22 @@
 #ifdef __NVCC__
 
 #include "util/cuda_launch.hpp"
+#include "util/ofp_context.hpp"
 
 #if CUDART_VERSION >= 11000
-	#ifndef CUDA_ON_CPU 
 	// Here we have for sure CUDA >= 11
-	#ifdef __HIP__
-		#include "hipcub/hipcub.hpp"
-	#else
-		#include "cub/cub.cuh"
-	#endif
-	#ifndef SORT_WITH_CUB
-		#define SORT_WITH_CUB
-	#endif
+	#ifndef CUDA_ON_CPU
+		#ifdef __HIP__
+			#include "hipcub/hipcub.hpp"
+		#else
+			#include "cub/cub.cuh"
+		#endif
 	#endif
 #else
 	// Here we have old CUDA
 	#include "cub_old/cub.cuh"
-	//#include "util/cuda/moderngpu/kernel_mergesort.hxx"
-	#define SORT_WITH_CUB
 #endif
 
-#include "util/cuda/ofp_context.hxx"
 
 template<typename key_t, typename val_t>
 struct key_val_ref;
@@ -266,7 +261,7 @@ namespace openfpm
 	template<typename key_t, typename val_t,
 	  typename comp_t>
 	void sort(key_t* keys_input, val_t* vals_input, int count,
-	  comp_t comp, mgpu::ofp_context_t& context)
+	  comp_t comp, gpu::ofp_context_t& context)
 	{
 #ifdef CUDA_ON_CPU
 
@@ -275,142 +270,133 @@ namespace openfpm
 	std::sort(kv,kv+count,comp);
 
 #else
+	#ifdef __HIP__
 
-	#ifdef SORT_WITH_CUB
-
-			#ifdef __HIP__
-
-				void *d_temp_storage = NULL;
-				size_t temp_storage_bytes = 0;
-
-				auto & temporal2 = context.getTemporalCUB2();
-				temporal2.resize(sizeof(key_t)*count);
-
-				auto & temporal3 = context.getTemporalCUB3();
-				temporal3.resize(sizeof(val_t)*count);
-
-				if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true)
-				{
-					hipcub::DeviceRadixSort::SortPairs(d_temp_storage, 
-													temp_storage_bytes,
-													keys_input,
-													(key_t *)temporal2.template getDeviceBuffer<0>(),
-													vals_input,
-													(val_t *)temporal3.template getDeviceBuffer<0>(),
-													count);
-
-					auto & temporal = context.getTemporalCUB();
-					temporal.resize(temp_storage_bytes);
-
-					d_temp_storage = temporal.template getDeviceBuffer<0>();
-
-					// Run
-					hipcub::DeviceRadixSort::SortPairs(d_temp_storage, 
-													temp_storage_bytes,
-													keys_input,
-													(key_t *)temporal2.template getDeviceBuffer<0>(),
-													vals_input,
-													(val_t *)temporal3.template getDeviceBuffer<0>(),
-													count);
-				}
-				else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true)
-				{
-					hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, 
-													temp_storage_bytes,
-													keys_input,
-													(key_t *)temporal2.template getDeviceBuffer<0>(),
-													vals_input,
-													(val_t *)temporal3.template getDeviceBuffer<0>(),
-													count);
-
-					auto & temporal = context.getTemporalCUB();
-					temporal.resize(temp_storage_bytes);
-
-					d_temp_storage = temporal.template getDeviceBuffer<0>();
-
-					// Run
-					hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage, 
-													temp_storage_bytes,
-													keys_input,
-													(key_t *)temporal2.template getDeviceBuffer<0>(),
-													vals_input,
-													(val_t *)temporal3.template getDeviceBuffer<0>(),
-													count);
-				}
-
-				cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice);
-				cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice);
-			
-
-			#else
-
-				void *d_temp_storage = NULL;
-				size_t temp_storage_bytes = 0;
-
-				auto & temporal2 = context.getTemporalCUB2();
-				temporal2.resize(sizeof(key_t)*count);
-
-				auto & temporal3 = context.getTemporalCUB3();
-				temporal3.resize(sizeof(val_t)*count);
-
-				if (std::is_same<mgpu::template less_t<key_t>,comp_t>::value == true)
-				{
-					cub::DeviceRadixSort::SortPairs(d_temp_storage, 
-													temp_storage_bytes,
-													keys_input,
-													(key_t *)temporal2.template getDeviceBuffer<0>(),
-													vals_input,
-													(val_t *)temporal3.template getDeviceBuffer<0>(),
-													count);
-
-					auto & temporal = context.getTemporalCUB();
-					temporal.resize(temp_storage_bytes);
-
-					d_temp_storage = temporal.template getDeviceBuffer<0>();
-
-					// Run
-					cub::DeviceRadixSort::SortPairs(d_temp_storage, 
-													temp_storage_bytes,
-													keys_input,
-													(key_t *)temporal2.template getDeviceBuffer<0>(),
-													vals_input,
-													(val_t *)temporal3.template getDeviceBuffer<0>(),
-													count);
-				}
-				else if (std::is_same<mgpu::template greater_t<key_t>,comp_t>::value == true)
-				{
-					cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, 
-													temp_storage_bytes,
-													keys_input,
-													(key_t *)temporal2.template getDeviceBuffer<0>(),
-													vals_input,
-													(val_t *)temporal3.template getDeviceBuffer<0>(),
-													count);
-
-					auto & temporal = context.getTemporalCUB();
-					temporal.resize(temp_storage_bytes);
-
-					d_temp_storage = temporal.template getDeviceBuffer<0>();
-
-					// Run
-					cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, 
-													temp_storage_bytes,
-													keys_input,
-													(key_t *)temporal2.template getDeviceBuffer<0>(),
-													vals_input,
-													(val_t *)temporal3.template getDeviceBuffer<0>(),
-													count);
-				}
-
-				cudaMemcpy(keys_input,temporal2.getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice);
-				cudaMemcpy(vals_input,temporal3.getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice);
-				
-			#endif
+		void *d_temp_storage = NULL;
+		size_t temp_storage_bytes = 0;
+
+		auto & temporal2 = context.getTemporalCUB2();
+		temporal2.resize(sizeof(key_t)*count);
+
+		auto & temporal3 = context.getTemporalCUB3();
+		temporal3.resize(sizeof(val_t)*count);
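+		// DeviceRadixSort sorts out of place: temporal2/temporal3 receive the
+		// sorted keys/values and are copied back over the inputs below. Only
+		// gpu::less_t / gpu::greater_t comparators are recognized here.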
+
+		if (std::is_same<gpu::template less_t<key_t>,comp_t>::value == true)
+		{
+			hipcub::DeviceRadixSort::SortPairs(d_temp_storage,
+											temp_storage_bytes,
+											keys_input,
+											(key_t *)temporal2.template getDeviceBuffer<0>(),
+											vals_input,
+											(val_t *)temporal3.template getDeviceBuffer<0>(),
+											count);
+
+			auto & temporal = context.getTemporalCUB();
+			temporal.resize(temp_storage_bytes);
+
+			d_temp_storage = temporal.template getDeviceBuffer<0>();
+
+			// Run
+			hipcub::DeviceRadixSort::SortPairs(d_temp_storage,
+											temp_storage_bytes,
+											keys_input,
+											(key_t *)temporal2.template getDeviceBuffer<0>(),
+											vals_input,
+											(val_t *)temporal3.template getDeviceBuffer<0>(),
+											count);
+		}
+		else if (std::is_same<gpu::template greater_t<key_t>,comp_t>::value == true)
+		{
+			hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
+											temp_storage_bytes,
+											keys_input,
+											(key_t *)temporal2.template getDeviceBuffer<0>(),
+											vals_input,
+											(val_t *)temporal3.template getDeviceBuffer<0>(),
+											count);
+
+			auto & temporal = context.getTemporalCUB();
+			temporal.resize(temp_storage_bytes);
+
+			d_temp_storage = temporal.template getDeviceBuffer<0>();
+
+			// Run
+			hipcub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
+											temp_storage_bytes,
+											keys_input,
+											(key_t *)temporal2.template getDeviceBuffer<0>(),
+											vals_input,
+											(val_t *)temporal3.template getDeviceBuffer<0>(),
+											count);
+		}
+
+		cudaMemcpy(keys_input,temporal2.template getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice);
+		cudaMemcpy(vals_input,temporal3.template getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice);
 
 	#else
-			mgpu::mergesort(keys_input,vals_input,count,comp,context);
-	#endif
 
+		void *d_temp_storage = NULL;
+		size_t temp_storage_bytes = 0;
+
+		auto & temporal2 = context.getTemporalCUB2();
+		temporal2.resize(sizeof(key_t)*count);
+
+		auto & temporal3 = context.getTemporalCUB3();
+		temporal3.resize(sizeof(val_t)*count);
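+		// Same out-of-place radix sort on the CUDA path; the comparator type
+		// merely selects SortPairs (ascending) vs SortPairsDescending.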
+
+		if (std::is_same<gpu::template less_t<key_t>,comp_t>::value == true)
+		{
+			cub::DeviceRadixSort::SortPairs(d_temp_storage,
+											temp_storage_bytes,
+											keys_input,
+											(key_t *)temporal2.template getDeviceBuffer<0>(),
+											vals_input,
+											(val_t *)temporal3.template getDeviceBuffer<0>(),
+											count);
+
+			auto & temporal = context.getTemporalCUB();
+			temporal.resize(temp_storage_bytes);
+
+			d_temp_storage = temporal.template getDeviceBuffer<0>();
+
+			// Run
+			cub::DeviceRadixSort::SortPairs(d_temp_storage,
+											temp_storage_bytes,
+											keys_input,
+											(key_t *)temporal2.template getDeviceBuffer<0>(),
+											vals_input,
+											(val_t *)temporal3.template getDeviceBuffer<0>(),
+											count);
+		}
+		else if (std::is_same<gpu::template greater_t<key_t>,comp_t>::value == true)
+		{
+			cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
+											temp_storage_bytes,
+											keys_input,
+											(key_t *)temporal2.template getDeviceBuffer<0>(),
+											vals_input,
+											(val_t *)temporal3.template getDeviceBuffer<0>(),
+											count);
+
+			auto & temporal = context.getTemporalCUB();
+			temporal.resize(temp_storage_bytes);
+
+			d_temp_storage = temporal.template getDeviceBuffer<0>();
+
+			// Run
+			cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
+											temp_storage_bytes,
+											keys_input,
+											(key_t *)temporal2.template getDeviceBuffer<0>(),
+											vals_input,
+											(val_t *)temporal3.template getDeviceBuffer<0>(),
+											count);
+		}
+
+		cudaMemcpy(keys_input,temporal2.template getDeviceBuffer<0>(),sizeof(key_t)*count,cudaMemcpyDeviceToDevice);
+		cudaMemcpy(vals_input,temporal3.template getDeviceBuffer<0>(),sizeof(val_t)*count,cudaMemcpyDeviceToDevice);
+
+	#endif
 #endif
 	}
 }
diff --git a/src/util/cuda/test/segreduce_block_cuda_tests.cu b/src/util/cuda/test/segreduce_block_cuda_tests.cu
index 573f7f91..5d67eefa 100644
--- a/src/util/cuda/test/segreduce_block_cuda_tests.cu
+++ b/src/util/cuda/test/segreduce_block_cuda_tests.cu
@@ -165,8 +165,8 @@ BOOST_AUTO_TEST_SUITE(segreduce_block_cuda_tests)
 
         // template<unsigned int chunksPerBlock, typename op, typename SegType, typename DataType, typename MaskType>
         // segreduce(DataType *data, SegType *segments, MaskType *masks, DataType *output, MaskType *outputMasks)
-//        segreduce<2, mgpu::maximum_t<ScalarT>> <<< outputData.size(), 2*BlockT::size >>> (
-        CUDA_LAUNCH_DIM3((segreduce_block<2, mgpu::plus_t<ScalarT>>), outputData.size(), 2*BlockT::size,
+//        segreduce<2, gpu::maximum_t<ScalarT>> <<< outputData.size(), 2*BlockT::size >>> (
+        CUDA_LAUNCH_DIM3((segreduce_block<2, gpu::plus_t<ScalarT>>), outputData.size(), 2*BlockT::size,
                         (BlockT *) data.template getDeviceBuffer<BLOCK>(),
                         (int *) segments.template getDeviceBuffer<0>(),
                         (MaskBlockT *) data.template getDeviceBuffer<BITMASK>(),
@@ -174,7 +174,7 @@ BOOST_AUTO_TEST_SUITE(segreduce_block_cuda_tests)
                                 );
 
         // Segreduce on mask
-        CUDA_LAUNCH_DIM3((segreduce_block<2, mgpu::maximum_t<unsigned char>>), outputData.size(), 2*BlockT::size,
+        CUDA_LAUNCH_DIM3((segreduce_block<2, gpu::maximum_t<unsigned char>>), outputData.size(), 2*BlockT::size,
                         (MaskBlockT *) data.template getDeviceBuffer<BITMASK>(),
                         (int *) segments.template getDeviceBuffer<0>(),
                         (MaskBlockT *) data.template getDeviceBuffer<BITMASK>(),
-- 
GitLab