#ifndef CUDIFY_SEQUENCIAL_HPP_
#define CUDIFY_SEQUENCIAL_HPP_

#define CUDA_ON_BACKEND CUDA_BACKEND_SEQUENTIAL

#include "config.h"

constexpr int default_kernel_wg_threads_ = 1024;

#include "cudify_hardware_common.hpp"

#ifdef HAVE_BOOST_CONTEXT

#include "util/cuda_util.hpp"
#include <boost/bind/bind.hpp>
#include <type_traits>
#ifdef HAVE_BOOST_CONTEXT
#include <boost/context/continuation.hpp>
#endif
#include <vector>
#include <array>        // std::array used by the cub::BlockScan shim below
#include <algorithm>    // std::max / std::min used by the mgpu functors below
#include <string.h>

#ifndef CUDIFY_BOOST_CONTEXT_STACK_SIZE
#define CUDIFY_BOOST_CONTEXT_STACK_SIZE 8192
#endif

// Fiber stacks for the emulated threads of a block, and their Boost.Context contexts
extern std::vector<void *> mem_stack;
extern std::vector<boost::context::detail::fcontext_t> contexts;

// Emulated CUDA built-ins: thread/block indices are per-fiber (thread_local),
// grid and block sizes are set once per launch
extern thread_local dim3 threadIdx;
extern thread_local dim3 blockIdx;
static dim3 blockDim;
static dim3 gridDim;

extern thread_local void * par_glob;
extern thread_local boost::context::detail::fcontext_t main_ctx;

// A barrier simply yields the current fiber back to the scheduler loop in exe_kernel*()
static void __syncthreads()
{
    boost::context::detail::jump_fcontext(main_ctx,par_glob);
};

extern int thread_local vct_atomic_add;
extern int thread_local vct_atomic_rem;

namespace cub
{
    // Minimal sequential stand-in for cub::BlockScan, API-compatible with the CUDA version
    template<typename T, unsigned int dim>
    class BlockScan
    {
    public:
        typedef std::array<T,dim> TempStorage;

    private:
        TempStorage & tmp;

    public:
        BlockScan(TempStorage & tmp)
        :tmp(tmp)
        {};

        void ExclusiveSum(T & in, T & out)
        {
            tmp[threadIdx.x] = in;

            __syncthreads();

            // Thread 0 turns the gathered inputs into an exclusive prefix sum in place
            if (threadIdx.x == 0)
            {
                T prec = tmp[0];
                tmp[0] = 0;
                for (int i = 1 ; i < dim ; i++)
                {
                    auto next = tmp[i-1] + prec;
                    prec = tmp[i];
                    tmp[i] = next;
                }
            }

            __syncthreads();

            out = tmp[threadIdx.x];
            return;
        }
    };
}

// Sequential atomicAdd: no real atomicity is required, there is only one OS thread
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
    T old = *address;
    *address += val;
    return old;
};
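// Usage sketch (illustrative only, not part of the backend): the BlockScan shim and
// atomicAdd() above are meant to let kernels written against the CUB/CUDA API compile
// unchanged on this sequential backend. The kernel below is hypothetical and assumes
// that __global__/__shared__ are mapped by the cudify headers so the TempStorage is
// visible to every fiber of the emulated block.
//
//   template<unsigned int blockSize>
//   __global__ void exclusive_scan_example(int * in, int * out)
//   {
//       typedef cub::BlockScan<int, blockSize> BlockScanT;
//       __shared__ typename BlockScanT::TempStorage temp;
//
//       int tid  = blockIdx.x * blockSize + threadIdx.x;
//       int val  = in[tid];
//       int scan;
//       BlockScanT(temp).ExclusiveSum(val, scan);   // cooperative: relies on __syncthreads()
//       out[tid] = scan;
//   }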
#define MGPU_HOST_DEVICE

namespace mgpu
{
    template<typename type_t>
    struct less_t : public std::binary_function<type_t, type_t, bool>
    {
        bool operator()(type_t a, type_t b) const {
            return a < b;
        }
        template<typename type2_t, typename type3_t>
        bool operator()(type2_t a, type3_t b) const {
            return a < b;
        }
    };
/*    template<typename type_t>
    struct less_equal_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a <= b;
        }
    };*/
    template<typename type_t>
    struct greater_t : public std::binary_function<type_t, type_t, bool>
    {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a > b;
        }
        template<typename type2_t, typename type3_t>
        MGPU_HOST_DEVICE bool operator()(type2_t a, type3_t b) const {
            return a > b;
        }
    };
/*    template<typename type_t>
    struct greater_equal_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a >= b;
        }
    };
    template<typename type_t>
    struct equal_to_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a == b;
        }
    };
    template<typename type_t>
    struct not_equal_to_t : public std::binary_function<type_t, type_t, bool> {
        MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
            return a != b;
        }
    };*/

    ////////////////////////////////////////////////////////////////////////////////
    // Device-side arithmetic operators.

    template<typename type_t>
    struct plus_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return a + b;
        }
    };
/*    template<typename type_t>
    struct minus_t : public std::binary_function<type_t, type_t, type_t> {
        MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
            return a - b;
        }
    };
    template<typename type_t>
    struct multiplies_t : public std::binary_function<type_t, type_t, type_t> {
        MGPU_HOST_DEVICE type_t operator()(type_t a, type_t b) const {
            return a * b;
        }
    };*/
    template<typename type_t>
    struct maximum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::max(a, b);
        }
    };
    template<typename type_t>
    struct minimum_t : public std::binary_function<type_t, type_t, type_t> {
        type_t operator()(type_t a, type_t b) const {
            return std::min(a, b);
        }
    };
}

namespace mgpu
{
    template<typename input_it, typename segments_it, typename output_it,
             typename op_t, typename type_t, typename context_t>
    void segreduce(input_it input, int count, segments_it segments, int num_segments,
                   output_it output, op_t op, type_t init, context_t& context)
    {
        int i = 0;
        for ( ; i < num_segments - 1; i++)
        {
            int j = segments[i];
            output[i] = input[j];
            ++j;
            for ( ; j < segments[i+1] ; j++)
            {
                output[i] = op(output[i],input[j]);
            }
        }

        // Last segment
        int j = segments[i];
        output[i] = input[j];
        ++j;
        for ( ; j < count ; j++)
        {
            output[i] = op(output[i],input[j]);
        }
    }

    // Key-value merge.
    template<typename a_keys_it, typename a_vals_it,
             typename b_keys_it, typename b_vals_it,
             typename c_keys_it, typename c_vals_it,
             typename comp_t, typename context_t>
    void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
               b_keys_it b_keys, b_vals_it b_vals, int b_count,
               c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
    {
        int a_it = 0;
        int b_it = 0;
        int c_it = 0;

        while (a_it < a_count || b_it < b_count)
        {
            if (a_it < a_count)
            {
                if (b_it < b_count)
                {
                    if (comp(b_keys[b_it],a_keys[a_it]))
                    {
                        c_keys[c_it] = b_keys[b_it];
                        c_vals[c_it] = b_vals[b_it];
                        c_it++;
                        b_it++;
                    }
                    else
                    {
                        c_keys[c_it] = a_keys[a_it];
                        c_vals[c_it] = a_vals[a_it];
                        c_it++;
                        a_it++;
                    }
                }
                else
                {
                    c_keys[c_it] = a_keys[a_it];
                    c_vals[c_it] = a_vals[a_it];
                    c_it++;
                    a_it++;
                }
            }
            else
            {
                c_keys[c_it] = b_keys[b_it];
                c_vals[c_it] = b_vals[b_it];
                c_it++;
                b_it++;
            }
        }
    }
}
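// Usage sketch (illustrative, hypothetical data): segreduce() reduces every segment
// [segments[i], segments[i+1]) of `input` with `op`, seeding each result with the first
// element of the segment; the last segment runs up to `count`. The `init` and `context`
// arguments are accepted for API compatibility with moderngpu and ignored here.
//
//   int values[7]   = {1, 2, 3, 4, 5, 6, 7};
//   int segments[3] = {0, 2, 5};                 // segments [0,2) [2,5) [5,7)
//   int sums[3];
//   some_context_t ctx;                          // any context object; unused by this path
//   mgpu::segreduce(values, 7, segments, 3, sums, mgpu::plus_t<int>(), 0, ctx);
//   // sums == {3, 12, 13}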
static void init_wrappers() {}

// Wraps a lambda so launch_kernel() can invoke it through a void pointer
template<typename lambda_f>
struct Fun_enc
{
    lambda_f Fn;

    Fun_enc(lambda_f Fn)
    :Fn(Fn)
    {}

    void run()
    {
        Fn();
    }
};

// Same as Fun_enc, but forwards explicit blockIdx/threadIdx references to the lambda
template<typename lambda_f>
struct Fun_enc_bt
{
    lambda_f Fn;
    dim3 & blockIdx;
    dim3 & threadIdx;

    Fun_enc_bt(lambda_f Fn,dim3 & blockIdx,dim3 & threadIdx)
    :Fn(Fn),blockIdx(blockIdx),threadIdx(threadIdx)
    {}

    void run()
    {
        Fn(blockIdx,threadIdx);
    }
};

// Fiber entry point: run the encoded kernel body, then jump back to the scheduler
template<typename Fun_enc_type>
void launch_kernel(boost::context::detail::transfer_t par)
{
    main_ctx = par.fctx;
    par_glob = par.data;

    Fun_enc_type * ptr = (Fun_enc_type *)par.data;

    ptr->run();

    boost::context::detail::jump_fcontext(par.fctx,0);
}

template<typename lambda_f, typename ite_type>
static void exe_kernel(lambda_f f, ite_type & ite)
{
    if (ite.nthrs() == 0 || ite.nblocks() == 0)
    {return;}

    // Grow the pool of fiber stacks to one stack per emulated thread of a block
    if (mem_stack.size() < ite.nthrs())
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    // Resize contexts
    contexts.resize(mem_stack.size());

    Fun_enc<lambda_f> fe(f);

    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        blockIdx.z = i;
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            blockIdx.y = j;
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.x = k;

                // Create one fiber per emulated thread of this block
                int nc = 0;
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
                            contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
                            nc++;
                        }
                    }
                }

                // Resume every fiber once per pass; fibers parked on __syncthreads()
                // report non-zero data, finished fibers report zero
                bool work_to_do = true;
                while(work_to_do)
                {
                    nc = 0;
                    // Work threads
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.z = it;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.y = jt;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                threadIdx.x = kt;
                                auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                contexts[nc] = t.fctx;

                                work_to_do &= (t.data != 0);
                                nc++;
                            }
                        }
                    }
                }
            }
        }
    }
}

template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda(lambda_f f, ite_type & ite)
{
    if (ite.nthrs() == 0 || ite.nblocks() == 0)
    {return;}

    if (mem_stack.size() < ite.nthrs())
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    // Resize contexts
    contexts.resize(mem_stack.size());

    bool is_sync_free = true;

    bool first_block = true;

    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                dim3 blockIdx;
                dim3 threadIdx;
                Fun_enc_bt<lambda_f> fe(f,blockIdx,threadIdx);

                // The first block always runs on fibers; later blocks do so only if a
                // __syncthreads() was observed (is_sync_free became false)
                if (first_block == true || is_sync_free == false)
                {
                    blockIdx.z = i;
                    blockIdx.y = j;
                    blockIdx.x = k;
                    int nc = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc_bt<lambda_f>>);
                                nc++;
                            }
                        }
                    }

                    bool work_to_do = true;
                    while(work_to_do)
                    {
                        nc = 0;
                        // Work threads
                        for (int it = 0 ; it < ite.thr.z ; it++)
                        {
                            threadIdx.z = it;
                            for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            {
                                threadIdx.y = jt;
                                for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                {
                                    threadIdx.x = kt;
                                    auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                    contexts[nc] = t.fctx;

                                    work_to_do &= (t.data != 0);
                                    is_sync_free &= !(work_to_do);
                                    nc++;
                                }
                            }
                        }
                    }
                }
                else
                {
                    // Sync-free fast path: call the lambda directly, no fibers
                    blockIdx.z = i;
                    blockIdx.y = j;
                    blockIdx.x = k;

                    int fb = 0;
                    // Work threads
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.z = it;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.y = jt;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                threadIdx.x = kt;
                                f(blockIdx,threadIdx);
                            }
                        }
                    }
                }

                first_block = false;
            }
        }
    }
}
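// Scheduling sketch (illustrative): exe_kernel() above runs each emulated CUDA thread of
// a block as a Boost.Context fiber on the stacks in mem_stack. A __syncthreads() inside
// the kernel jumps back to the scheduler, which then resumes every fiber once more, so
// all fibers advance in lock-step between barriers. A fiber parked at a barrier reports
// non-zero data, a fiber that returned from the kernel reports zero, and the pass loop
// stops once a pass sees the fibers run to completion (CUDA semantics guarantee all
// threads of a block reach the same barriers). The call shape, with a hypothetical
// kernel and arguments, looks like:
//
//   ite_gpu<1> ite;
//   ite.wthr = dim3(4,1,1);      // 4 blocks
//   ite.thr  = dim3(32,1,1);     // 32 emulated threads (fibers) per block
//   exe_kernel([&]() { my_kernel(my_args); }, ite);   // my_kernel/my_args are placeholders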
template<typename lambda_f, typename ite_type>
static void exe_kernel_lambda_tls(lambda_f f, ite_type & ite)
{
    if (ite.nthrs() == 0 || ite.nblocks() == 0)
    {return;}

    if (mem_stack.size() < ite.nthrs())
    {
        int old_size = mem_stack.size();
        mem_stack.resize(ite.nthrs());

        for (int i = old_size ; i < mem_stack.size() ; i++)
        {
            mem_stack[i] = new char [CUDIFY_BOOST_CONTEXT_STACK_SIZE];
        }
    }

    // Resize contexts
    contexts.resize(mem_stack.size());

    bool is_sync_free = true;

    bool first_block = true;

    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                Fun_enc<lambda_f> fe(f);

                // The first block always runs on fibers; later blocks do so only if a
                // __syncthreads() was observed (is_sync_free became false)
                if (first_block == true || is_sync_free == false)
                {
                    blockIdx.z = i;
                    blockIdx.y = j;
                    blockIdx.x = k;
                    int nc = 0;
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                contexts[nc] = boost::context::detail::make_fcontext((char *)mem_stack[nc]+CUDIFY_BOOST_CONTEXT_STACK_SIZE-16,CUDIFY_BOOST_CONTEXT_STACK_SIZE,launch_kernel<Fun_enc<lambda_f>>);
                                nc++;
                            }
                        }
                    }

                    bool work_to_do = true;
                    while(work_to_do)
                    {
                        nc = 0;
                        // Work threads
                        for (int it = 0 ; it < ite.thr.z ; it++)
                        {
                            threadIdx.z = it;
                            for (int jt = 0 ; jt < ite.thr.y ; jt++)
                            {
                                threadIdx.y = jt;
                                for (int kt = 0 ; kt < ite.thr.x ; kt++)
                                {
                                    threadIdx.x = kt;
                                    auto t = boost::context::detail::jump_fcontext(contexts[nc],&fe);
                                    contexts[nc] = t.fctx;

                                    work_to_do &= (t.data != 0);
                                    is_sync_free &= !(work_to_do);
                                    nc++;
                                }
                            }
                        }
                    }
                }
                else
                {
                    // Sync-free fast path: call the lambda directly, no fibers
                    blockIdx.z = i;
                    blockIdx.y = j;
                    blockIdx.x = k;

                    int fb = 0;
                    // Work threads
                    for (int it = 0 ; it < ite.thr.z ; it++)
                    {
                        threadIdx.z = it;
                        for (int jt = 0 ; jt < ite.thr.y ; jt++)
                        {
                            threadIdx.y = jt;
                            for (int kt = 0 ; kt < ite.thr.x ; kt++)
                            {
                                threadIdx.x = kt;
                                f();
                            }
                        }
                    }
                }

                first_block = false;
            }
        }
    }
}

// Variant for kernels known to contain no __syncthreads(): plain nested loops, no fibers
template<typename lambda_f, typename ite_type>
static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
{
    for (int i = 0 ; i < ite.wthr.z ; i++)
    {
        blockIdx.z = i;
        for (int j = 0 ; j < ite.wthr.y ; j++)
        {
            blockIdx.y = j;
            for (int k = 0 ; k < ite.wthr.x ; k++)
            {
                blockIdx.x = k;

                int fb = 0;
                // Work threads (the thread loops iterate over ite.thr, not ite.wthr)
                for (int it = 0 ; it < ite.thr.z ; it++)
                {
                    threadIdx.z = it;
                    for (int jt = 0 ; jt < ite.thr.y ; jt++)
                    {
                        threadIdx.y = jt;
                        for (int kt = 0 ; kt < ite.thr.x ; kt++)
                        {
                            threadIdx.x = kt;
                            f();
                        }
                    }
                }
            }
        }
    }
}
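// Fast-path sketch (illustrative): exe_kernel_lambda_tls() above executes the first block
// (and every block after a detected barrier) on fibers; if no fiber ever yields through
// __syncthreads(), is_sync_free stays true and the remaining blocks call the lambda in a
// plain triple loop, skipping fiber creation entirely. With the hypothetical buffers
// below, the first lambda keeps the fast path while the second forces fibers for every
// block because of the barrier:
//
//   auto copy_only = [&]() {
//       out[threadIdx.x] = in[threadIdx.x];                    // no barrier: sync-free
//   };
//   auto reverse_in_block = [&]() {
//       scratch[threadIdx.x] = in[threadIdx.x];
//       __syncthreads();                                        // barrier: needs fibers
//       out[threadIdx.x] = scratch[blockDim.x - 1 - threadIdx.x];
//   };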
#ifdef PRINT_CUDA_LAUNCHES

#define CUDA_LAUNCH(cuda_call,ite, ...)\
        {\
        gridDim.x = ite.wthr.x;\
        gridDim.y = ite.wthr.y;\
        gridDim.z = ite.wthr.z;\
        \
        blockDim.x = ite.thr.x;\
        blockDim.y = ite.thr.y;\
        blockDim.z = ite.thr.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        std::cout << "Launching: " << #cuda_call << std::endl;\
        \
        exe_kernel(\
        [&]() -> void {\
        \
            cuda_call(__VA_ARGS__);\
        },ite);\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        \
        ite_gpu<1> itg;\
        itg.wthr = wthr_;\
        itg.thr = thr_;\
        \
        gridDim.x = wthr__.x;\
        gridDim.y = wthr__.y;\
        gridDim.z = wthr__.z;\
        \
        blockDim.x = thr__.x;\
        blockDim.y = thr__.y;\
        blockDim.z = thr__.z;\
        \
        CHECK_SE_CLASS1_PRE\
        std::cout << "Launching: " << #cuda_call << std::endl;\
        \
        exe_kernel(\
        [&]() -> void {\
        \
            cuda_call(__VA_ARGS__);\
        \
        },itg);\
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_CHECK()

#else

#define CUDA_LAUNCH(cuda_call,ite, ...) \
        {\
        gridDim.x = ite.wthr.x;\
        gridDim.y = ite.wthr.y;\
        gridDim.z = ite.wthr.z;\
        \
        blockDim.x = ite.thr.x;\
        blockDim.y = ite.thr.y;\
        blockDim.z = ite.thr.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        exe_kernel([&]() -> void {\
        \
            cuda_call(__VA_ARGS__);\
        \
        },ite);\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA(ite,lambda_f) \
        {\
        gridDim.x = ite.wthr.x;\
        gridDim.y = ite.wthr.y;\
        gridDim.z = ite.wthr.z;\
        \
        blockDim.x = ite.thr.x;\
        blockDim.y = ite.thr.y;\
        blockDim.z = ite.thr.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        exe_kernel_lambda(lambda_f,ite);\
        \
        CHECK_SE_CLASS1_POST("lambda",0)\
        }

#define CUDA_LAUNCH_LAMBDA_TLS(ite,lambda_f) \
        {\
        gridDim.x = ite.wthr.x;\
        gridDim.y = ite.wthr.y;\
        gridDim.z = ite.wthr.z;\
        \
        blockDim.x = ite.thr.x;\
        blockDim.y = ite.thr.y;\
        blockDim.z = ite.thr.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        exe_kernel_lambda_tls(lambda_f,ite);\
        \
        CHECK_SE_CLASS1_POST("lambda",0)\
        }

#define CUDA_LAUNCH_DIM3(cuda_call,wthr_,thr_, ...)\
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        \
        ite_gpu<1> itg;\
        itg.wthr = wthr_;\
        itg.thr = thr_;\
        \
        gridDim.x = wthr__.x;\
        gridDim.y = wthr__.y;\
        gridDim.z = wthr__.z;\
        \
        blockDim.x = thr__.x;\
        blockDim.y = thr__.y;\
        blockDim.z = thr__.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        exe_kernel([&]() -> void {\
        \
            cuda_call(__VA_ARGS__);\
        \
        },itg);\
        \
        CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
        }

#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_,lambda_f) \
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        \
        ite_gpu<1> itg;\
        itg.wthr = wthr_;\
        itg.thr = thr_;\
        gridDim.x = itg.wthr.x;\
        gridDim.y = itg.wthr.y;\
        gridDim.z = itg.wthr.z;\
        \
        blockDim.x = itg.thr.x;\
        blockDim.y = itg.thr.y;\
        blockDim.z = itg.thr.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        exe_kernel_lambda_tls(lambda_f,itg);\
        \
        }

#define CUDA_LAUNCH_DIM3_DEBUG_SE1(cuda_call,wthr_,thr_, ...)\
        {\
        dim3 wthr__(wthr_);\
        dim3 thr__(thr_);\
        \
        ite_gpu<1> itg;\
        itg.wthr = wthr_;\
        itg.thr = thr_;\
        \
        gridDim.x = wthr__.x;\
        gridDim.y = wthr__.y;\
        gridDim.z = wthr__.z;\
        \
        blockDim.x = thr__.x;\
        blockDim.y = thr__.y;\
        blockDim.z = thr__.z;\
        \
        CHECK_SE_CLASS1_PRE\
        \
        exe_kernel([&]() -> void {\
        \
            cuda_call(__VA_ARGS__);\
        \
        },itg);\
        \
        }

#define CUDA_CHECK()

#endif // PRINT_CUDA_LAUNCHES

#endif

#else

constexpr int default_kernel_wg_threads_ = 1024;

#endif /* CUDIFY_SEQUENCIAL_HPP_ */
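// Launch sketch (illustrative, hypothetical kernel and container): on this backend the
// CUDA_LAUNCH* macros expand to the fiber-based exe_kernel*() calls above instead of a
// <<<...>>> launch, so the same call sites work unchanged in a sequential build.
//
//   auto ite = vector.getGPUIterator();                       // any ite_gpu<...> iterator
//   CUDA_LAUNCH(my_kernel, ite, vector.toKernel());           // my_kernel is a placeholder
//
//   CUDA_LAUNCH_DIM3(my_kernel, 16, 64, vector.toKernel());   // 16 blocks of 64 threads
//
//   CUDA_LAUNCH_LAMBDA_TLS(ite, [&]() { my_kernel_body(); }); // lambda reads threadIdx/blockIdx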