
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Projects compared:
  • mosaic/software/parallel-computing/openfpm/openfpm_devices
  • argupta/openfpm_devices
Commits on Source (2)
@@ -61,6 +61,8 @@
#define CUDA_BACKEND_OPENMP 4
#define CUDA_BACKEND_HIP 5
#if defined(CUDIFY_USE_CUDA)
#include "cudify/cuda/cudify_cuda.hpp"
#elif defined(CUDIFY_USE_ALPAKA)
@@ -78,6 +80,7 @@ constexpr int default_kernel_wg_threads_ = 1024;
static void init_wrappers() {}
#endif
#endif /* OPENFPM_DATA_SRC_UTIL_CUDA_UTIL_HPP_ */
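The CUDA_BACKEND_* constants above are meant to be compared against CUDA_ON_BACKEND, which the selected cudify header defines (the next hunk shows cudify_cuda.hpp setting it to CUDA_BACKEND_CUDA). A minimal, self-contained sketch of that compile-time dispatch, using only the two constant values visible in this hunk and a hypothetical main for illustration:

#include <cstdio>

#define CUDA_BACKEND_OPENMP 4   // values taken from the hunk above
#define CUDA_BACKEND_HIP    5

// Normally set by the selected cudify header; hard-wired here only for the sketch
#define CUDA_ON_BACKEND CUDA_BACKEND_OPENMP

int main()
{
#if CUDA_ON_BACKEND == CUDA_BACKEND_HIP
    std::printf("HIP backend selected\n");
#elif CUDA_ON_BACKEND == CUDA_BACKEND_OPENMP
    std::printf("OpenMP (CPU emulation) backend selected\n");
#else
    std::printf("some other backend selected\n");
#endif
    return 0;
}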
@@ -3,8 +3,13 @@
#define CUDA_ON_BACKEND CUDA_BACKEND_CUDA
#include <cuda_runtime.h>
#include <boost/preprocessor.hpp>
-constexpr int default_kernel_wg_threads_ = 1024;
+#ifdef DEFAULT_CUDA_THREADS
+constexpr size_t default_kernel_wg_threads_ = static_cast<size_t>(DEFAULT_CUDA_THREADS);
+#else
+constexpr size_t default_kernel_wg_threads_ = static_cast<size_t>(1024);
+#endif
#if CUDART_VERSION >= 11000 && defined(__NVCC__)
#include "cub/util_type.cuh"
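The #ifdef block above makes the default work-group size configurable at build time instead of hard-coding 1024. A small hedged sketch of how that plays out (the value 256 and the main function are illustrative only; only the macro name DEFAULT_CUDA_THREADS comes from the diff): building with -DDEFAULT_CUDA_THREADS=256 makes default_kernel_wg_threads_ evaluate to 256, while a build without the flag falls back to 1024.

#include <cstddef>
#include <cstdio>

// Same selection logic as the hunk above, reproduced here so the sketch is self-contained
#ifdef DEFAULT_CUDA_THREADS
constexpr size_t default_kernel_wg_threads_ = static_cast<size_t>(DEFAULT_CUDA_THREADS);
#else
constexpr size_t default_kernel_wg_threads_ = static_cast<size_t>(1024);
#endif

int main()
{
    // Prints 256 when built with -DDEFAULT_CUDA_THREADS=256, 1024 otherwise
    std::printf("default work-group size: %zu\n", default_kernel_wg_threads_);
    return 0;
}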
@@ -29,6 +34,118 @@ __global__ void kernel_launch_lambda_tls(lambda_f f)
}
/**
* @brief Find an appropriate grid and block size based on register-usage statistics collected during compilation
* @note
* - According to https://developer.nvidia.com/blog/cuda-pro-tip-occupancy-api-simplifies-launch-configuration/:
* This can greatly simplify the task of frameworks (such as Thrust) that must launch user-defined kernels. This is
* also handy for kernels that are not primary performance bottlenecks, where the programmer just wants a simple way
* to run the kernel with correct results, rather than hand-tuning the execution configuration.
*
* - For advanced kernel hand-tuning depending on compute capability, the launchbox feature of moderngpu
* (https://moderngpu.github.io/performance.html) should be considered.
*/
template<typename dim3Type, typename... Args>
void FixConfigLaunch(void (* _kernel)(Args...), dim3Type & wthr, dim3Type & thr) {
// thr.x == 0xFFFFFFFF is the sentinel meaning "auto-tune"; any other value means the
// caller already chose an explicit block size, so leave the configuration untouched
if (thr.x != 0xFFFFFFFF) {
return;
}
int blockSize = 0; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the
// maximum occupancy for a full device launch
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, *_kernel, 0, 0);
int dim = (wthr.x != 0) + (wthr.y != 0) + (wthr.z != 0);
if (dim == 0) {
return;
}
size_t tot_work;
unsigned int wthr_x = wthr.x;
unsigned int wthr_y = wthr.y;
unsigned int wthr_z = wthr.z;
if (dim == 1)
tot_work = wthr.x;
else if (dim == 2)
tot_work = wthr.x * wthr.y;
else if (dim == 3)
tot_work = wthr.x * wthr.y * wthr.z;
// round tot_work up to the nearest bigger power of 2 (e.g. 1000 -> 1024)
size_t tot_work_2 = tot_work;
tot_work_2--;
tot_work_2 |= tot_work_2 >> 1;
tot_work_2 |= tot_work_2 >> 2;
tot_work_2 |= tot_work_2 >> 4;
tot_work_2 |= tot_work_2 >> 8;
tot_work_2 |= tot_work_2 >> 16;
tot_work_2++;
size_t n = (tot_work <= (size_t)blockSize)?tot_work_2:(size_t)blockSize;
if (tot_work == 0)
{
thr.x = 0;
thr.y = 0;
thr.z = 0;
wthr.x = 0;
wthr.y = 0;
wthr.z = 0;
// nothing to launch: return here, otherwise the loop below would never terminate (n == 0)
return;
}
thr.x = 1;
thr.y = 1;
thr.z = 1;
int dir = 0;
while (n != 1)
{
if (dir % 3 == 0)
{thr.x = thr.x << 1;}
else if (dir % 3 == 1)
{thr.y = thr.y << 1;}
else if (dir % 3 == 2)
{thr.z = thr.z << 1;}
n = n >> 1;
dir++;
dir %= dim;
}
if (dim >= 1)
{wthr.x = (wthr.x) / thr.x + (((wthr_x)%thr.x != 0)?1:0);}
if (dim >= 2)
{wthr.y = (wthr.y) / thr.y + (((wthr_y)%thr.y != 0)?1:0);}
else
{wthr.y = 1;}
if (dim >= 3)
{wthr.z = (wthr.z) / thr.z + (((wthr_z)%thr.z != 0)?1:0);}
else
{wthr.z = 1;}
// crop if wthr == 1
if (dim >= 1 && wthr.x == 1)
{thr.x = wthr_x;}
if (dim >= 2 && wthr.y == 1)
{thr.y = wthr_y;}
if (dim == 3 && wthr.z == 1)
{thr.z = wthr_z;}
}
#endif
static void init_wrappers()
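To illustrate how the auto-tuning helper above is meant to be driven, here is a hedged usage sketch (dummy_kernel, launch_with_auto_config and their parameters are hypothetical stand-ins, not code from the repository): the caller puts the total amount of work into wthr, sets thr.x to the 0xFFFFFFFF sentinel, and FixConfigLaunch then splits the work into a grid/block pair sized from cudaOccupancyMaxPotentialBlockSize.

// Hypothetical caller of FixConfigLaunch (sketch only; assumes the function above is in scope
// and the file is compiled with nvcc)
__global__ void dummy_kernel(float * data, unsigned int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {data[i] = 0.0f;}
}

void launch_with_auto_config(float * data, unsigned int n)
{
    dim3 wthr(n, 0, 0);          // total work in x; dim is deduced from the non-zero components
    dim3 thr(0xFFFFFFFF, 0, 0);  // sentinel: let FixConfigLaunch pick the block size

    FixConfigLaunch(dummy_kernel, wthr, thr);

    // After the call wthr holds the grid size and thr the block size
    if (wthr.x != 0)
    {dummy_kernel<<<wthr, thr>>>(data, n);}
}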
@@ -215,39 +332,94 @@ static void init_wrappers()
#else
-#define CUDA_LAUNCH(cuda_call,ite, ...) \
-if (ite.wthr.x != 0)\
-{cuda_call<<<ite.wthr,ite.thr>>>(__VA_ARGS__);}
-
-#define CUDA_LAUNCH_DIM3(cuda_call,wthr,thr, ...) \
-cuda_call<<<wthr,thr>>>(__VA_ARGS__);
-
-#define CUDA_LAUNCH_LAMBDA(ite,lambda_f, ...) \
-kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);
-
-#define CUDA_CHECK()
-
-#define CUDA_LAUNCH_LAMBDA_TLS(ite, lambda_f, ...) \
-{\
-if (ite.wthr.x != 0)\
-{kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);}\
-}
-
-#define CUDA_LAUNCH_LAMBDA_DIM3(wthr_,thr_, lambda_f, ...) \
-{\
-dim3 wthr__(wthr_);\
-dim3 thr__(thr_);\
-if (ite.wthr.x != 0)\
-{kernel_launch_lambda<<<wthr__,thr__>>>(lambda_f);}\
-}
-
-#define CUDA_LAUNCH_LAMBDA_DIM3_TLS(wthr_,thr_, lambda_f, ...) \
-{\
-dim3 wthr__(wthr_);\
-dim3 thr__(thr_);\
-if (ite.wthr.x != 0)\
-{kernel_launch_lambda_tls<<<wthr__,thr__>>>(lambda_f);}\
-}
+template<typename... Args, typename ite_type>
+void CUDA_LAUNCH(void (* _kernel)(Args...),ite_type ite,Args... args)
+{
+    FixConfigLaunch(_kernel,ite.wthr,ite.thr);
+
+    // std::cout << "DEMANGLE " << typeid(decltype(_kernel)).name() << " " << ite.wthr.x << " " << ite.wthr.y << " " << ite.wthr.z << "/" << ite.thr.x << " " << ite.thr.y << " " << ite.thr.z << std::endl;
+
+#ifdef __NVCC__
+    _kernel<<<ite.wthr,ite.thr>>>(args...);
+#else
+    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH not implemented for this compiler" << std::endl;
+#endif
+}
+
+template<typename... Args>
+void CUDA_LAUNCH_DIM3(void (* _kernel)(Args...),dim3 wthr, dim3 thr,Args... args)
+{
+    FixConfigLaunch(_kernel,wthr,thr);
+
+    // std::cout << "DEMANGLE " << typeid(decltype(_kernel)).name() << " " << wthr.x << " " << wthr.y << " " << wthr.z << "/" << thr.x << " " << thr.y << " " << thr.z << std::endl;
+
+#ifdef __NVCC__
+    _kernel<<<wthr,thr>>>(args...);
+#else
+    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_DIM3 not implemented for this compiler" << std::endl;
+#endif
+}
+
+template<typename lambda_type, typename ite_type, typename... Args>
+void CUDA_LAUNCH_LAMBDA(ite_type ite, lambda_type lambda_f, Args... args)
+{
+#ifdef __NVCC__
+    void (* _ker)(lambda_type) = kernel_launch_lambda;
+    FixConfigLaunch(_ker,ite.wthr,ite.thr);
+    kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);
+#else
+    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA not implemented for this compiler" << std::endl;
+#endif
+}
+
+static void CUDA_CHECK() {}
+
+template<typename lambda_type, typename ite_type, typename... Args>
+void CUDA_LAUNCH_LAMBDA_TLS(ite_type ite, lambda_type lambda_f, Args... args)
+{
+#ifdef __NVCC__
+    void (* _ker)(lambda_type) = kernel_launch_lambda;
+    FixConfigLaunch(_ker,ite.wthr,ite.thr);
+    if (ite.wthr.x != 0)
+    {kernel_launch_lambda<<<ite.wthr,ite.thr>>>(lambda_f);}
+#else
+    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA_TLS not implemented for this compiler" << std::endl;
+#endif
+}
+
+template<typename lambda_type, typename... Args>
+void CUDA_LAUNCH_LAMBDA_DIM3(dim3 wthr_, dim3 thr_, lambda_type lambda_f, Args... args)
+{
+#ifdef __NVCC__
+    void (* _ker)(lambda_type) = kernel_launch_lambda;
+    FixConfigLaunch(_ker,wthr_,thr_);
+    dim3 wthr__(wthr_);
+    dim3 thr__(thr_);
+    if (wthr__.x != 0)
+    {kernel_launch_lambda<<<wthr__,thr__>>>(lambda_f);}
+#else
+    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA_DIM3 not implemented for this compiler" << std::endl;
+#endif
+}
+
+template<typename lambda_type, typename... Args>
+void CUDA_LAUNCH_LAMBDA_DIM3_TLS(dim3 wthr_, dim3 thr_, lambda_type lambda_f, Args... args)
+{
+#ifdef __NVCC__
+    void (* _ker)(lambda_type) = kernel_launch_lambda;
+    FixConfigLaunch(_ker,wthr_,thr_);
+    dim3 wthr__(wthr_);
+    dim3 thr__(thr_);
+    if (wthr__.x != 0)
+    {kernel_launch_lambda<<<wthr__,thr__>>>(lambda_f);}
+#else
+    std::cout << __FILE__ << ":" << __LINE__ << " " << "CUDA_LAUNCH_LAMBDA_DIM3_TLS not implemented for this compiler" << std::endl;
+#endif
+}
#endif
......
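To make the macro-to-template switch in the previous hunk concrete, here is a hedged sketch of a call site (ite_example, fill_kernel and fill are illustrative stand-ins, not types from the repository). The spelling at the call site stays the same, but CUDA_LAUNCH now resolves to the function template, so the kernel pointer and its arguments are type-checked and the launch configuration is auto-tuned through FixConfigLaunch.

// Hypothetical call site (sketch only; assumes the header from the previous hunk is included
// and the file is compiled with nvcc)
struct ite_example
{
    dim3 wthr;
    dim3 thr;
};

__global__ void fill_kernel(float * v, float value, unsigned int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {v[i] = value;}
}

void fill(float * v, float value, unsigned int n)
{
    ite_example ite;
    ite.wthr = dim3(n, 0, 0);
    ite.thr  = dim3(0xFFFFFFFF, 0, 0);   // sentinel: let FixConfigLaunch choose the block size

    // Previously this line expanded the CUDA_LAUNCH macro; now it calls the template overload
    CUDA_LAUNCH(fill_kernel, ite, v, value, n);
}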
@@ -818,6 +818,7 @@ static void exe_kernel_no_sync(lambda_f f, ite_type & ite)
#else
#define CUDA_LAUNCH(cuda_call,ite, ...) \
{\
gridDim.x = ite.wthr.x;\
......