Skip to content
Snippets Groups Projects
Commit fdbff4c0 authored by Pietro Incardona's avatar Pietro Incardona
Browse files

Fixing CUDA_LAUNCH for only one argument

parent ba4fe180
No related branches found
No related tags found
No related merge requests found
......@@ -246,7 +246,7 @@ public:
part_ids.resize(pl.size());
CUDA_LAUNCH((subindex<dim,T,cnt_type,ids_type>),ite_gpu.wthr,ite_gpu.thr,div_c,
CUDA_LAUNCH((subindex<dim,T,cnt_type,ids_type>),ite_gpu,div_c,
spacing_c,
off,
this->getTransform(),
......@@ -266,7 +266,7 @@ public:
cells.resize(pl.size());
auto itgg = part_ids.getGPUIterator();
CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg.wthr,itgg.thr,0,
CUDA_LAUNCH((fill_cells<dim,cnt_type,ids_type,shift_ph<0,cnt_type>>),itgg,0,
div_c,
off,
part_ids.size(),
......@@ -287,7 +287,7 @@ public:
CUDA_LAUNCH((reorder_parts<decltype(pl_prp.toKernel()),
decltype(pl.toKernel()),
decltype(sorted_to_not_sorted.toKernel()),
cnt_type,shift_ph<0,cnt_type>>),ite.wthr,ite.thr,pl.size(),
cnt_type,shift_ph<0,cnt_type>>),ite,pl.size(),
pl_prp.toKernel(),
pl_prp_out.toKernel(),
pl.toKernel(),
......@@ -301,7 +301,7 @@ public:
{
ite = sorted_domain_particles_ids.getGPUIterator();
CUDA_LAUNCH((mark_domain_particles),ite.wthr,ite.thr,sorted_to_not_sorted.toKernel(),sorted_domain_particles_ids.toKernel(),sorted_domain_particles_dg.toKernel(),g_m);
CUDA_LAUNCH((mark_domain_particles),ite,sorted_to_not_sorted.toKernel(),sorted_domain_particles_ids.toKernel(),sorted_domain_particles_dg.toKernel(),g_m);
// now we sort the particles
......
......@@ -104,7 +104,7 @@ namespace openfpm
auto ite = v.getGPUIterator();
CUDA_LAUNCH((merge_add_prp_device_impl<decltype(v.toKernel()),decltype(this_.toKernel()),args...>),ite.wthr,ite.thr,v.toKernel(),this_.toKernel(),(unsigned int)old_sz);
CUDA_LAUNCH((merge_add_prp_device_impl<decltype(v.toKernel()),decltype(this_.toKernel()),args...>),ite,v.toKernel(),this_.toKernel(),(unsigned int)old_sz);
#else
std::cout << __FILE__ << ":" << __LINE__ << " Error the function add_prp_device only work when map_vector is compiled with nvcc" << std::endl;
......@@ -136,7 +136,7 @@ namespace openfpm
auto ite = v.getGPUIterator();
CUDA_LAUNCH((merge_add_prp_device_impl<decltype(v.toKernel()),decltype(this_.toKernel()),args...>),ite.wthr,ite.thr,v.toKernel(),this_.toKernel(),(unsigned int)offset);
CUDA_LAUNCH((merge_add_prp_device_impl<decltype(v.toKernel()),decltype(this_.toKernel()),args...>),ite,v.toKernel(),this_.toKernel(),(unsigned int)offset);
#else
std::cout << __FILE__ << ":" << __LINE__ << " Error the function merge_prp_device only work when map_vector is compiled with nvcc" << std::endl;
......
......@@ -15,10 +15,10 @@
#include "cuda_kernel_error_checker.hpp"
#define CUDA_LAUNCH(cuda_call,grid_size,block_size, ...) \
#define CUDA_LAUNCH(cuda_call,ite, ...) \
{\
CHECK_SE_CLASS1_PRE\
cuda_call<<<(grid_size),(block_size)>>>(__VA_ARGS__); \
cuda_call<<<ite.wthr,ite.thr>>>(__VA_ARGS__); \
cudaDeviceSynchronize(); \
{\
cudaError_t e = cudaGetLastError();\
......@@ -30,10 +30,11 @@
CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
}\
}
#else
#define CUDA_LAUNCH(cuda_call,grid_size,block_size, ...) \
cuda_call<<<(grid_size),(block_size)>>>(__VA_ARGS__);
#define CUDA_LAUNCH(cuda_call,ite, ...) \
cuda_call<<<ite.wthr,ite.thr>>>(__VA_ARGS__);
#endif
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment