/*
 * vector_dist_comm_util_funcs.cuh
 *
 *  Created on: Sep 13, 2018
 *      Author: i-bird
 */

#ifndef VECTOR_DIST_COMM_UTIL_FUNCS_HPP_
#define VECTOR_DIST_COMM_UTIL_FUNCS_HPP_

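/*! \brief Label the particles that must be sent as ghost to other processors
 *
 * Generic fallback: this primary template is selected when the data
 * structures are not CUDA enabled, and it only reports an error at run time.
 */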
template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition, typename scan_type, bool is_ok_cuda>
struct labelParticlesGhost_impl
{
	static void run(CudaMemory & mem,
					scan_type & sc,
					Decomposition & dec,
					openfpm::vector<aggregate<unsigned int,unsigned long int>,
							CudaMemory,
							typename memory_traits_inte<aggregate<unsigned int,unsigned long int>>::type,
							memory_traits_inte> & g_opart_device,
				    openfpm::vector<aggregate<unsigned int>,
				                            Memory,
				                            typename layout_base<aggregate<unsigned int>>::type,
				                            layout_base> & proc_id_out,
				    openfpm::vector<aggregate<unsigned int>,
				                             Memory,
				                             typename layout_base<aggregate<unsigned int>>::type,
				                             layout_base> & starts,
		            Vcluster<Memory> & v_cl,
					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
            		openfpm::vector<size_t> & prc,
            		openfpm::vector<size_t> & prc_sz,
            		openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & prc_offset,
            		size_t & g_m,
            		size_t opt)
	{
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are trying to use CUDA functions on non-CUDA-enabled data structures" << std::endl;
	}
};



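/*! \brief Label the particles that must be sent as ghost (CUDA specialization)
 *
 * On the device: count how many ghost entries each particle produces, scan
 * the counts into write offsets, emit the (processor id, particle id)
 * couples, sort them by processor id, and extract for each destination
 * processor its offset and size into prc / prc_sz.
 */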
template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition, typename scan_type>
struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,scan_type,true>
{
	static void run(CudaMemory & mem,
					scan_type & sc,
					Decomposition & dec,
					openfpm::vector<aggregate<unsigned int,unsigned long int>,
							CudaMemory,
							typename memory_traits_inte<aggregate<unsigned int,unsigned long int>>::type,
							memory_traits_inte> & g_opart_device,
				    openfpm::vector<aggregate<unsigned int>,
				                            Memory,
				                            typename layout_base<aggregate<unsigned int>>::type,
				                            layout_base> & proc_id_out,
				    openfpm::vector<aggregate<unsigned int>,
				                             Memory,
				                             typename layout_base<aggregate<unsigned int>>::type,
				                             layout_base> & starts,
					Vcluster<Memory> & v_cl,
					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
            		openfpm::vector<size_t> & prc,
            		openfpm::vector<size_t> & prc_sz,
            		openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & prc_offset,
            		size_t & g_m,
            		size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

			if (v_cl.size() == 1)
			{return;}

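			// One extra element (zeroed below) is appended so that, after the
			// scan, the last element of starts holds the total number of
			// couples to generate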
			proc_id_out.resize(v_pos.size()+1);
			proc_id_out.template get<0>(proc_id_out.size()-1) = 0;
			proc_id_out.template hostToDevice(proc_id_out.size()-1,proc_id_out.size()-1);

			auto ite = v_pos.getGPUIterator();

			// no work to do, return
			if (ite.wthr.x == 0)
			{return;}

			// First we have to see how many entries each particle produces
			num_proc_ghost_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>
			<<<ite.wthr,ite.thr>>>
			(dec.toKernel(),v_pos.toKernel(),proc_id_out.toKernel());

			// scan the per-particle counts to get the write offsets
			sc.scan_(proc_id_out,starts);
			starts.resize(proc_id_out.size());
			starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
			size_t sz = starts.template get<0>(starts.size()-1);
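			// e.g. assuming an exclusive scan: per-particle counts {2,0,1,3}
			// plus the zeroed terminal element scan to {0,2,2,3,6}, so sz = 6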

			// allocate the output buffer of (processor id, particle id) couples

			g_opart_device.resize(sz);

			ite = v_pos.getGPUIterator();

			// now we compute the processor id for each particle
			proc_label_id_ghost<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(starts.toKernel()),decltype(g_opart_device.toKernel())>
			<<<ite.wthr,ite.thr>>>
			(dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel());

			// sort the couples by processor id, so that particles bound to the same processor become contiguous
			mergesort((int *)g_opart_device.template getDeviceBuffer<0>(),(long unsigned int *)g_opart_device.template getDeviceBuffer<1>(), g_opart_device.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());

			mem.allocate(sizeof(int));
			mem.fill(0);
			prc_offset.resize(v_cl.size());

			ite = g_opart_device.getGPUIterator();

			// Find the buffer bases
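			// (an index at which the processor id changes in the sorted
			//  couples is the base of one per-processor send buffer)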
			find_buffer_offsets<0,decltype(g_opart_device.toKernel()),decltype(prc_offset.toKernel())><<<ite.wthr,ite.thr>>>
					           (g_opart_device.toKernel(),(int *)mem.getDevicePointer(),prc_offset.toKernel());

			// Transfer the number of offsets to the CPU
			mem.deviceToHost();
			int noff = *(int *)mem.getPointer();

			// create the terminal (sentinel) element of prc_offset
			prc_offset.resize(noff+1,DATA_ON_DEVICE);

			// Copy the last processor id from the device to the host
			if (g_opart_device.size() != 0)
			{g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
			prc_offset.template get<0>(prc_offset.size()-1) = g_opart_device.size();
			if (g_opart_device.size() != 0)
			{prc_offset.template get<1>(prc_offset.size()-1) = g_opart_device.template get<0>(g_opart_device.size()-1);}
			else
			{prc_offset.template get<1>(prc_offset.size()-1) = 0;}

			prc_offset.template hostToDevice<0,1>(prc_offset.size()-1,prc_offset.size()-1);

			// Here we reorder the offsets in ascending order
			mergesort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());

			prc_offset.template deviceToHost<0,1>();

			// In this case we do not have communications at all
			if (g_opart_device.size() == 0)
			{noff = -1;}

			prc.resize(noff+1);
			prc_sz.resize(noff+1);

			size_t base_offset = 0;

			// Transfer into prc the list of processors and into prc_sz the
			// number of particles to send to each of them
			for (size_t i = 0 ; i < noff+1 ; i++)
			{
				prc.get(i) = prc_offset.template get<1>(i);
				prc_sz.get(i) = prc_offset.template get<0>(i) - base_offset;
				base_offset = prc_offset.template get<0>(i);
			}
#else

			std::cout << __FILE__ << ":" << __LINE__ << " error: to use GPU computation you must compile vector_dist.hpp with NVCC" << std::endl;

#endif
	}
};
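
// A minimal dispatch sketch (illustrative only: the boolean used by
// vector_dist to select the specialization is an assumption here):
//
//   labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,
//                            scan_type,std::is_same<Memory,CudaMemory>::value>
//   ::run(mem,sc,dec,g_opart_device,proc_id_out,starts,v_cl,
//         v_pos,v_prp,prc,prc_sz,prc_offset,g_m,opt);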

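/*! \brief Append the local ghost particles listed in o_part_loc (fallback)
 *
 * Primary template: selected when the data structures are not CUDA enabled,
 * and it only reports an error at run time.
 */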
template<bool with_pos,unsigned int dim, typename St,  typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
struct local_ghost_from_opart_impl
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts,
					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
            		size_t opt)
	{
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are trying to use CUDA functions on non-CUDA-enabled data structures" << std::endl;
	}
};

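/*! \brief Append the local ghost particles listed in o_part_loc (CUDA specialization)
 *
 * For every couple in o_part_loc one shifted replica of the corresponding
 * particle is appended, on the device, at the end of v_pos / v_prp.
 */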
template<bool with_pos, unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
struct local_ghost_from_opart_impl<with_pos,dim,St,prop,Memory,layout_base,true>
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts,
					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
            		size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

				auto ite = o_part_loc.getGPUIterator();

				size_t old = v_pos.size();

				v_pos.resize(v_pos.size() + o_part_loc.size(),DATA_ON_DEVICE);
				v_prp.resize(v_prp.size() + o_part_loc.size(),DATA_ON_DEVICE);

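				// one thread per couple: append a shifted copy of the particle
				// (position if with_pos, plus properties) past the old size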
				process_ghost_particles_local<with_pos,dim,decltype(o_part_loc.toKernel()),decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(shifts.toKernel())>
				<<<ite.wthr,ite.thr>>>
				(o_part_loc.toKernel(),v_pos.toKernel(),v_prp.toKernel(),shifts.toKernel(),old);

#else
				std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
	}
};

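/*! \brief Create the local ghost particles directly from the decomposition (fallback)
 *
 * Primary template: selected when the data structures are not CUDA enabled,
 * and it only reports an error at run time.
 */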
template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
struct local_ghost_from_dec_impl
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts,
					openfpm::vector<Box<dim, St>,Memory,typename layout_base<Box<dim,St>>::type,layout_base> & box_f_dev,
					openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & box_f_sv,
					Vcluster<Memory> & v_cl,
					openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & starts,
					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
            		size_t & g_m,
            		size_t opt)
	{
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are trying to use CUDA functions on non-CUDA-enabled data structures" << std::endl;
	}
};


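/*! \brief Create the local ghost particles directly from the decomposition (CUDA specialization)
 *
 * Count how many ghost shifts each assigned particle requires
 * (num_shift_ghost_each_part), scan the counts into write offsets, then
 * generate the shifted replicas (shift_ghost_each_part).
 */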
template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts,
					openfpm::vector<Box<dim, St>,Memory,typename layout_base<Box<dim,St>>::type,layout_base> & box_f_dev,
					openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & box_f_sv,
					Vcluster<Memory> & v_cl,
					openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & starts,
					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
            		size_t & g_m,
            		size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

		o_part_loc.resize(g_m+1);
		o_part_loc.template get<0>(o_part_loc.size()-1) = 0;
		o_part_loc.template hostToDevice(o_part_loc.size()-1,o_part_loc.size()-1);

		// Label the internal (assigned) particles
		auto ite = v_pos.getGPUIteratorTo(g_m);

		// label particle processor
		num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>
		<<<ite.wthr,ite.thr>>>
		(box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m);

		starts.resize(o_part_loc.size());
		mgpu::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
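		// After the scan, element i of starts is the write offset of the
		// replicas generated by particle i; the last element is the total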

		starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
		size_t total = starts.template get<0>(starts.size()-1);
		size_t old = v_pos.size();

		v_pos.resize(v_pos.size() + total);
		v_prp.resize(v_prp.size() + total);

		// Recreate the iterator over the internal (assigned) particles
		ite = v_pos.getGPUIteratorTo(g_m);

		// resize o_part_loc to one entry per generated ghost particle
		o_part_loc.resize(total);

		shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
									 decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),
									 decltype(starts.toKernel()),decltype(shifts.toKernel()),
									 decltype(o_part_loc.toKernel())>
		<<<ite.wthr,ite.thr>>>
		(box_f_dev.toKernel(),box_f_sv.toKernel(),
		 v_pos.toKernel(),v_prp.toKernel(),
		 starts.toKernel(),shifts.toKernel(),o_part_loc.toKernel(),old,g_m);

#else
		std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
	}
};

#endif /* VECTOR_DIST_COMM_UTIL_FUNCS_HPP_ */