/*
 * vector_dist_comm_util_funcs.cuh
 *
 *  Created on: Sep 13, 2018
 *      Author: i-bird
 */

#ifndef VECTOR_DIST_COMM_UTIL_FUNCS_HPP_
#define VECTOR_DIST_COMM_UTIL_FUNCS_HPP_

#define SKIP_LABELLING 512
#define KEEP_PROPERTIES 512
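
// Note: these constants are bit flags combined into the `opt` argument of the
// functions below and tested with a bitwise AND, e.g.
// `if (!(opt & SKIP_LABELLING)) { ... }`.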

template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition, typename scan_type, bool is_ok_cuda>
struct labelParticlesGhost_impl
{
	static void run(CudaMemory & mem,
					scan_type & sc,
					Decomposition & dec,
					openfpm::vector<aggregate<unsigned int,unsigned long int>,
							CudaMemory,
							typename memory_traits_inte<aggregate<unsigned int,unsigned long int>>::type,
							memory_traits_inte> & g_opart_device,
				    openfpm::vector<aggregate<unsigned int>,
				                            Memory,
				                            typename layout_base<aggregate<unsigned int>>::type,
				                            layout_base> & proc_id_out,
				    openfpm::vector<aggregate<unsigned int>,
				                             Memory,
				                             typename layout_base<aggregate<unsigned int>>::type,
				                             layout_base> & starts,
		            Vcluster<Memory> & v_cl,
					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
            		openfpm::vector<size_t> & prc,
            		openfpm::vector<size_t> & prc_sz,
            		openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & prc_offset,
            		size_t & g_m,
            		size_t opt)
	{
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are trying to use CUDA functions on non-CUDA-enabled data structures" << std::endl;
	}
};
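
// The trailing boolean template parameter (is_ok_cuda) selects the implementation
// at compile time: the primary template above is the fallback for data structures
// that are not CUDA-enabled (it only reports an error), while the partial
// specialization below (is_ok_cuda == true) runs the GPU labelling path.
//
// Usage sketch (illustrative only; the exact boolean expression passed by the
// caller in vector_dist_comm is not shown in this header):
//
//   labelParticlesGhost_impl<dim, St, prop, Memory, layout_base, Decomposition,
//                            scan_type, /* is_ok_cuda = */ true>
//       ::run(mem, sc, dec, g_opart_device, proc_id_out, starts,
//             v_cl, v_pos, v_prp, prc, prc_sz, prc_offset, g_m, opt);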



template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition, typename scan_type>
struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,scan_type,true>
{
	static void run(CudaMemory & mem,
					scan_type & sc,
					Decomposition & dec,
					openfpm::vector<aggregate<unsigned int,unsigned long int>,
							CudaMemory,
							typename memory_traits_inte<aggregate<unsigned int,unsigned long int>>::type,
							memory_traits_inte> & g_opart_device,
				    openfpm::vector<aggregate<unsigned int>,
				                            Memory,
				                            typename layout_base<aggregate<unsigned int>>::type,
				                            layout_base> & proc_id_out,
				    openfpm::vector<aggregate<unsigned int>,
				                             Memory,
				                             typename layout_base<aggregate<unsigned int>>::type,
				                             layout_base> & starts,
					Vcluster<Memory> & v_cl,
					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
            		openfpm::vector<size_t> & prc,
            		openfpm::vector<size_t> & prc_sz,
            		openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & prc_offset,
            		size_t & g_m,
            		size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

			if (v_cl.size() == 1)
			{return;}

			proc_id_out.resize(v_pos.size()+1);
			proc_id_out.template get<0>(proc_id_out.size()-1) = 0;
			proc_id_out.template hostToDevice(proc_id_out.size()-1,proc_id_out.size()-1);
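			// The extra element (zeroed and copied to the device) acts as a sentinel:
			// after the scan below, the last element of `starts` holds the total
			// number of (particle, destination processor) entries to be produced.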

			auto ite = v_pos.getGPUIterator();

			// no work to do, return
			if (ite.wthr.x == 0)
			{return;}

			// First we count how many entries each particle produces
			num_proc_ghost_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>
			<<<ite.wthr,ite.thr>>>
			(dec.toKernel(),v_pos.toKernel(),proc_id_out.toKernel());

			// scan the per-particle counts to obtain the write offsets (starts)
			sc.scan_(proc_id_out,starts);
			starts.resize(proc_id_out.size());
			starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
			size_t sz = starts.template get<0>(starts.size()-1);

			// resize the output: one entry per (particle, destination processor) pair
			g_opart_device.resize(sz);

			ite = v_pos.getGPUIterator();

			// we compute processor id for each particle
			proc_label_id_ghost<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(starts.toKernel()),decltype(g_opart_device.toKernel())>
			<<<ite.wthr,ite.thr>>>
			(dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel());

			// sort particles
			mergesort((int *)g_opart_device.template getDeviceBuffer<0>(),(long unsigned int *)g_opart_device.template getDeviceBuffer<1>(), g_opart_device.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());
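			// After this key/value sort the entries of g_opart_device are grouped by
			// destination processor: property <0> holds the processor id (the sort
			// key) and property <1> the associated particle entry, so each destination
			// processor owns one contiguous segment of the vector.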

			mem.allocate(sizeof(int));
			mem.fill(0);
			prc_offset.resize(v_cl.size());

			ite = g_opart_device.getGPUIterator();

			// Find the buffer bases
			find_buffer_offsets<0,decltype(g_opart_device.toKernel()),decltype(prc_offset.toKernel())><<<ite.wthr,ite.thr>>>
					           (g_opart_device.toKernel(),(int *)mem.getDevicePointer(),prc_offset.toKernel());

			// Transfer the number of offsets to the CPU
			mem.deviceToHost();
			int noff = *(int *)mem.getPointer();
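			// noff counts the positions where the sorted processor id changes; together
			// with the terminal entry added below, prc_offset ends up holding one
			// (cumulative offset, processor id) pair per destination processor.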

			// add a terminal (sentinel) entry to prc_offset
			prc_offset.resize(noff+1,DATA_ON_DEVICE);

			// Copy the last processor id from the device to the host
			if (g_opart_device.size() != 0)
			{g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
			prc_offset.template get<0>(prc_offset.size()-1) = g_opart_device.size();
			if (g_opart_device.size() != 0)
			{prc_offset.template get<1>(prc_offset.size()-1) = g_opart_device.template get<0>(g_opart_device.size()-1);}
			else
			{prc_offset.template get<1>(prc_offset.size()-1) = 0;}

			prc_offset.template hostToDevice<0,1>(prc_offset.size()-1,prc_offset.size()-1);

			// Here we reorder the offsets in ascending order
			mergesort((int *)prc_offset.template getDeviceBuffer<0>(),(int *)prc_offset.template getDeviceBuffer<1>(), prc_offset.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());

			prc_offset.template deviceToHost<0,1>();

			// In this case we do not have communications at all
			if (g_opart_device.size() == 0)
			{noff = -1;}

			prc.resize(noff+1);
			prc_sz.resize(noff+1);

			size_t base_offset = 0;

			// Transfer to prc the list of destination processors and to prc_sz the
			// number of ghost entries for each of them
			for (size_t i = 0 ; i < noff+1 ; i++)
			{
				prc.get(i) = prc_offset.template get<1>(i);
				prc_sz.get(i) = prc_offset.template get<0>(i) - base_offset;
				base_offset = prc_offset.template get<0>(i);
			}
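
			// Illustrative example (made-up numbers): if after sorting prc_offset holds
			// the (offset, processor) pairs (3,1), (7,2), (10,5), the loop above yields
			// prc = {1,2,5} and prc_sz = {3,4,3}: processor 1 receives the first 3 ghost
			// entries, processor 2 the next 4 and processor 5 the last 3.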
#else

			std::cout << __FILE__ << ":" << __LINE__ << " error: to use GPU computation you must compile vector_dist.hpp with NVCC" << std::endl;

#endif
	}
};

template<bool with_pos,unsigned int dim, typename St,  typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
struct local_ghost_from_opart_impl
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts,
					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
            		size_t opt)
	{
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are trying to use CUDA functions on non-CUDA-enabled data structures" << std::endl;
	}
};

template<bool with_pos, unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
struct local_ghost_from_opart_impl<with_pos,dim,St,prop,Memory,layout_base,true>
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts,
					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
            		size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

				auto ite = o_part_loc.getGPUIterator();

				size_t old = v_pos.size();

				v_pos.resize(v_pos.size() + o_part_loc.size(),DATA_ON_DEVICE);

				if (!(opt & SKIP_LABELLING))
				{
					v_prp.resize(v_prp.size() + o_part_loc.size(),DATA_ON_DEVICE);
				}

				process_ghost_particles_local<with_pos,dim,decltype(o_part_loc.toKernel()),decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(shifts.toKernel())>
				<<<ite.wthr,ite.thr>>>
				(o_part_loc.toKernel(),v_pos.toKernel(),v_prp.toKernel(),shifts.toKernel(),old);
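				// process_ghost_particles_local appends one ghost replica per entry of
				// o_part_loc, starting at index `old`; when with_pos is set the copied
				// position is translated by the periodic shift vector referenced in
				// `shifts`.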

#else
				std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
	}
};

template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base, bool is_ok_cuda>
struct local_ghost_from_dec_impl
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts,
					openfpm::vector<Box<dim, St>,Memory,typename layout_base<Box<dim,St>>::type,layout_base> & box_f_dev,
					openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & box_f_sv,
					Vcluster<Memory> & v_cl,
					openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & starts,
					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
            		size_t & g_m,
            		size_t opt)
	{
		std::cout << __FILE__ << ":" << __LINE__ << " error: you are trying to use CUDA functions on non-CUDA-enabled data structures" << std::endl;
	}
};


template<unsigned int dim, typename St, typename prop, typename Memory, template <typename> class layout_base>
struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
{
	static void run(openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> & o_part_loc,
					const openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & shifts,
					openfpm::vector<Box<dim, St>,Memory,typename layout_base<Box<dim,St>>::type,layout_base> & box_f_dev,
					openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & box_f_sv,
					Vcluster<Memory> & v_cl,
					openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & starts,
					openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
            		openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
            		size_t & g_m,
            		size_t opt)
	{
#if defined(CUDA_GPU) && defined(__NVCC__)

		o_part_loc.resize(g_m+1);
		o_part_loc.template get<0>(o_part_loc.size()-1) = 0;
		o_part_loc.template hostToDevice(o_part_loc.size()-1,o_part_loc.size()-1);

		// Label the internal (assigned) particles
		auto ite = v_pos.getGPUIteratorTo(g_m);

		// count how many ghost (shift) replicas each assigned particle produces
		num_shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),decltype(v_pos.toKernel()),decltype(o_part_loc.toKernel())>
		<<<ite.wthr,ite.thr>>>
		(box_f_dev.toKernel(),box_f_sv.toKernel(),v_pos.toKernel(),o_part_loc.toKernel(),g_m);

		starts.resize(o_part_loc.size());
		mgpu::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());

		starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
		size_t total = starts.template get<0>(starts.size()-1);
		size_t old = v_pos.size();

		v_pos.resize(v_pos.size() + total);
		v_prp.resize(v_prp.size() + total);
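		// `total` (the last element of `starts` after the scan) is the number of local
		// ghost replicas to generate; v_pos and v_prp grow by that amount and the
		// replicas are written starting at index `old` by the kernel below.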

		// iterate again over the assigned (non-ghost) particles
		ite = v_pos.getGPUIteratorTo(g_m);

		// resize o_part_loc
		o_part_loc.resize(total);

		shift_ghost_each_part<dim,St,decltype(box_f_dev.toKernel()),decltype(box_f_sv.toKernel()),
									 decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),
									 decltype(starts.toKernel()),decltype(shifts.toKernel()),
									 decltype(o_part_loc.toKernel())>
		<<<ite.wthr,ite.thr>>>
		(box_f_dev.toKernel(),box_f_sv.toKernel(),
		 v_pos.toKernel(),v_prp.toKernel(),
		 starts.toKernel(),shifts.toKernel(),o_part_loc.toKernel(),old,g_m);
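
		// For each assigned particle i the kernel writes its ghost replicas starting
		// at index old + starts[i], applying the matching shift vectors, and records
		// in o_part_loc which source particle/shift generated each replica.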

#else
		std::cout << __FILE__ << ":" << __LINE__ << " error: to use the option RUN_ON_DEVICE you must compile with NVCC" << std::endl;
#endif
	}
};

#endif /* VECTOR_DIST_COMM_UTIL_FUNCS_HPP_ */