SparseGridGpu_ker_util.hpp 34.1 KB
Newer Older
incardon's avatar
incardon committed
1 2 3 4 5 6 7 8 9 10
/*
 * SparseGridGpu_ker_util.hpp
 *
 *  Created on: Aug 7, 2019
 *      Author: i-bird
 */

#ifndef SPARSEGRIDGPU_KER_UTIL_HPP_
#define SPARSEGRIDGPU_KER_UTIL_HPP_

incardon's avatar
Fixing  
incardon committed
11 12
#include "util/variadic_to_vmpl.hpp"

incardon's avatar
incardon committed
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
template<bool to_set>
struct set_compile_condition
{
	template<unsigned int p, typename SrcType, typename AggrType>
	__device__ __host__ static inline void set(SrcType & src,AggrType & aggr)
	{
		src = aggr.template get<p>();
	}
};

template<>
struct set_compile_condition<false>
{
	template<unsigned int p, typename SrcType, typename AggrType>
	__device__ __host__ static inline void set(SrcType & src,AggrType & aggr)
	{}
};

incardon's avatar
incardon committed
31 32 33 34 35 36 37
template<unsigned int dim,typename T>
struct cross_stencil
{
	T xm[dim];
	T xp[dim];
};

incardon's avatar
incardon committed
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
/*template<unsigned int dim, unsigned int block_edge_size>
struct shift_position
{
    __device__ static inline int shift(int pos, int stencilRadius)
    {
        int accu = 1;
        int pos_s = 0;
        for (int i = 0 ; i < dim ; i++)
        {
            pos_s += (pos % block_edge_size + stencilRadius)*accu;
            accu *= (block_edge_size + 2*stencilRadius);
            pos /= block_edge_size;
        }

        return pos_s;
    }
};

template<unsigned int block_edge_size>
struct shift_position<2,block_edge_size>
{
    __device__ static inline int shift(int pos, int stencilRadius)
    {
        unsigned int x = pos % block_edge_size;
        unsigned int y = (pos / block_edge_size);

        unsigned int g_sz = block_edge_size + 2*stencilRadius;

        return (x+stencilRadius) + (y+stencilRadius)*g_sz;
    }
};


template<unsigned int block_edge_size>
struct shift_position<3,block_edge_size>
{
    __device__ static inline int shift(int pos, int stencilRadius)
    {
        unsigned int x = pos % block_edge_size;
        unsigned int y = (pos / block_edge_size) % block_edge_size;
        unsigned int z = (pos / (block_edge_size*block_edge_size));

        unsigned int g_sz = block_edge_size + 2*stencilRadius;

        return (x+stencilRadius) + (y+stencilRadius)*g_sz + (z+stencilRadius)*g_sz*g_sz;
    }
};*/

incardon's avatar
incardon committed
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
template<unsigned int dim>
struct NNStar
{
	static const int nNN = IntPow<2, dim>::value;

	template<typename indexT, typename blockCoord_type, typename blockMap_type, typename SparseGrid_type>
	__device__ static inline indexT getNNpos(blockCoord_type & blockCoord,
								  blockMap_type & blockMap,
								  SparseGrid_type & sparseGrid,
								  const unsigned int offset)
	{
        //todo: also do the full neighbourhood version, this is just cross
        int neighbourPos = -1;
        if (offset < 2*dim)
        {
            unsigned int d = offset/2;
            int dPos = blockCoord.get(d) + (offset%2)*2 - 1;
            blockCoord.set_d(d, dPos);
incardon's avatar
incardon committed
104 105 106 107 108 109

            int bl = sparseGrid.getBlockLinId(blockCoord);

            bl = (dPos < 0)?-1:bl;

            neighbourPos = blockMap.get_sparse(bl).id;
incardon's avatar
incardon committed
110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
        }
        return neighbourPos;
	}

	template<typename indexT, unsigned int blockEdgeSize, typename coordType>
	__host__ static inline indexT getNNskin(coordType & coord, int stencilSupportRadius)
	{
        int neighbourNum = -1;
        int ctr = 0;
        for (int j = 0; j < dim; ++j)
        {
            int c = static_cast<int>(coord.get(j)) - static_cast<int>(stencilSupportRadius);
            if (c < 0)
            {
                neighbourNum = 2*j;
                ++ctr;
            }
            else if (c >= blockEdgeSize)
            {
                neighbourNum = 2*j + 1;
                ++ctr;
            }
        }
        if (ctr > 1) // If we are on a "corner"
        {
            neighbourNum = 0;
        }

        return neighbourNum;
	}

	template<typename sparseGrid_type, typename coord_type, typename Mask_type,unsigned int eb_size>
	__device__ static inline bool isPadding(sparseGrid_type & sparseGrid, coord_type & coord, Mask_type (& enlargedBlock)[eb_size])
	{
		bool isPadding_ = false;
		for (int d=0; d<dim; ++d)
		{
			auto nPlusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, 1);
			auto nMinusId = sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, d, -1);
			typename std::remove_all_extents<Mask_type>::type neighbourPlus = enlargedBlock[nPlusId];
			typename std::remove_all_extents<Mask_type>::type neighbourMinus = enlargedBlock[nMinusId];
			isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourPlus));
			isPadding_ = isPadding_ || (!sparseGrid.exist(neighbourMinus));
			if (isPadding_) break;
		}

		return isPadding_;
	}

	/*! \brief given a coordinate give the neighborhood chunk position and the offset in the neighborhood chunk
	 *
	 *
	 */
	__device__ static inline bool getNNindex_offset()
	{
		return false;
	}
};

incardon's avatar
incardon committed
169 170 171 172 173 174
template<unsigned int n_it>
struct arr_ptr
{
	void * ptr[n_it];
};

incardon's avatar
incardon committed
175 176 177
template<unsigned int n_it,unsigned int n_prp>
struct arr_arr_ptr
{
178
	void * ptr[n_it][n_prp+1];
incardon's avatar
incardon committed
179 180
};

incardon's avatar
incardon committed
181
template<typename copy_type, unsigned int nprp, unsigned int prp_val, unsigned int prp_id>
incardon's avatar
incardon committed
182 183 184 185 186
struct meta_copy_block
{
	template<typename dataBuffer_type>
	__device__ __host__ static void copy(void * (& data_ptr)[nprp], dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
	{
incardon's avatar
incardon committed
187
		((copy_type *)data_ptr[prp_id])[ppos] = dataBuff.template get<prp_val>(dataBlockPos)[offset];
incardon's avatar
incardon committed
188 189 190 191 192
	}

	template<typename dataBuffer_type>
	__device__ __host__ static void copy_inv(arr_arr_ptr<1,nprp> & data_ptr, dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
	{
incardon's avatar
incardon committed
193
		dataBuff.template get<prp_val>(dataBlockPos)[offset] = ((copy_type *)data_ptr.ptr[0][prp_id])[ppos];
incardon's avatar
incardon committed
194 195 196
	}
};

incardon's avatar
incardon committed
197 198
template<typename copy_type, unsigned int nprp, unsigned int prp_val, unsigned int prp_id, unsigned int N1>
struct meta_copy_block<copy_type[N1],nprp,prp_val,prp_id>
incardon's avatar
incardon committed
199 200 201 202 203 204
{
	template<typename dataBuffer_type>
	__device__ __host__ static void copy(void * (& data_ptr)[nprp], dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
	{
		for (int i = 0 ; i < N1 ; i++)
		{
incardon's avatar
incardon committed
205
			((copy_type *)data_ptr[prp_id])[ppos+i*n_pnt] = dataBuff.template get<prp_val>(dataBlockPos)[i][offset];
incardon's avatar
incardon committed
206 207 208 209 210 211 212 213
		}
	}

	template<typename dataBuffer_type>
	__device__ __host__ static void copy_inv(arr_arr_ptr<1,nprp> & data_ptr, dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
	{
		for (int i = 0 ; i < N1 ; i++)
		{
incardon's avatar
incardon committed
214
			dataBuff.template get<prp_val>(dataBlockPos)[i][offset] = ((copy_type *)data_ptr.ptr[0][prp_id])[ppos+i*n_pnt];
incardon's avatar
incardon committed
215 216 217 218
		}
	}
};

incardon's avatar
incardon committed
219 220
template<typename copy_type, unsigned int nprp, unsigned int prp_val, unsigned int prp_id, unsigned int N1, unsigned int N2>
struct meta_copy_block<copy_type[N1][N2],nprp,prp_val,prp_id>
incardon's avatar
incardon committed
221 222 223 224 225 226 227 228
{
	template<typename dataBuffer_type>
	__device__ __host__ static void copy(void * (& data_ptr)[nprp], dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
	{
		for (int i = 0 ; i < N1 ; i++)
		{
			for (int j = 0 ; j < N2 ; j++)
			{
incardon's avatar
incardon committed
229
				((copy_type *)data_ptr[prp_id])[ppos + (i*N2 + j)*n_pnt] = dataBuff.template get<prp_val>(dataBlockPos)[i][j][offset];
incardon's avatar
incardon committed
230 231 232 233 234 235 236 237 238 239 240
			}
		}
	}

	template<typename dataBuffer_type>
	__device__ __host__ static void copy_inv(arr_arr_ptr<1,nprp> & data_ptr, dataBuffer_type & dataBuff, unsigned int ppos, unsigned int dataBlockPos, unsigned int offset, unsigned int n_pnt)
	{
		for (int i = 0 ; i < N1 ; i++)
		{
			for (int j = 0 ; j < N2 ; j++)
			{
incardon's avatar
incardon committed
241
				dataBuff.template get<prp_val>(dataBlockPos)[i][j][offset] = ((copy_type *)data_ptr.ptr[0][prp_id])[ppos + (i*N2 + j)*n_pnt];
incardon's avatar
incardon committed
242 243 244 245 246
			}
		}
	}
};

incardon's avatar
incardon committed
247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273
/*! \brief this class is a functor for "for_each" algorithm
 *
 * This class is a functor for "for_each" algorithm. For each
 * element of the boost::vector the operator() is called.
 * Is mainly used to calculate the size to pack a point
 *
 * \tparam prp set for properties
 *
 */
template<typename AggregateT, typename dataBuffer_type, int ... prp>
struct sparsegridgpu_pack_impl
{
	typedef typename to_boost_vmpl<prp...>::type vprp;

	//! position of the block
	unsigned int dataBlockPos;

	//! offset
	unsigned int offset;

	//! data buffer
	dataBuffer_type & dataBuff;

	//! point
	unsigned int ppos;

	//! data pointer
274
	void * (& data_ptr)[sizeof...(prp)+1];
incardon's avatar
incardon committed
275

incardon's avatar
incardon committed
276 277 278
	//! Number of points to pack
	unsigned int n_pnt;

incardon's avatar
incardon committed
279 280 281 282 283 284 285
	/*! \brief constructor
	 *
	 */
	__device__ __host__ inline sparsegridgpu_pack_impl(unsigned int dataBlockPos,
								   unsigned int offset,
								   dataBuffer_type & dataBuff,
								   unsigned int ppos,
286
								   void * (& data_ptr)[sizeof...(prp)+1],
incardon's avatar
incardon committed
287
								   unsigned int n_pnt)
incardon's avatar
incardon committed
288
	:dataBlockPos(dataBlockPos),offset(offset),dataBuff(dataBuff),ppos(ppos),data_ptr(data_ptr),n_pnt(n_pnt)
incardon's avatar
incardon committed
289 290 291 292 293 294 295 296 297 298 299
	{};

	//! It call the copy function for each property
	template<typename T>
	__device__ __host__ inline void operator()(T& t)
	{
		typedef typename boost::mpl::at<vprp,T>::type prp_cp;

		// Remove the reference from the type to copy
		typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;

300
		meta_copy_block<pack_type,sizeof...(prp)+1,prp_cp::value,T::value>::copy(data_ptr,dataBuff,ppos,dataBlockPos,offset,n_pnt);
incardon's avatar
incardon committed
301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333
	}
};


/*! \brief this class is a functor for "for_each" algorithm
 *
 * This class is a functor for "for_each" algorithm. For each
 * element of the boost::vector the operator() is called.
 * Is mainly used to calculate the size to pack a point
 *
 * \tparam prp set for properties
 *
 */
template<typename AggregateT, typename dataBuffer_type, int ... prp>
struct sparsegridgpu_unpack_impl
{
	typedef typename to_boost_vmpl<prp...>::type vprp;

	//! position of the block
	unsigned int dataBlockPos;

	//! offset
	unsigned int offset;

	//! data buffer
	dataBuffer_type & dataBuff;

	//! point
	unsigned int ppos;

	//! data pointer
	arr_arr_ptr<1,sizeof...(prp)> & data_ptr;

incardon's avatar
incardon committed
334 335 336
	//! Number of points to pack
	unsigned int n_pnt;

incardon's avatar
incardon committed
337 338 339 340 341 342 343
	/*! \brief constructor
	 *
	 */
	__device__ __host__ inline sparsegridgpu_unpack_impl(unsigned int dataBlockPos,
								   unsigned int offset,
								   dataBuffer_type & dataBuff,
								   unsigned int ppos,
incardon's avatar
incardon committed
344 345
								   arr_arr_ptr<1,sizeof...(prp)> & data_ptr,
								   unsigned int n_pnt)
incardon's avatar
incardon committed
346
	:dataBlockPos(dataBlockPos),offset(offset),dataBuff(dataBuff),ppos(ppos),data_ptr(data_ptr),n_pnt(n_pnt)
incardon's avatar
incardon committed
347 348 349 350 351 352 353 354 355 356 357
	{};

	//! It call the copy function for each property
	template<typename T>
	__device__ __host__ inline void operator()(T& t)
	{
		typedef typename boost::mpl::at<vprp,T>::type prp_cp;

		// Remove the reference from the type to copy
		typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;

incardon's avatar
incardon committed
358
		meta_copy_block<pack_type,sizeof...(prp),prp_cp::value,T::value>::copy_inv(data_ptr,dataBuff,ppos,dataBlockPos,offset,n_pnt);
incardon's avatar
incardon committed
359 360 361
	}
};

incardon's avatar
Fixing  
incardon committed
362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398
/*! \brief this class is a functor for "for_each" algorithm
 *
 * This class is a functor for "for_each" algorithm. For each
 * element of the boost::vector the operator() is called.
 * Is mainly used to calculate the size to pack a point
 *
 * \tparam prp set for properties
 *
 */
template<typename AggregateT, int ... prp>
struct sparsegridgpu_pack_request
{
	typedef typename to_boost_vmpl<prp...>::type vprp;

	//! point size
	size_t point_size;

	/*! \brief constructor
	 *
	 */
	inline sparsegridgpu_pack_request()
	:point_size(0)
	{};

	//! It call the copy function for each property
	template<typename T>
	inline void operator()(T& t)
	{
		typedef typename boost::mpl::at<vprp,T>::type prp_cp;

		// Remove the reference from the type to copy
		typedef typename boost::mpl::at<typename AggregateT::type,prp_cp>::type pack_type;

		point_size += sizeof(pack_type);
	}
};

incardon's avatar
incardon committed
399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449
template<unsigned int edgeSize, unsigned int dim>
inline __device__ unsigned int coordToLin(const unsigned int (&coord)[dim], const unsigned int paddingSize = 0)
{
	unsigned int linId = coord[dim - 1];
	for (int d = dim - 2; d >= 0; --d)
	{
		linId *= edgeSize + 2 * paddingSize;
		linId += coord[d];
	}
	return linId;
}


template<unsigned int edgeSize, typename CoordT, unsigned int dim>
inline __device__ unsigned int coordToLin(const grid_key_dx<dim, CoordT> &coord, const unsigned int paddingSize = 0)
{
	unsigned int linId = coord.get(dim - 1);
	for (int d = dim - 2; d >= 0; --d)
	{
		linId *= edgeSize + 2 * paddingSize;
		linId += coord.get(d);
	}
	return linId;
}

template <typename CoordT,unsigned int dim>
inline __device__ unsigned int coordToLin(const grid_key_dx<dim, CoordT> & coord, grid_key_dx<dim, int> & blockDimensions)
{
	unsigned int linId = coord.get(dim - 1);
	for (int d = dim - 2; d >= 0; --d)
	{
		linId *= blockDimensions.get(d);
		linId += coord.get(d);
	}
	return linId;
}



template<unsigned int edgeSize, unsigned int dim>
inline __device__ void linToCoordWithOffset(const unsigned int linId, const unsigned int offset, unsigned int (&coord)[dim])
{
	unsigned int linIdTmp = linId;
	for (unsigned int d = 0; d < dim; ++d)
	{
		coord[d] = linIdTmp % edgeSize;
		coord[d] += offset;
		linIdTmp /= edgeSize;
	}
}

incardon's avatar
incardon committed
450 451 452 453 454 455 456 457 458 459 460
template<unsigned int edgeSize, unsigned int dim>
inline __device__ void linToCoord(const unsigned int linId, unsigned int (&coord)[dim])
{
	unsigned int linIdTmp = linId;
	for (unsigned int d = 0; d < dim; ++d)
	{
		coord[d] = linIdTmp % edgeSize;
		linIdTmp /= edgeSize;
	}
}

incardon's avatar
incardon committed
461 462 463
constexpr int gt = 0;
constexpr int nt = 1;

incardon's avatar
incardon committed
464
template<unsigned int nLoop, unsigned int dim, typename AggregateBlockT, unsigned int pMask , unsigned int p, typename ct_params, unsigned int blockEdgeSize>
incardon's avatar
incardon committed
465 466
struct loadGhostBlock_impl
{
incardon's avatar
incardon committed
467 468 469 470 471 472
	template<typename AggrWrapperT,
	         typename SharedPtrT,
	         typename vector_type,
	         typename vector_type2,
	         typename blockMapType,
	         typename AggrBck>
incardon's avatar
incardon committed
473 474 475 476 477 478 479
	__device__ static inline void load(const AggrWrapperT &block,
							SharedPtrT * sharedRegionPtr,
							const vector_type & ghostLayerToThreadsMapping,
							const vector_type2 & nn_blocks,
							const blockMapType & blockMap,
							unsigned int stencilSupportRadius,
							unsigned int ghostLayerSize,
incardon's avatar
incardon committed
480 481
							const unsigned int blockId,
							AggrBck & bck)
incardon's avatar
incardon committed
482 483 484 485 486
	{
		printf("Error to implement loadGhostBlock_impl with nLoop=%d \n",nLoop);
	}
};

incardon's avatar
incardon committed
487 488
template<unsigned int dim, typename AggregateBlockT, unsigned int pMask , unsigned int p, typename ct_params, unsigned int blockEdgeSize>
struct loadGhostBlock_impl<1,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize>
incardon's avatar
incardon committed
489
{
incardon's avatar
incardon committed
490 491 492 493 494 495
	template<typename AggrWrapperT,
			 typename SharedPtrT,
			 typename vector_type,
			 typename vector_type2,
			 typename blockMapType,
			 typename AggrBck>
incardon's avatar
incardon committed
496 497 498 499 500 501 502
	__device__ static inline void load(const AggrWrapperT &block,
							SharedPtrT * sharedRegionPtr,
							const vector_type & ghostLayerToThreadsMapping,
							const vector_type2 & nn_blocks,
							const blockMapType & blockMap,
							unsigned int stencilSupportRadius,
							unsigned int ghostLayerSize,
incardon's avatar
incardon committed
503 504
							const unsigned int blockIdPos,
							AggrBck & bck)
incardon's avatar
incardon committed
505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534
	{
			typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;

			const int pos = threadIdx.x % ghostLayerSize;

			__shared__ int neighboursPos[ct_params::nNN];

			const unsigned int edge = blockEdgeSize + 2*stencilSupportRadius;
			short int neighbourNum = ghostLayerToThreadsMapping.template get<nt>(pos);

			// Convert pos into a linear id accounting for the inner domain offsets
			const unsigned int linId = ghostLayerToThreadsMapping.template get<gt>(pos);
			// Now get linear offset wrt the first element of the block

			int ctr = linId;
			unsigned int acc = 1;
			unsigned int offset = 0;
			for (int i = 0; i < dim; ++i)
			{
				int v = (ctr % edge) - stencilSupportRadius;
				v = (v < 0)?(v + blockEdgeSize):v;
				v = (v >= blockEdgeSize)?v-blockEdgeSize:v;
				offset += v*acc;
				ctr /= edge;
				acc *= blockEdgeSize;
			}

			// Convert pos into a linear id accounting for the ghost offsets
			unsigned int coord[dim];
			linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
incardon's avatar
incardon committed
535
			const unsigned int linId2 = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
incardon's avatar
incardon committed
536 537

			unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN));
incardon's avatar
incardon committed
538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554

			if (threadIdx.x < ct_params::nNN)
			{
				neighboursPos[threadIdx.x] = nnb;
			}

			__syncthreads();

			// Actually load the data into the shared region
			auto nPos = neighboursPos[neighbourNum];

			auto gdata = blockMap.template get_ele<p>(nPos)[offset];

			// Actually load the data into the shared region
			//ScalarT *basePtr = (ScalarT *)sharedRegionPtr;
			auto bdata = block.template get<p>()[threadIdx.x];

incardon's avatar
incardon committed
555 556 557 558 559 560
			auto bmask = block.template get<pMask>()[threadIdx.x];
			auto gmask = blockMap.template get_ele<pMask>(nPos)[offset];

			if (bmask == 0)	{ set_compile_condition<pMask != p>::template set<p>(bdata,bck);}
			if (gmask == 0)	{ set_compile_condition<pMask != p>::template set<p>(gdata,bck);}

incardon's avatar
incardon committed
561 562 563 564 565
			sharedRegionPtr[linId] = gdata;
			sharedRegionPtr[linId2] = bdata;
	}
};

incardon's avatar
incardon committed
566 567
template<unsigned int dim, typename AggregateBlockT, unsigned int pMask , unsigned int p, typename ct_params, unsigned int blockEdgeSize>
struct loadGhostBlock_impl<2,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize>
568
{
incardon's avatar
incardon committed
569 570 571 572 573 574
    template<typename AggrWrapperT,
    		 typename SharedPtrT,
    		 typename vector_type,
    		 typename vector_type2,
    		 typename blockMapType,
    		 typename AggrBck>
575 576 577 578 579 580 581
    __device__ static inline void load(const AggrWrapperT &block,
                                       SharedPtrT * sharedRegionPtr,
                                       const vector_type & ghostLayerToThreadsMapping,
                                       const vector_type2 & nn_blocks,
                                       const blockMapType & blockMap,
                                       unsigned int stencilSupportRadius,
                                       unsigned int ghostLayerSize,
incardon's avatar
incardon committed
582 583
                                       const unsigned int blockIdPos,
                                       AggrBck & bck)
584 585 586 587
    {
        typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;

        const int pos = threadIdx.x % ghostLayerSize;
incardon's avatar
incardon committed
588
        const int pos_d1 = (threadIdx.x + blockDim.x) % ghostLayerSize;
589 590 591 592 593

        __shared__ int neighboursPos[ct_params::nNN];

        const unsigned int edge = blockEdgeSize + 2*stencilSupportRadius;
        short int neighbourNum = ghostLayerToThreadsMapping.template get<nt>(pos);
incardon's avatar
incardon committed
594
        short int neighbourNum2 = ghostLayerToThreadsMapping.template get<nt>(pos_d1);
595 596 597

        // Convert pos into a linear id accounting for the inner domain offsets
        const unsigned int linId = ghostLayerToThreadsMapping.template get<gt>(pos);
incardon's avatar
incardon committed
598
        const unsigned int linId2 = ghostLayerToThreadsMapping.template get<gt>(pos_d1);
599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624
        // Now get linear offset wrt the first element of the block

        int ctr = linId;
        int ctr2 = linId2;
        unsigned int acc = 1;
        unsigned int offset = 0;
        unsigned int offset2 = 0;
        for (int i = 0; i < dim; ++i)
        {
            int v = (ctr % edge) - stencilSupportRadius;
            int v2 = (ctr2 % edge) - stencilSupportRadius;
            v = (v < 0)?(v + blockEdgeSize):v;
            v2 = (v2 < 0)?(v2 + blockEdgeSize):v2;
            v = (v >= blockEdgeSize)?v-blockEdgeSize:v;
            v2 = (v2 >= blockEdgeSize)?v2-blockEdgeSize:v2;
            offset += v*acc;
            offset2 += v2*acc;
            ctr /= edge;
            ctr2 /= edge;
            acc *= blockEdgeSize;
        }

        // Convert pos into a linear id accounting for the ghost offsets
        unsigned int coord[dim];
        linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
        const int linId_b = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
incardon's avatar
incardon committed
625 626 627
//        const unsigned int linId_b = shift_position<dim,blockEdgeSize>::shift(threadIdx.x,stencilSupportRadius);

//        printf("AAA %d %d \n",linId_b,linId_b_test);
628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648

        unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN));

        if (threadIdx.x < ct_params::nNN)
        {
            neighboursPos[threadIdx.x] = nnb;
        }

        __syncthreads();

        // Actually load the data into the shared region
        auto nPos = neighboursPos[neighbourNum];
        auto nPos2 = neighboursPos[neighbourNum2];

        auto gdata = blockMap.template get_ele<p>(nPos)[offset];
        auto gdata2 = blockMap.template get_ele<p>(nPos2)[offset2];

        // Actually load the data into the shared region
        //ScalarT *basePtr = (ScalarT *)sharedRegionPtr;
        auto bdata = block.template get<p>()[threadIdx.x];

incardon's avatar
incardon committed
649 650 651 652 653 654 655 656 657 658 659
        auto gmask = blockMap.template get_ele<pMask>(nPos)[offset];
        auto gmask2 = blockMap.template get_ele<pMask>(nPos2)[offset2];

        // Actually load the data into the shared region
        //ScalarT *basePtr = (ScalarT *)sharedRegionPtr;
        auto bmask = block.template get<pMask>()[threadIdx.x];

		if (bmask == 0)	{set_compile_condition<pMask != p>::template set<p>(bdata,bck);}
		if (gmask == 0)	{set_compile_condition<pMask != p>::template set<p>(gdata,bck);}
		if (gmask2 == 0)	{set_compile_condition<pMask != p>::template set<p>(gdata2,bck);}

660 661 662 663 664 665
        sharedRegionPtr[linId] = gdata;
        sharedRegionPtr[linId2] = gdata2;
        sharedRegionPtr[linId_b] = bdata;
    }
};

incardon's avatar
incardon committed
666 667
template<unsigned int dim, typename AggregateBlockT, unsigned int pMask , unsigned int p, typename ct_params, unsigned int blockEdgeSize>
struct loadGhostBlock_impl<3,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize>
668
{
incardon's avatar
incardon committed
669 670 671 672 673 674
    template<typename AggrWrapperT,
             typename SharedPtrT,
             typename vector_type,
             typename vector_type2,
             typename blockMapType,
             typename AggrBck>
675 676 677 678 679 680 681
    __device__ static inline void load(const AggrWrapperT &block,
                                       SharedPtrT * sharedRegionPtr,
                                       const vector_type & ghostLayerToThreadsMapping,
                                       const vector_type2 & nn_blocks,
                                       const blockMapType & blockMap,
                                       unsigned int stencilSupportRadius,
                                       unsigned int ghostLayerSize,
incardon's avatar
incardon committed
682 683
                                       const unsigned int blockIdPos,
                                       AggrBck & bck)
684 685 686 687
    {
        typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;

        const int pos = threadIdx.x % ghostLayerSize;
incardon's avatar
incardon committed
688
        const int pos_d1 = (threadIdx.x + 2*blockDim.x) % ghostLayerSize;
689 690 691 692 693 694

        __shared__ int neighboursPos[ct_params::nNN];

        const unsigned int edge = blockEdgeSize + 2*stencilSupportRadius;
        short int neighbourNum = ghostLayerToThreadsMapping.template get<nt>(pos);
        short int neighbourNum2 = ghostLayerToThreadsMapping.template get<nt>(pos + blockDim.x);
incardon's avatar
incardon committed
695
        short int neighbourNum3 = ghostLayerToThreadsMapping.template get<nt>(pos_d1);
696 697 698 699

        // Convert pos into a linear id accounting for the inner domain offsets
        const unsigned int linId = ghostLayerToThreadsMapping.template get<gt>(pos);
        const unsigned int linId2 = ghostLayerToThreadsMapping.template get<gt>(pos + blockDim.x);
incardon's avatar
incardon committed
700
        const unsigned int linId3 = ghostLayerToThreadsMapping.template get<gt>(pos_d1);
701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733
        // Now get linear offset wrt the first element of the block

        int ctr = linId;
        int ctr2 = linId2;
        int ctr3 = linId3;
        unsigned int acc = 1;
        unsigned int offset = 0;
        unsigned int offset2 = 0;
        unsigned int offset3 = 0;
        for (int i = 0; i < dim; ++i)
        {
            int v = (ctr % edge) - stencilSupportRadius;
            int v2 = (ctr2 % edge) - stencilSupportRadius;
            int v3 = (ctr3 % edge) - stencilSupportRadius;
            v = (v < 0)?(v + blockEdgeSize):v;
            v2 = (v2 < 0)?(v2 + blockEdgeSize):v2;
            v3 = (v3 < 0)?(v3 + blockEdgeSize):v3;
            v = (v >= blockEdgeSize)?v-blockEdgeSize:v;
            v2 = (v2 >= blockEdgeSize)?v2-blockEdgeSize:v2;
            v3 = (v3 >= blockEdgeSize)?v3-blockEdgeSize:v3;
            offset += v*acc;
            offset2 += v2*acc;
            offset3 += v3*acc;
            ctr /= edge;
            ctr2 /= edge;
            ctr3 /= edge;
            acc *= blockEdgeSize;
        }

        // Convert pos into a linear id accounting for the ghost offsets
        unsigned int coord[dim];
        linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
        const int linId_b = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
incardon's avatar
incardon committed
734 735 736
//        const unsigned int linId_b = shift_position<dim,blockEdgeSize>::shift(threadIdx.x,stencilSupportRadius);

//        printf("AAA %d %d \n",linId_b,linId_b_test);
737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759

        unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN));

        if (threadIdx.x < ct_params::nNN)
        {
            neighboursPos[threadIdx.x] = nnb;
        }

        __syncthreads();

        // Actually load the data into the shared region
        auto nPos = neighboursPos[neighbourNum];
        auto nPos2 = neighboursPos[neighbourNum2];
        auto nPos3 = neighboursPos[neighbourNum3];

        auto gdata = blockMap.template get_ele<p>(nPos)[offset];
        auto gdata2 = blockMap.template get_ele<p>(nPos2)[offset2];
        auto gdata3 = blockMap.template get_ele<p>(nPos3)[offset3];

        // Actually load the data into the shared region
        //ScalarT *basePtr = (ScalarT *)sharedRegionPtr;
        auto bdata = block.template get<p>()[threadIdx.x];

incardon's avatar
incardon committed
760 761 762 763 764 765 766 767 768 769 770 771 772
        auto gmask = blockMap.template get_ele<pMask>(nPos)[offset];
        auto gmask2 = blockMap.template get_ele<pMask>(nPos2)[offset2];
        auto gmask3 = blockMap.template get_ele<pMask>(nPos3)[offset3];

        // Actually load the data into the shared region
        //ScalarT *basePtr = (ScalarT *)sharedRegionPtr;
        auto bmask = block.template get<pMask>()[threadIdx.x];

		if (bmask == 0)	{set_compile_condition<pMask != p>::template set<p>(bdata,bck);}
		if (gmask == 0)	{set_compile_condition<pMask != p>::template set<p>(gdata,bck);}
		if (gmask2 == 0)	{set_compile_condition<pMask != p>::template set<p>(gdata2,bck);}
		if (gmask3 == 0)	{set_compile_condition<pMask != p>::template set<p>(gdata3,bck);}

773 774 775 776 777 778
        sharedRegionPtr[linId] = gdata;
        sharedRegionPtr[linId2] = gdata2;
        sharedRegionPtr[linId3] = gdata3;
        sharedRegionPtr[linId_b] = bdata;
    }
};
incardon's avatar
incardon committed
779

incardon's avatar
incardon committed
780 781
template<unsigned int dim, typename AggregateBlockT, unsigned int pMask , unsigned int p, typename ct_params, unsigned int blockEdgeSize>
struct loadGhostBlock_impl<7,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize>
782
{
incardon's avatar
incardon committed
783 784 785 786 787 788
    template<typename AggrWrapperT,
             typename SharedPtrT,
             typename vector_type,
             typename vector_type2,
             typename blockMapType,
             typename AggrBck>
789 790 791 792 793 794 795
    __device__ static inline void load(const AggrWrapperT &block,
                                       SharedPtrT * sharedRegionPtr,
                                       const vector_type & ghostLayerToThreadsMapping,
                                       const vector_type2 & nn_blocks,
                                       const blockMapType & blockMap,
                                       unsigned int stencilSupportRadius,
                                       unsigned int ghostLayerSize,
incardon's avatar
incardon committed
796 797
                                       const unsigned int blockIdPos,
                                       AggrBck & bck)
798 799 800 801
    {
        typedef ScalarTypeOf<AggregateBlockT, p> ScalarT;

        const int pos = threadIdx.x % ghostLayerSize;
incardon's avatar
incardon committed
802
        const int pos_d1 = (threadIdx.x + 6*blockDim.x) % ghostLayerSize;
803 804 805 806 807 808 809 810 811 812

        __shared__ int neighboursPos[ct_params::nNN];

        const unsigned int edge = blockEdgeSize + 2*stencilSupportRadius;
        short int neighbourNum = ghostLayerToThreadsMapping.template get<nt>(pos);
        short int neighbourNum2 = ghostLayerToThreadsMapping.template get<nt>(pos + blockDim.x);
        short int neighbourNum3 = ghostLayerToThreadsMapping.template get<nt>(pos + 2*blockDim.x);
        short int neighbourNum4 = ghostLayerToThreadsMapping.template get<nt>(pos + 3*blockDim.x);
        short int neighbourNum5 = ghostLayerToThreadsMapping.template get<nt>(pos + 4*blockDim.x);
        short int neighbourNum6 = ghostLayerToThreadsMapping.template get<nt>(pos + 5*blockDim.x);
incardon's avatar
incardon committed
813
        short int neighbourNum7 = ghostLayerToThreadsMapping.template get<nt>(pos_d1);
814 815 816 817 818 819 820 821

        // Convert pos into a linear id accounting for the inner domain offsets
        const unsigned int linId = ghostLayerToThreadsMapping.template get<gt>(pos);
        const unsigned int linId2 = ghostLayerToThreadsMapping.template get<gt>(pos + blockDim.x);
        const unsigned int linId3 = ghostLayerToThreadsMapping.template get<gt>(pos + 2*blockDim.x);
        const unsigned int linId4 = ghostLayerToThreadsMapping.template get<gt>(pos + 3*blockDim.x);
        const unsigned int linId5 = ghostLayerToThreadsMapping.template get<gt>(pos + 4*blockDim.x);
        const unsigned int linId6 = ghostLayerToThreadsMapping.template get<gt>(pos + 5*blockDim.x);
incardon's avatar
incardon committed
822
        const unsigned int linId7 = ghostLayerToThreadsMapping.template get<gt>(pos_d1);
823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883
        // Now get linear offset wrt the first element of the block

        int ctr = linId;
        int ctr2 = linId2;
        int ctr3 = linId3;
        int ctr4 = linId4;
        int ctr5 = linId5;
        int ctr6 = linId6;
        int ctr7 = linId7;
        unsigned int acc = 1;
        unsigned int offset = 0;
        unsigned int offset2 = 0;
        unsigned int offset3 = 0;
        unsigned int offset4 = 0;
        unsigned int offset5 = 0;
        unsigned int offset6 = 0;
        unsigned int offset7 = 0;
        for (int i = 0; i < dim; ++i)
        {
            int v = (ctr % edge) - stencilSupportRadius;
            int v2 = (ctr2 % edge) - stencilSupportRadius;
            int v3 = (ctr3 % edge) - stencilSupportRadius;
            int v4 = (ctr4 % edge) - stencilSupportRadius;
            int v5 = (ctr5 % edge) - stencilSupportRadius;
            int v6 = (ctr6 % edge) - stencilSupportRadius;
            int v7 = (ctr7 % edge) - stencilSupportRadius;
            v = (v < 0)?(v + blockEdgeSize):v;
            v2 = (v2 < 0)?(v2 + blockEdgeSize):v2;
            v3 = (v3 < 0)?(v3 + blockEdgeSize):v3;
            v4 = (v4 < 0)?(v4 + blockEdgeSize):v4;
            v5 = (v5 < 0)?(v5 + blockEdgeSize):v5;
            v6 = (v6 < 0)?(v6 + blockEdgeSize):v6;
            v7 = (v7 < 0)?(v7 + blockEdgeSize):v7;
            v = (v >= blockEdgeSize)?v-blockEdgeSize:v;
            v2 = (v2 >= blockEdgeSize)?v2-blockEdgeSize:v2;
            v3 = (v3 >= blockEdgeSize)?v3-blockEdgeSize:v3;
            v4 = (v4 >= blockEdgeSize)?v4-blockEdgeSize:v4;
            v5 = (v5 >= blockEdgeSize)?v5-blockEdgeSize:v5;
            v6 = (v6 >= blockEdgeSize)?v6-blockEdgeSize:v6;
            v7 = (v7 >= blockEdgeSize)?v7-blockEdgeSize:v7;
            offset += v*acc;
            offset2 += v2*acc;
            offset3 += v3*acc;
            offset4 += v4*acc;
            offset5 += v5*acc;
            offset6 += v6*acc;
            offset7 += v7*acc;
            ctr /= edge;
            ctr2 /= edge;
            ctr3 /= edge;
            ctr4 /= edge;
            ctr5 /= edge;
            ctr6 /= edge;
            ctr7 /= edge;
            acc *= blockEdgeSize;
        }

        // Convert pos into a linear id accounting for the ghost offsets
        unsigned int coord[dim];
        linToCoordWithOffset<blockEdgeSize>(threadIdx.x, stencilSupportRadius, coord);
        const int linId_b = coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
incardon's avatar
incardon committed
884 885 886
//        const unsigned int linId_b = shift_position<dim,blockEdgeSize>::shift(threadIdx.x,stencilSupportRadius);

//        printf("AAA %d %d \n",linId_b,linId_b_test);
887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917

        unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN));

        if (threadIdx.x < ct_params::nNN)
        {
            neighboursPos[threadIdx.x] = nnb;
        }

        __syncthreads();

        // Actually load the data into the shared region
        auto nPos = neighboursPos[neighbourNum];
        auto nPos2 = neighboursPos[neighbourNum2];
        auto nPos3 = neighboursPos[neighbourNum3];
        auto nPos4 = neighboursPos[neighbourNum4];
        auto nPos5 = neighboursPos[neighbourNum5];
        auto nPos6 = neighboursPos[neighbourNum6];
        auto nPos7 = neighboursPos[neighbourNum7];

        auto gdata = blockMap.template get_ele<p>(nPos)[offset];
        auto gdata2 = blockMap.template get_ele<p>(nPos2)[offset2];
        auto gdata3 = blockMap.template get_ele<p>(nPos3)[offset3];
        auto gdata4 = blockMap.template get_ele<p>(nPos4)[offset4];
        auto gdata5 = blockMap.template get_ele<p>(nPos5)[offset5];
        auto gdata6 = blockMap.template get_ele<p>(nPos6)[offset6];
        auto gdata7 = blockMap.template get_ele<p>(nPos7)[offset7];

        // Actually load the data into the shared region
        //ScalarT *basePtr = (ScalarT *)sharedRegionPtr;
        auto bdata = block.template get<p>()[threadIdx.x];

incardon's avatar
incardon committed
918 919 920 921 922 923 924 925 926
        auto bmask = block.template get<pMask>()[threadIdx.x];
        auto gmask = blockMap.template get_ele<pMask>(nPos)[offset];
        auto gmask2 = blockMap.template get_ele<pMask>(nPos2)[offset2];
        auto gmask3 = blockMap.template get_ele<pMask>(nPos3)[offset3];
        auto gmask4 = blockMap.template get_ele<pMask>(nPos4)[offset4];
        auto gmask5 = blockMap.template get_ele<pMask>(nPos5)[offset5];
        auto gmask6 = blockMap.template get_ele<pMask>(nPos6)[offset6];
        auto gmask7 = blockMap.template get_ele<pMask>(nPos7)[offset7];

927 928 929 930 931 932 933 934
		if (bmask == 0)	{set_compile_condition<pMask != p>::template set<p>(bdata,bck);}
		if (gmask == 0)	{set_compile_condition<pMask != p>::template set<p>(gdata,bck);}
		if (gmask2 == 0)	{set_compile_condition<pMask != p>::template set<p>(gdata2,bck);}
		if (gmask3 == 0)	{set_compile_condition<pMask != p>::template set<p>(gdata3,bck);}
		if (gmask4 == 0)	{set_compile_condition<pMask != p>::template set<p>(gdata4,bck);}
		if (gmask5 == 0)	{set_compile_condition<pMask != p>::template set<p>(gdata5,bck);}
		if (gmask6 == 0)	{set_compile_condition<pMask != p>::template set<p>(gdata6,bck);}
		if (gmask7 == 0)	{set_compile_condition<pMask != p>::template set<p>(gdata7,bck);}
incardon's avatar
incardon committed
935

936 937 938 939 940 941 942 943 944 945 946
        sharedRegionPtr[linId] = gdata;
        sharedRegionPtr[linId2] = gdata2;
        sharedRegionPtr[linId3] = gdata3;
        sharedRegionPtr[linId4] = gdata4;
        sharedRegionPtr[linId5] = gdata5;
        sharedRegionPtr[linId6] = gdata6;
        sharedRegionPtr[linId7] = gdata7;
        sharedRegionPtr[linId_b] = bdata;
    }
};

incardon's avatar
incardon committed
947
#endif /* SPARSEGRIDGPU_KER_UTIL_HPP_ */