diff --git a/CMakeLists.txt b/CMakeLists.txt index fffdd0c889d7bf142f675cd6a7e4b2d203dba38c..57e0ebf829f0a191f09b04ebc5404117306573f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,6 @@ endif() set (CMAKE_CXX_STANDARD 14) set (CMAKE_CUDA_STANDARD 14) - set(Vc_DIR "${Vc_ROOT}/lib/cmake/Vc/") message("Searching Vc in ${Vc_DIR}") diff --git a/src/Grid/cuda/cuda_grid_gpu_int.cu b/src/Grid/cuda/cuda_grid_gpu_int.cu index abe26d5250713c2221583a5a90f0cb9991ea8b44..aaab8292a3a1e4ce879c88c981f1d3dc896597d8 100644 --- a/src/Grid/cuda/cuda_grid_gpu_int.cu +++ b/src/Grid/cuda/cuda_grid_gpu_int.cu @@ -70,7 +70,7 @@ __global__ void test_launch_cuda_native(float * scalar, float * vector, int sxy, constexpr int NN_num = 4; -template +/*template __global__ void test_launch_cell_list(celllist_type cell, ite_gpu<3> ite_gpu) { GRID_ID_3(ite_gpu) @@ -98,7 +98,7 @@ __global__ void test_launch_cell_list(celllist_type cell, ite_gpu<3> ite_gpu) } printf("CELLLIST %d %d %d nn_part: %d NN: %d %d %d %d \n",(int)key.get(0),(int)key.get(1),(int)key.get(2),nn_part,NN[0],NN[1],NN[2],NN[3]); -} +}*/ BOOST_AUTO_TEST_SUITE( grid_gpu_func_interp ) @@ -107,7 +107,6 @@ BOOST_AUTO_TEST_CASE (gpu_p2m) openfpm::vector_gpu> pos; openfpm::vector_gpu> prop; - pos.resize(100); prop.resize(100); diff --git a/src/Grid/grid_base_implementation.hpp b/src/Grid/grid_base_implementation.hpp index 821f8e2ac804d27bb453d81a9c864509c2e4ecb8..0bcf49873d505a7f3225b2a835a6a4943c1c5b21 100644 --- a/src/Grid/grid_base_implementation.hpp +++ b/src/Grid/grid_base_implementation.hpp @@ -1256,6 +1256,14 @@ public: } } + /*! \brief Indicate that unpacking the header is supported + * + * \return false + * + */ + static bool is_unpack_header_supported() + {return false;} + /*! \brief Resize the grid * * Resize the grid to the old information is retained on the new grid, diff --git a/src/Grid/map_grid.hpp b/src/Grid/map_grid.hpp index cbb7ba1d4705877412e9508bbd9251b74edaa956..91355de35731e356dcf805e7ac8cb26d206c4e8c 100755 --- a/src/Grid/map_grid.hpp +++ b/src/Grid/map_grid.hpp @@ -163,6 +163,26 @@ public: { } + /*! \brief Stub does not do anything + * + */ + template + static void unpack_headers(pointers_type & pointers, headers_type & headers, result_type & result, int n_slot) + {} + + template + void unpack_with_headers(ExtPreAlloc & mem, + ite_type & sub_it, + header_type & headers, + int ih, + Unpack_stat & ps, + context_type &context, + rem_copy_opt opt = rem_copy_opt::NONE_OPT) + {} + /*! \brief It copy a grid * * \param g grid to copy @@ -743,6 +763,12 @@ public: { } + /*! \brief Stub does not do anything + * + */ + static void unpack_headers() + {} + /*! \brief Fill the memory with a byte * */ diff --git a/src/Packer_Unpacker/Unpacker.hpp b/src/Packer_Unpacker/Unpacker.hpp index 07ffeb0480a972dd09a1a3dd8c0a95caffa33f76..3d1257015dd30d03b271affc0384285f20f93f22 100644 --- a/src/Packer_Unpacker/Unpacker.hpp +++ b/src/Packer_Unpacker/Unpacker.hpp @@ -266,6 +266,21 @@ public: { obj.template unpack(mem, sub_it, ps, context, opt); } + + template static void unpack_with_header(ExtPreAlloc & mem, + grid_sub_it_type & sub_it, + T & obj, + header_type & header, + int ih, + Unpack_stat & ps, + context_type & context, + rem_copy_opt opt) + { + obj.template unpack_with_headers(mem, sub_it, header, ih, ps, context, opt); + } }; diff --git a/src/Space/Shape/Point.hpp b/src/Space/Shape/Point.hpp index bf2d657c8df5fdd3fe916e3ea67313b0eaba2407..687a0ad17e5eb50efbce57e0a1d2c1880d56b26e 100644 --- a/src/Space/Shape/Point.hpp +++ b/src/Space/Shape/Point.hpp @@ -100,7 +100,7 @@ template class Point * \param p Point * */ - template inline Point(const Point & p) + template __device__ __host__ inline Point(const Point & p) { for (size_t i = 0 ; i < dim ; i++) {get(i) = static_cast(p.get(i));} diff --git a/src/SparseGrid/SparseGrid.hpp b/src/SparseGrid/SparseGrid.hpp index a2477d5e26e87ae896f7ced8ddf68d79ddb3d4a3..5f177c008c7b8adcb4b5b7f8b77d2d1953909611 100644 --- a/src/SparseGrid/SparseGrid.hpp +++ b/src/SparseGrid/SparseGrid.hpp @@ -987,6 +987,26 @@ public: return sparse_grid_bck_value::type>(chunks.get(0)); } + /*! \brief Stub does not do anything + * + */ + template + static void unpack_headers(pointers_type & pointers, headers_type & headers, result_type & result, int n_slot) + {} + + template + void unpack_with_headers(ExtPreAlloc & mem, + ite_type & sub_it, + header_type & headers, + int ih, + Unpack_stat & ps, + context_type &context, + rem_copy_opt opt = rem_copy_opt::NONE_OPT) + {} + /*! \brief Get the background value * * \return background value @@ -1093,6 +1113,14 @@ public: return get_selector< typename boost::mpl::at>::type >::template get_const

(chunks,active_cnk,sub_id); } + /*! \brief Indicate that unpacking the header is supported + * + * \return false + * + */ + static bool is_unpack_header_supported() + {return false;} + /*! \brief Get the reference of the selected element * * \param v1 grid_key that identify the element in the grid diff --git a/src/SparseGridGpu/Geometry/grid_zmb.hpp b/src/SparseGridGpu/Geometry/grid_zmb.hpp index e6bd19e6b9933b8d3a2361e9277f15a361ef7bff..cf719c0659265c55f6299e061dd30bb37f075e6d 100644 --- a/src/SparseGridGpu/Geometry/grid_zmb.hpp +++ b/src/SparseGridGpu/Geometry/grid_zmb.hpp @@ -22,7 +22,7 @@ * tuned for blocked data. */ template -class grid_zmb : private grid_smb +class grid_zmb : private grid_smb { public: @@ -30,34 +30,34 @@ public: grid_zmb() {} __host__ __device__ grid_zmb(const size_t (& sz)[dim]) - :grid_smb(sz) + :grid_smb(sz) {} __host__ __device__ grid_zmb(const size_t domainBlockEdgeSize) - :grid_smb(domainBlockEdgeSize) + :grid_smb(domainBlockEdgeSize) {} template __host__ __device__ grid_zmb(const grid_sm blockGrid) - :grid_smb(blockGrid) + :grid_smb(blockGrid) {} #ifdef __NVCC__ //Constructors from dim3 and uint3 objects __host__ __device__ grid_zmb(const dim3 blockDimensions) - :grid_smb(blockDimensions) + :grid_smb(blockDimensions) {} #endif // __NVCC__ __host__ __device__ grid_zmb(const grid_zmb &other) - :grid_smb(other) + :grid_smb(other) {} __host__ __device__ grid_zmb &operator=(const grid_zmb &other) { - ((grid_smb *)this)->operator=(other); + ((grid_smb *)this)->operator=(other); return *this; } @@ -115,7 +115,7 @@ public: __host__ __device__ const indexT (& getSize() const)[dim] { - return grid_smb::getSize(); + return grid_smb::getSize(); } @@ -123,12 +123,12 @@ public: template inline __host__ __device__ grid_key_dx getGlobalCoord(const grid_key_dx & blockCoord, unsigned int offset) const { - return grid_smb::getGlobalCoord(blockCoord,offset); + return grid_smb::getGlobalCoord(blockCoord,offset); } inline indexT getBlockSize() const { - return grid_smb::getBlockSize(); + return grid_smb::getBlockSize(); } }; diff --git a/src/SparseGridGpu/SparseGridGpu.hpp b/src/SparseGridGpu/SparseGridGpu.hpp index ed8983e42c146146fabb75f99531d2d7b9d6c233..19e2834aea36f934022d0ef09e5a9ebf530da8aa 100644 --- a/src/SparseGridGpu/SparseGridGpu.hpp +++ b/src/SparseGridGpu/SparseGridGpu.hpp @@ -602,6 +602,9 @@ private: //! Memory to remove copy finalize ExtPreAlloc * prAlloc_prp; + //! shifts for chunk conversion + openfpm::vector_gpu> shifts; + bool findNN = false; inline void swap_internal_remote() @@ -1099,7 +1102,7 @@ private: auto & o_map = this->getSegmentToOutMap(); auto & segments_data = this->getSegmentToMergeIndexMap(); - new_map.resize(a_map.size()); + new_map.resize(a_map.size(),0); // construct new to old map @@ -1386,7 +1389,7 @@ private: if (n_cnk != 0) { - openfpm::vector_gpu> shifts; + shifts.clear(); int n_shift = 1; shifts.add(); @@ -3152,6 +3155,99 @@ public: grid_src.copySect.add(sgs); } + /*! \brief Stub does not do anything + * + */ + template + static void unpack_headers(pointers_type & pointers, headers_type & headers, result_type & result, int n_slot) + { + // we have to increment ps by the right amount + sparsegridgpu_pack_request spq; + boost::mpl::for_each_ref>(spq); + + result.allocate(sizeof(int)); + + CUDA_LAUNCH_DIM3((SparseGridGpuKernels::unpack_headers().toKernel())>),1,pointers.size(), + pointers.toKernel(), + headers.toKernel(), + (int *)result.getDevicePointer(), + spq.point_size, + n_slot) + } + + /*! \brief unpack the sub-grid object + * + * \tparam prp properties to unpack + * + * \param mem preallocated memory from where to unpack the object + * \param sub sub-grid iterator + * \param obj object where to unpack + * + */ + template + void unpack_with_headers(ExtPreAlloc & mem, + SparseGridGpu_iterator_sub & sub_it, + header_type & headers, + int ih, + Unpack_stat & ps, + mgpu::ofp_context_t &context, + rem_copy_opt opt = rem_copy_opt::NONE_OPT) + { + //////////////////////////////////////////////////////////// + + if ((opt & rem_copy_opt::KEEP_GEOMETRY) == false) + { + this->template addAndConvertPackedChunkToTmp(mem,sub_it,ps,context); + + // readjust mem + } + else + { + // we have to increment ps by the right amount + sparsegridgpu_pack_request spq; + boost::mpl::for_each_ref>(spq); + + // First get the number of chunks + + size_t n_cnk = headers.template get<1>(ih); + ps.addOffset(sizeof(size_t)); + ps.addOffset(2*dim*sizeof(unsigned int)); + + size_t actual_offset = n_cnk*sizeof(indexT); + unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT)); + + // Unpack number of points + // calculate the number of total points + size_t n_pnt = headers.template get<2>(ih); + actual_offset += align_number(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int)); + + void * data_base_ptr = (void *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset ); + + actual_offset += align_number(sizeof(indexT),n_pnt*(spq.point_size)); + short int * offsets = (short int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + actual_offset); + + actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(short)); + actual_offset += align_number(sizeof(indexT),n_pnt*sizeof(unsigned char)); + + scan_ptrs_cp.add(scan); + offset_ptrs_cp.add(offsets); + data_base_ptr_cp.add(data_base_ptr); + + ps.addOffset(actual_offset); + } + } + + /*! \brief Indicate that unpacking the header is supported + * + * \return true + * + */ + static bool is_unpack_header_supported() + {return true;} + /*! \brief unpack the sub-grid object * * \tparam prp properties to unpack @@ -3191,7 +3287,7 @@ public: Unpacker::unpack(mem,n_cnk,ps); // Unpack origin of the chunk indexing - for (int i = 0 ; i < dim ; i++) +/* for (int i = 0 ; i < dim ; i++) { int tmp; Unpacker::unpack(mem,tmp,ps); @@ -3201,7 +3297,9 @@ public: { int tmp; Unpacker::unpack(mem,tmp,ps); - } + }*/ + + ps.addOffset(2*dim*sizeof(unsigned int)); size_t actual_offset = n_cnk*sizeof(indexT); unsigned int * scan = (unsigned int *)((unsigned char *)mem.getDevicePointer() + ps.getOffset() + n_cnk*sizeof(indexT)); @@ -3465,7 +3563,7 @@ public: template::type::value, - unsigned int threadBlockSize = 128, + unsigned int threadBlockSize = default_edge::tb::value, typename indexT=long int, template class layout_base=memory_traits_inte, typename linearizer = grid_zmb> @@ -3474,11 +3572,19 @@ using SparseGridGpu_z = SparseGridGpu::type::value, - unsigned int threadBlockSize = 128, + unsigned int threadBlockSize = default_edge::tb::value, typename indexT=int, template class layout_base=memory_traits_inte, typename linearizer = grid_zmb> using SparseGridGpu_zi = SparseGridGpu; +template::type::value, + unsigned int threadBlockSize = default_edge::tb::value, + typename indexT=int, + template class layout_base=memory_traits_inte, + typename linearizer = grid_smb> +using SparseGridGpu_i = SparseGridGpu; #endif //OPENFPM_PDATA_SPARSEGRIDGPU_HPP diff --git a/src/SparseGridGpu/SparseGridGpu_ker.cuh b/src/SparseGridGpu/SparseGridGpu_ker.cuh index 3313e23564bfebd93e4bd88f99713185f896284a..d0eae1f20b601431311f95d691f60d5baef474b1 100644 --- a/src/SparseGridGpu/SparseGridGpu_ker.cuh +++ b/src/SparseGridGpu/SparseGridGpu_ker.cuh @@ -81,6 +81,44 @@ public: background(bck) {} + /*! \brief + * + * \param + * + */ + template + __device__ static int unpack_headers(headers_type & headers, unsigned char * data, int ih, int sz_pack) + { + size_t n_cnk; + if (sizeof(indexT) == 8) + {n_cnk = ((size_t *)data)[0];} + else + { + unsigned int dp1 = ((unsigned int *)data)[0]; + unsigned int dp2 = ((unsigned int *)&data[4])[0]; + n_cnk = (size_t)dp1 + ((size_t)dp2 << 32); + } + headers.template get<1>(ih) = n_cnk; +// for (int i = 0 ; i < dim ; i++) +// {headers.template get<3>(ih)[i] = data[2*sizeof(size_t) + i*sizeof(int)];} + +// for (int i = 0 ; i < dim ; i++) +// {headers.template get<4>(ih)[i] = data[2*sizeof(size_t) + dim*sizeof(int) + i*sizeof(int)];} + + size_t actual_offset = n_cnk*sizeof(indexT); + + unsigned int n_pnt = *(unsigned int *)&(data[sizeof(size_t) + 2*dim*sizeof(int) + actual_offset + n_cnk*sizeof(unsigned int)]); + headers.template get<2>(ih) = n_pnt; + + return sizeof(size_t) + // byte required to pack the number of chunk packed + 2*dim*sizeof(int) + // starting point + size of the indexing packing + sizeof(indexT)*n_cnk + // byte required to pack the chunk indexes + align_number_device(sizeof(indexT),(n_cnk+1)*sizeof(unsigned int)) + // byte required to pack the scan of the chunk point + align_number_device(sizeof(indexT),n_pnt*sz_pack) + // byte required to pack data + align_number_device(sizeof(indexT),n_pnt*sizeof(short int)) + // byte required to pack offsets + align_number_device(sizeof(indexT),n_pnt*sizeof(unsigned char)); // byte required to pack masks; + } + /*! \brief Get the coordinate of the block and the offset id inside the block it give the global coordinate * * \param blockCoord block coordinate @@ -299,6 +337,8 @@ public: unsigned int coord[dim]; linToCoordWithOffset(pos, stencilSupportRadius, coord); const unsigned int linId = coordToLin(coord, stencilSupportRadius); +// const unsigned int linId = shift_position::shift(pos,stencilSupportRadius); + return linId; } @@ -316,6 +356,8 @@ public: unsigned int coord[dim]; linToCoordWithOffset(offset, stencilSupportRadius, coord); return coordToLin(coord, stencilSupportRadius); + +// return shift_position::shift(offset,stencilSupportRadius); } template @@ -714,6 +756,9 @@ private: unsigned int coord[dim]; linToCoordWithOffset(pos, stencilSupportRadius, coord); const unsigned int linId = coordToLin(coord, stencilSupportRadius); + +// const unsigned int linId = shift_position::shift(pos,stencilSupportRadius); + // Actually load the data into the shared region //ScalarT *basePtr = (ScalarT *)sharedRegionPtr; @@ -846,6 +891,8 @@ private: unsigned int coord[dim]; linToCoordWithOffset(pos, stencilSupportRadius, coord); const unsigned int linId = coordToLin(coord, stencilSupportRadius); +// const unsigned int linId = shift_position::shift(pos,stencilSupportRadius); + // Actually store the data from the shared region ScalarT *basePtr = (ScalarT *)sharedRegionPtr; diff --git a/src/SparseGridGpu/SparseGridGpu_ker_util.hpp b/src/SparseGridGpu/SparseGridGpu_ker_util.hpp index 67d801da5e6db2931892896b0e9ef4c9a9bee6ea..cf94c556976e107922dffdda2ead534b6b5e72ca 100644 --- a/src/SparseGridGpu/SparseGridGpu_ker_util.hpp +++ b/src/SparseGridGpu/SparseGridGpu_ker_util.hpp @@ -35,6 +35,54 @@ struct cross_stencil T xp[dim]; }; +/*template +struct shift_position +{ + __device__ static inline int shift(int pos, int stencilRadius) + { + int accu = 1; + int pos_s = 0; + for (int i = 0 ; i < dim ; i++) + { + pos_s += (pos % block_edge_size + stencilRadius)*accu; + accu *= (block_edge_size + 2*stencilRadius); + pos /= block_edge_size; + } + + return pos_s; + } +}; + +template +struct shift_position<2,block_edge_size> +{ + __device__ static inline int shift(int pos, int stencilRadius) + { + unsigned int x = pos % block_edge_size; + unsigned int y = (pos / block_edge_size); + + unsigned int g_sz = block_edge_size + 2*stencilRadius; + + return (x+stencilRadius) + (y+stencilRadius)*g_sz; + } +}; + + +template +struct shift_position<3,block_edge_size> +{ + __device__ static inline int shift(int pos, int stencilRadius) + { + unsigned int x = pos % block_edge_size; + unsigned int y = (pos / block_edge_size) % block_edge_size; + unsigned int z = (pos / (block_edge_size*block_edge_size)); + + unsigned int g_sz = block_edge_size + 2*stencilRadius; + + return (x+stencilRadius) + (y+stencilRadius)*g_sz + (z+stencilRadius)*g_sz*g_sz; + } +};*/ + template struct NNStar { @@ -53,7 +101,12 @@ struct NNStar unsigned int d = offset/2; int dPos = blockCoord.get(d) + (offset%2)*2 - 1; blockCoord.set_d(d, dPos); - neighbourPos = blockMap.get_sparse(sparseGrid.getBlockLinId(blockCoord)).id; + + int bl = sparseGrid.getBlockLinId(blockCoord); + + bl = (dPos < 0)?-1:bl; + + neighbourPos = blockMap.get_sparse(bl).id; } return neighbourPos; } @@ -479,7 +532,7 @@ struct loadGhostBlock_impl<1,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize // Convert pos into a linear id accounting for the ghost offsets unsigned int coord[dim]; linToCoordWithOffset(threadIdx.x, stencilSupportRadius, coord); - const int linId2 = coordToLin(coord, stencilSupportRadius); + const unsigned int linId2 = coordToLin(coord, stencilSupportRadius); unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN)); @@ -569,6 +622,9 @@ struct loadGhostBlock_impl<2,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize unsigned int coord[dim]; linToCoordWithOffset(threadIdx.x, stencilSupportRadius, coord); const int linId_b = coordToLin(coord, stencilSupportRadius); +// const unsigned int linId_b = shift_position::shift(threadIdx.x,stencilSupportRadius); + +// printf("AAA %d %d \n",linId_b,linId_b_test); unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN)); @@ -675,6 +731,9 @@ struct loadGhostBlock_impl<3,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize unsigned int coord[dim]; linToCoordWithOffset(threadIdx.x, stencilSupportRadius, coord); const int linId_b = coordToLin(coord, stencilSupportRadius); +// const unsigned int linId_b = shift_position::shift(threadIdx.x,stencilSupportRadius); + +// printf("AAA %d %d \n",linId_b,linId_b_test); unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN)); @@ -822,6 +881,9 @@ struct loadGhostBlock_impl<7,dim,AggregateBlockT,pMask,p,ct_params,blockEdgeSize unsigned int coord[dim]; linToCoordWithOffset(threadIdx.x, stencilSupportRadius, coord); const int linId_b = coordToLin(coord, stencilSupportRadius); +// const unsigned int linId_b = shift_position::shift(threadIdx.x,stencilSupportRadius); + +// printf("AAA %d %d \n",linId_b,linId_b_test); unsigned int nnb = nn_blocks.template get<0>(blockIdPos*ct_params::nNN + (threadIdx.x % ct_params::nNN)); diff --git a/src/SparseGridGpu/SparseGridGpu_kernels.cuh b/src/SparseGridGpu/SparseGridGpu_kernels.cuh index c114e444d10a063432671d7ab7c17e0e121b91df..33edebbf6987a2a555eda1053903086e3ced7739 100644 --- a/src/SparseGridGpu/SparseGridGpu_kernels.cuh +++ b/src/SparseGridGpu/SparseGridGpu_kernels.cuh @@ -29,6 +29,42 @@ enum mask_sparse // Kernels for SparseGridGpu namespace SparseGridGpuKernels { + template + __global__ void unpack_headers(pointers_type pointers, headers_type headers, int * result, unsigned int sz_pack, int n_slot) + { + int t = threadIdx.x; + + if (t > pointers.size()) {return;} + + unsigned char * data_pack = (unsigned char *)pointers.template get<0>(t); + + while (data_pack < pointers.template get<1>(t) ) + { + int ih = pointers.template get<2>(t); + if (n_slot > ih) + { + if (sizeof(typename SparseGridGpuType::indexT_) == 8) + {headers.template get<0>(t*n_slot + ih) = *(size_t *)data_pack;} + else + { + unsigned int dp1 = *(unsigned int *)data_pack; + unsigned int dp2 = *(unsigned int *)&(data_pack[4]); + headers.template get<0>(t*n_slot + ih) = (size_t)dp1 + (((size_t)dp2) << 32); + } + data_pack += sizeof(size_t); + data_pack += SparseGridGpuType::unpack_headers(headers,data_pack,t*n_slot + ih,sz_pack); + pointers.template get<2>(t) += 1; + } + else + { + // report error + result[0] = 1; + return; + } + } + } + + template struct stencil_cross_func_impl { @@ -810,8 +846,8 @@ namespace SparseGridGpuKernels { // Read local mask to register curMask = dataBlockLoad.template get()[offset]; - if (bx.isInsideKey(pointCoord) == false) - {curMask = 0;} + for (int i = 0 ; i < dim ; i++) + {curMask &= (pointCoord.get(i) < bx.getLow(i) || pointCoord.get(i) > bx.getHigh(i))?0:0xFF;} } openfpm::sparse_index sdataBlockPos; diff --git a/src/SparseGridGpu/tests/SparseGridGpu_tests.cu b/src/SparseGridGpu/tests/SparseGridGpu_tests.cu index fcc11696f4b98d49259bc119b19ba70c73b78d7a..06cb79a116519aeafefd4ddfbe83bc79bc89232c 100644 --- a/src/SparseGridGpu/tests/SparseGridGpu_tests.cu +++ b/src/SparseGridGpu/tests/SparseGridGpu_tests.cu @@ -499,10 +499,10 @@ BOOST_AUTO_TEST_CASE(testStencilHeat) for (unsigned int iter=0; iter>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1); - sparseGrid.applyStencils>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.1); + sparseGrid.applyStencils>(sparseGrid.getBox(),STENCIL_MODE_INPLACE, 0.01); } - sparseGrid.template deviceToHost<0>(); + sparseGrid.template deviceToHost<0,1>(); // Compare bool match = true; @@ -511,7 +511,7 @@ BOOST_AUTO_TEST_CASE(testStencilHeat) auto coord = sparseGrid.getCoord(i); float expectedValue = 10.0 * coord.get(0) / (gridSize.x * blockEdgeSize - 1); - match &= fabs(sparseGrid.template get<0>(coord) - expectedValue) < 1e-2; + match &= fabs(sparseGrid.template get<1>(coord) - expectedValue) < 1e-2; } diff --git a/src/SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh b/src/SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh index 101bab8ee7a74e3183215bd2a637113fc655d16c..295f4ce40044c9c127afb6a58a51f62178b359f3 100644 --- a/src/SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh +++ b/src/SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh @@ -27,7 +27,7 @@ struct BoundaryStencilSetX DataBlockWrapperT & dataBlockStore, unsigned char curMask) { - if (curMask & mask_sparse::EXIST_AND_PADDING) + if (curMask == mask_sparse::EXIST_AND_PADDING) { dataBlockStore.template get()[offset] = pointCoord.get(0); } @@ -55,9 +55,10 @@ struct BoundaryStencilSetXRescaled unsigned char curMask, ScalarT minX, ScalarT maxX, ScalarT minValue, ScalarT maxValue) { - if (curMask & mask_sparse::EXIST_AND_PADDING) + if (curMask == mask_sparse::EXIST_AND_PADDING) { const ScalarT x = pointCoord.get(0); + auto value = maxValue * (x - minX) / (maxX - minX - 1); if (x < minX) { diff --git a/src/SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh b/src/SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh index 7e74d82e7410d5fb84bde0f11a08dc77c944b5dd..6137706e4e8162a7c28b7f12725ae8a7ed6bf0b5 100644 --- a/src/SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh +++ b/src/SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh @@ -266,7 +266,17 @@ struct HeatStencil } // enlargedBlock[linId] = cur + dt * laplacian; res = cur + dt * laplacian; + + if (pointCoord.get(0) == 0 && pointCoord.get(1) == 8) + { + auto nPlusIdX = enlargedBlock[sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, 0, 1)]; + auto nMinusIdX = enlargedBlock[sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, 0, -1)]; + auto nPlusIdY = enlargedBlock[sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, 1, 1)]; + auto nMinusIdY = enlargedBlock[sparseGrid.getNeighbourLinIdInEnlargedBlock(coord, 1, -1)]; + printf("POINT STENCIL: %f %f %f %f %f DIFF: %f\n",nPlusIdX,nMinusIdX,nPlusIdY,nMinusIdY,cur,nPlusIdX+nMinusIdX+nPlusIdY+nMinusIdY-4.0*cur); + } } + __syncthreads(); if ((curMask & mask_sparse::EXIST) && !(curMask & mask_sparse::PADDING)) { diff --git a/src/timer.hpp b/src/timer.hpp index 64ea9a60f715c69d980f71ec471effb0640f81f2..5362fbc9bf1e989feabe8c2ecab8ffada3ab2827 100644 --- a/src/timer.hpp +++ b/src/timer.hpp @@ -42,6 +42,9 @@ class timer // Fill the stop point void check() { +#if defined(SYNC_BEFORE_TAKE_TIME) && defined(__NVCC__) + cudaDeviceSynchronize(); +#endif #ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time clock_serv_t cclock; @@ -75,6 +78,10 @@ public: // time is running running = true; +#if defined(SYNC_BEFORE_TAKE_TIME) && defined(__NVCC__) + cudaDeviceSynchronize(); +#endif + #ifdef __MACH__ // OS X does not have clock_gettime, use clock_get_time clock_serv_t cclock; mach_timespec_t mts; @@ -86,6 +93,7 @@ public: #else clock_gettime(CLOCK_REALTIME, &tsstart); #endif + cstart = clock(); }