@@ -24,7 +24,7 @@ if (ENABLE_GPU)
set(WARNING_SUPPRESSION_AND_OPTION_NVCC -Xcudafe "--display_error_number --diag_suppress=611 --diag_suppress=2885 --diag_suppress=2886 --diag_suppress=2887 --diag_suppress=2888 --diag_suppress=186 --diag_suppress=111" --expt-extended-lambda)
elseif ( CUDA_VERSION_MAJOR EQUAL 10 AND CUDA_VERSION_MINOR EQUAL 1 )
message("CUDA is compatible")
set(WARNING_SUPPRESSION_AND_OPTION_NVCC -Xcudafe "--display_error_number --diag_suppress=2915 --diag_suppress=2914 --diag_suppress=2912 --diag_suppress=2913 --diag_suppress=111 --diag_suppress=186 --diag_suppress=611 " --expt-extended-lambda)
set(WARNING_SUPPRESSION_AND_OPTION_NVCC -Xcudafe "--display_error_number --diag_suppress=2915 --diag_suppress=2914 --diag_suppress=2912 --diag_suppress=2913 --diag_suppress=111 --diag_suppress=186 --diag_suppress=611 --diag_suppress=1301 --diag_suppress=177 --diag_suppress=2928 --diag_suppress=2929 --diag_suppress=2930 --diag_suppress=2931" --expt-extended-lambda)
else()
message(FATAL_ERROR "CUDA is incompatible, only versions 9.2 and 10.1 are supported")
endif()
......
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
########################### Executables
if (TEST_PERFORMANCE)
set(CUDA_SOURCES SparseGridGpu/performance/Stencil_performance_tests.cu)
set(CUDA_SOURCES SparseGridGpu/performance/Stencil_performance_tests.cu SparseGridGpu/performance/performancePlots.hpp SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh)
endif ()
if (CUDA_FOUND)
@@ -22,8 +22,8 @@ if (CUDA_FOUND)
util/test/zmorton_unit_tests.cpp
util/cuda/test/segreduce_block_cuda_tests.cu
SparseGridGpu/tests/BlockMapGpu_kernels_tests.cu
SparseGridGpu/tests/SparseGridGpu_tests.cu
)
SparseGridGpu/tests/SparseGridGpu_tests.cu
SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh)
else ()
set(CUDA_SOURCES)
endif ()
@@ -66,14 +66,16 @@ if (CUDA_FOUND)
# target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} >)
# The below is for better performance on CUDA
# "-rdc=true" is for Dynamic Parallelism (cooperative groups)
target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math -gencode arch=compute_50,code=sm_50 -g -lineinfo >)
# target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math -gencode arch=compute_50,code=sm_50 -g -lineinfo >) # For Maxwell
target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math -gencode arch=compute_61,code=sm_61 -g -lineinfo >) # For Pascal
if (TEST_COVERAGE)
target_compile_options(mem_map PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "-fprofile-arcs -ftest-coverage" >)
endif ()
# target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} >)
# The below is for better performance on CUDA
# "-rdc=true" is for Dynamic Parallelism (cooperative groups)
target_compile_options(isolation PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math -gencode arch=compute_50,code=sm_50 -g -lineinfo >)
# target_compile_options(isolation PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math -gencode arch=compute_50,code=sm_50 -g -lineinfo >) # For Maxwell
target_compile_options(isolation PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math -gencode arch=compute_61,code=sm_61 -g -lineinfo >) # For Pascal
if (TEST_COVERAGE)
target_compile_options(isolation PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "-fprofile-arcs -ftest-coverage" >)
endif ()
......
@@ -60,10 +60,59 @@ public:
template<unsigned int p>
auto get(unsigned int linId) const -> const ScalarTypeOf<AggregateBlockT, p> &;
// auto insert(unsigned int linId) -> decltype(blockMap.insert(0));
/*! \brief insert data, host version
*
* \tparam p property id
*
* \param linId linearized id (block id combined with the offset inside the block)
*
* \return a reference to the data
*
*/
template<unsigned int p>
auto insert(unsigned int linId) -> ScalarTypeOf<AggregateBlockT, p> &
{
typedef BlockTypeOf<AggregateBlockT, p> BlockT;
unsigned int blockId = linId / BlockT::size;
unsigned int offset = linId % BlockT::size;
auto aggregate = blockMap.insert(blockId);
auto &block = aggregate.template get<p>();
auto &mask = aggregate.template get<pMask>();
setExist(mask[offset]);
return block[offset];
}
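The split of linId into a block id and an in-block offset is plain integer division and remainder; a tiny standalone check (plain C++, a block size of 64 assumed for illustration):
#include <cassert>

int main()
{
    const unsigned int blockSize = 64;   // stands in for BlockT::size
    const unsigned int linId = 130;      // third element of block 2
    assert(linId / blockSize == 2);      // blockId
    assert(linId % blockSize == 2);      // offset inside the block
}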
/*! \brief insert a block + flush, host version
*
* \tparam p property id
*
* \param blockId block id
*
* \return a reference to the block data
*
*/
template<unsigned int p>
auto insert(unsigned int linId) -> ScalarTypeOf<AggregateBlockT, p> &;
auto insertBlockFlush(size_t blockId) -> decltype(blockMap.insertFlush(blockId).template get<p>())
{
typedef BlockTypeOf<AggregateBlockT, p> BlockT;
auto aggregate = blockMap.insertFlush(blockId);
auto &block = aggregate.template get<p>();
return block;
}
/*! \brief insert a block + flush, host version
*
* \param blockId block id
*
* \return a reference to the block data
*
*/
auto insertBlockFlush(size_t blockId) -> decltype(blockMap.insertFlush(blockId))
{
return blockMap.insertFlush(blockId);
}
BlockMapGpu_ker<AggregateInternalT, indexT, layout_base> toKernel()
{
@@ -72,7 +121,10 @@ public:
}
template<unsigned int ... prp>
void deviceToHost();
void deviceToHost()
{
blockMap.template deviceToHost<prp..., pMask>();
}
void deviceToHost();
@@ -160,28 +212,6 @@ BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::get(unsigned
}
}
template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT, template<typename> class layout_base>
template<unsigned int p>
auto
BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::insert(unsigned int linId) -> ScalarTypeOf<AggregateBlockT, p> &
{
typedef BlockTypeOf<AggregateBlockT, p> BlockT;
unsigned int blockId = linId / BlockT::size;
unsigned int offset = linId % BlockT::size;
auto aggregate = blockMap.insert(blockId);
auto &block = aggregate.template get<p>();
auto &mask = aggregate.template get<pMask>();
setExist(mask[offset]);
return block[offset];
}
template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT, template<typename> class layout_base>
template<unsigned int ... prp>
void BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::deviceToHost()
{
blockMap.template deviceToHost<prp..., pMask>();
}
template<typename AggregateBlockT, unsigned int threadBlockSize, typename indexT, template<typename> class layout_base>
void BlockMapGpu<AggregateBlockT, threadBlockSize, indexT, layout_base>::deviceToHost()
{
......
@@ -757,12 +757,15 @@ namespace BlockMapGpuFunctors
// Create the output for the keys
keysOut.resize(data_out_size); // The final number of keys is one less than the number of segment boundaries
ite = keysOut.getGPUIterator();
ite = keys.getGPUIterator();
CUDA_LAUNCH(BlockMapGpuKernels::copyKeyToDstIndexIfPredicate,ite,keys.toKernel(), s_ids.toKernel(), keysOut.toKernel());
// the new keys are now in keysOut
// Phase 2 - segreduce on all properties
dataOut.reserve(data_out_size+1);
dataOut.resize(data_out_size); // Right size for output, i.e. the number of segments
typedef boost::mpl::vector<v_reduce...> vv_reduce;
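Phase 2's segmented reduction can be pictured with a minimal host-side sketch (plain C++, hypothetical names; segment s covers the half-open range [seg[s], seg[s+1]) of the data vector):
#include <vector>
#include <cassert>

template<typename T, typename Op>
std::vector<T> segreduce(const std::vector<T> & data, const std::vector<int> & seg, Op op)
{
    std::vector<T> out(seg.size() - 1);
    for (size_t s = 0; s + 1 < seg.size(); s++)
    {
        T acc = data[seg[s]];
        for (int i = seg[s] + 1; i < seg[s+1]; i++)
        {acc = op(acc, data[i]);}
        out[s] = acc;   // one output slot per segment
    }
    return out;
}

int main()
{
    std::vector<int> data = {1,2,3,4,5};
    std::vector<int> seg = {0,2,5};   // two segments: [0,2) and [2,5)
    auto out = segreduce(data, seg, [](int a, int b){return a + b;});
    assert(out[0] == 3 && out[1] == 12);
}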
......
@@ -36,6 +36,7 @@ public:
static constexpr unsigned int blockEdgeSize_ = blockEdgeSize;
unsigned int stencilSupportRadius;
typedef AggregateBlockT AggregateBlockType;
typedef indexT indexT_;
//! Indicate this structure has a function to check the device pointer
typedef int yes_has_check_device_pointer;
@@ -129,7 +130,7 @@ public:
return linId;
}
inline __device__ grid_key_dx<dim, int>
inline __device__ grid_key_dx<dim,int>
getCoordInEnlargedBlock(const unsigned int offset) const
{
unsigned int coord[dim];
@@ -145,8 +146,17 @@ public:
return coordToLin<blockEdgeSize>(coord, stencilSupportRadius);
}
template<typename Coordtype>
inline __device__ unsigned int
getNeighbourLinIdInEnlargedBlock(grid_key_dx<dim, int> base, unsigned int dimension, char offset) const
getNeighbourLinIdInEnlargedBlock(const grid_key_dx<dim, Coordtype> & base, grid_key_dx<dim, Coordtype> & offsets) const
{
grid_key_dx<dim, int> res = base + offsets;
return coordToLin<blockEdgeSize>(res, stencilSupportRadius);
}
template<typename Coordtype>
inline __device__ unsigned int
getNeighbourLinIdInEnlargedBlock(const grid_key_dx<dim,Coordtype> & base, unsigned int dimension, char offset) const
{
grid_key_dx<dim, int> res = getNeighbour(base, dimension, offset);
return coordToLin<blockEdgeSize>(res, stencilSupportRadius);
@@ -415,32 +425,13 @@ public:
unsetBit(bitMask, PADDING_BIT);
}
inline __device__ void getNeighboursPos(const indexT blockId, const unsigned int offset, int * neighboursPos)
template<typename NNtype>
inline __device__ indexT getNeighboursPos(const indexT blockId, const unsigned int offset)
{
//todo: also do the full neighbourhood version, this is just cross
auto blockCoord = getBlockCoord(blockId);
if (offset < 2*dim)
{
unsigned int d = offset/2;
int dPos = blockCoord.get(d) + (offset%2)*2 - 1;
blockCoord.set_d(d, dPos);
neighboursPos[offset] = this->blockMap.get_sparse(getBlockLinId(blockCoord)).id;
}
}
inline __device__ int getNeighboursPos(const indexT blockId, const unsigned int offset)
{
//todo: also do the full neighbourhood version, this is just cross
auto blockCoord = getBlockCoord(blockId);
int neighbourPos = -1;
if (offset < 2*dim)
{
unsigned int d = offset/2;
int dPos = blockCoord.get(d) + (offset%2)*2 - 1;
blockCoord.set_d(d, dPos);
neighbourPos = this->blockMap.get_sparse(getBlockLinId(blockCoord)).id;
}
return neighbourPos;
return NNtype::template getNNpos<indexT>(blockCoord,this->blockMap,*this,offset);
}
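The cross-neighbourhood offset convention used here (and by the removed overloads above) maps offset 2*d to the -1 direction along dimension d and offset 2*d+1 to +1; a small standalone check (plain C++):
#include <cassert>

int main()
{
    for (unsigned int offset = 0; offset < 6; offset++)   // dim = 3 -> 6 cross neighbours
    {
        unsigned int d = offset / 2;                      // dimension index
        int dir = (int)(offset % 2) * 2 - 1;              // -1 or +1 along that dimension
        assert(d < 3 && (dir == -1 || dir == 1));
    }
}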
#ifdef SE_CLASS1
......
@@ -20,4 +20,10 @@ struct IntPow<base, 0>
constexpr static size_t value = 1;
};
template <unsigned int numerator, unsigned int denominator>
struct UIntDivCeil
{
constexpr static unsigned int value = numerator / denominator + (numerator%denominator!=0);
};
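A quick compile-time check of UIntDivCeil (standalone, values chosen for illustration):
static_assert(UIntDivCeil<10, 4>::value == 3, "rounds up on remainder");
static_assert(UIntDivCeil<8, 4>::value == 2, "exact division adds nothing");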
#endif //OPENFPM_PDATA_MATHUTILS_HPP
//
// Created by tommaso on 15/8/19.
//
#ifndef OPENFPM_PDATA_SPARSEGRIDGPU_TESTKERNELS_CUH
#define OPENFPM_PDATA_SPARSEGRIDGPU_TESTKERNELS_CUH
/////////////////// BOUNDARY STENCILS ////////////////////////
template<unsigned int dim, unsigned int p_src, unsigned int p_dst>
struct BoundaryStencilSetX
{
// This is an example of a boundary stencil setting the value to the same value as the x coordinate
typedef NNStar<dim> stencil_type;
static constexpr unsigned int supportRadius = 1;
template<typename SparseGridT, typename DataBlockWrapperT>
static inline __device__ void stencil(
SparseGridT & sparseGrid,
const unsigned int dataBlockId,
openfpm::sparse_index<unsigned int> dataBlockIdPos,
unsigned int offset,
grid_key_dx<dim, int> & pointCoord,
DataBlockWrapperT & dataBlockLoad,
DataBlockWrapperT & dataBlockStore,
bool applyStencilHere)
{
if (applyStencilHere)
{
dataBlockStore.template get<p_dst>()[offset] = pointCoord.get(0);
}
}
};
template<unsigned int dim, unsigned int p_src, unsigned int p_dst, typename ScalarT = float>
struct BoundaryStencilSetXRescaled
{
// This is an example of a boundary stencil setting the value to the x coordinate rescaled to the [minValue, maxValue] range
typedef NNStar<dim> stencil_type;
static constexpr unsigned int supportRadius = 1;
template<typename SparseGridT, typename DataBlockWrapperT>
static inline __device__ void stencil(
SparseGridT & sparseGrid,
const unsigned int dataBlockId,
openfpm::sparse_index<unsigned int> dataBlockIdPos,
unsigned int offset,
grid_key_dx<dim, int> & pointCoord,
DataBlockWrapperT & dataBlockLoad,
DataBlockWrapperT & dataBlockStore,
bool applyStencilHere,
ScalarT minX, ScalarT maxX, ScalarT minValue, ScalarT maxValue)
{
if (applyStencilHere)
{
const ScalarT x = pointCoord.get(0);
auto value = maxValue * (x - minX) / (maxX - minX - 1);
if (x < minX)
{
value = minValue;
}
else if (x > maxX)
{
value = maxValue;
}
dataBlockStore.template get<p_dst>()[offset] = value;
}
}
};
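The rescaling expression can be exercised in isolation; a minimal standalone check (plain C++, values chosen so the float arithmetic is exact) of the formula as written above:
#include <cassert>

int main()
{
    const float minX = 0.0f, maxX = 11.0f, maxValue = 1.0f;
    const float x = 5.0f;
    const float value = maxValue * (x - minX) / (maxX - minX - 1);
    assert(value == 0.5f);   // x = 5 maps to the middle of the 0..10 span
}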
/////////////////// KERNELS ////////////////////////
template<unsigned int p, typename SparseGridType, typename ValueType>
__global__ void insertSphere(SparseGridType sparseGrid, grid_key_dx<2,int> start, float r1, float r2, ValueType value)
{
constexpr unsigned int pMask = SparseGridType::pMask;
typedef BlockTypeOf<typename SparseGridType::AggregateType, p> BlockT;
typedef BlockTypeOf<typename SparseGridType::AggregateType, pMask> MaskBlockT;
grid_key_dx<2,int> blk({
blockIdx.x + start.get(0) / sparseGrid.getBlockEdgeSize(),
blockIdx.y + start.get(1) / sparseGrid.getBlockEdgeSize()
});
unsigned int offset = threadIdx.x;
__shared__ bool is_block_empty;
if (threadIdx.x == 0 && threadIdx.y == 0)
{is_block_empty = true;}
sparseGrid.init();
auto blockId = sparseGrid.getBlockLinId(blk);
grid_key_dx<2,int> keyg;
keyg = sparseGrid.getGlobalCoord(blk,offset);
float radius = sqrt( (float)
(keyg.get(0) - (start.get(0) + gridDim.x/2*SparseGridType::blockEdgeSize_))
* (keyg.get(0) - (start.get(0) + gridDim.x/2*SparseGridType::blockEdgeSize_))
+ (keyg.get(1) - (start.get(1) + gridDim.y/2*SparseGridType::blockEdgeSize_))
* (keyg.get(1) - (start.get(1) + gridDim.y/2*SparseGridType::blockEdgeSize_)) );
bool is_active = radius < r1 && radius > r2;
if (is_active == true)
{
is_block_empty = false;
}
__syncthreads();
if (is_block_empty == false)
{
auto ec = sparseGrid.insertBlock(blockId);
if ( is_active == true)
{
ec.template get<p>()[offset] = value;
BlockMapGpu_ker<>::setExist(ec.template get<pMask>()[offset]);
}
}
__syncthreads();
sparseGrid.flush_block_insert();
}
template<unsigned int p, typename SparseGridType, typename ValueType>
__global__ void insertSphere3D(SparseGridType sparseGrid, grid_key_dx<3,int> start, float r1, float r2, ValueType value)
{
constexpr unsigned int pMask = SparseGridType::pMask;
typedef BlockTypeOf<typename SparseGridType::AggregateType, p> BlockT;
typedef BlockTypeOf<typename SparseGridType::AggregateType, pMask> MaskBlockT;
grid_key_dx<3,int> blk({
blockIdx.x + start.get(0) / sparseGrid.getBlockEdgeSize(),
blockIdx.y + start.get(1) / sparseGrid.getBlockEdgeSize(),
blockIdx.z + start.get(2) / sparseGrid.getBlockEdgeSize()});
unsigned int offset = threadIdx.x;
__shared__ bool is_block_empty;
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
{is_block_empty = true;}
sparseGrid.init();
auto blockId = sparseGrid.getBlockLinId(blk);
grid_key_dx<3,int> keyg;
keyg = sparseGrid.getGlobalCoord(blk,offset);
const long long x = keyg.get(0) - (start.get(0) + gridDim.x / 2 * SparseGridType::blockEdgeSize_);
const long long y = keyg.get(1) - (start.get(1) + gridDim.y / 2 * SparseGridType::blockEdgeSize_);
const long long z = keyg.get(2) - (start.get(2) + gridDim.z / 2 * SparseGridType::blockEdgeSize_);
float radius = sqrt((float) (x*x + y*y + z*z));
bool is_active = radius < r1 && radius > r2;
if (is_active == true)
{
is_block_empty = false;
}
__syncthreads();
if (is_block_empty == false)
{
auto ec = sparseGrid.insertBlock(blockId);
if ( is_active == true)
{
ec.template get<p>()[offset] = value;
// ec.template get<p>()[offset] = x;
BlockMapGpu_ker<>::setExist(ec.template get<pMask>()[offset]);
}
}
__syncthreads();
sparseGrid.flush_block_insert();
}
template<unsigned int p, typename SparseGridType, typename ValueType>
__global__ void insertSphere3D_radius(SparseGridType sparseGrid, grid_key_dx<3,int> start, float r1, float r2, ValueType value)
{
constexpr unsigned int pMask = SparseGridType::pMask;
typedef BlockTypeOf<typename SparseGridType::AggregateType, p> BlockT;
typedef BlockTypeOf<typename SparseGridType::AggregateType, pMask> MaskBlockT;
grid_key_dx<3,int> blk({
blockIdx.x + start.get(0) / sparseGrid.getBlockEdgeSize(),
blockIdx.y + start.get(1) / sparseGrid.getBlockEdgeSize(),
blockIdx.z + start.get(2) / sparseGrid.getBlockEdgeSize()});
unsigned int offset = threadIdx.x;
__shared__ bool is_block_empty;
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
{is_block_empty = true;}
sparseGrid.init();
auto blockId = sparseGrid.getBlockLinId(blk);
grid_key_dx<3,int> keyg;
keyg = sparseGrid.getGlobalCoord(blk,offset);
const long long x = keyg.get(0) - (start.get(0) + gridDim.x / 2 * SparseGridType::blockEdgeSize_);
const long long y = keyg.get(1) - (start.get(1) + gridDim.y / 2 * SparseGridType::blockEdgeSize_);
const long long z = keyg.get(2) - (start.get(2) + gridDim.z / 2 * SparseGridType::blockEdgeSize_);
float radius = sqrt((float) (x*x + y*y + z*z));
bool is_active = radius < r1 && radius > r2;
if (is_active == true)
{
is_block_empty = false;
}
__syncthreads();
if (is_block_empty == false)
{
auto ec = sparseGrid.insertBlock(blockId);
if ( is_active == true)
{
ec.template get<p>()[offset] = x+y+z;
BlockMapGpu_ker<>::setExist(ec.template get<pMask>()[offset]);
}
}
__syncthreads();
sparseGrid.flush_block_insert();
}
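All three insertion kernels above share the same block-level voting idiom; the following is a self-contained CUDA sketch of just that idiom (hypothetical kernel and buffers, not OpenFPM API):
__global__ void block_vote(const float *radius, bool *blockNeeded, float r1, float r2)
{
    __shared__ bool any_active;
    if (threadIdx.x == 0) {any_active = false;}
    __syncthreads();

    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const float rad = radius[tid];
    // every active thread raises the flag; all writers store the same value
    if (rad < r1 && rad > r2) {any_active = true;}
    __syncthreads();

    // after the barrier the whole block agrees on whether it must be allocated
    if (threadIdx.x == 0) {blockNeeded[blockIdx.x] = any_active;}
}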
#endif //OPENFPM_PDATA_SPARSEGRIDGPU_TESTKERNELS_CUH
@@ -795,9 +795,9 @@ namespace openfpm
* \param x element to search
* \param id returned position of the element in the index vector
*/
template<bool prefetch>
inline void _branchfree_search(Ti x, Ti & id) const
inline Ti _branchfree_search_nobck(Ti x, Ti & id) const
{
if (vct_index.size() == 0) {id = 0; return;}
if (vct_index.size() == 0) {id = 0; return -1;}
const Ti *base = &vct_index.template get<0>(0);
const Ti *end = (const Ti *)vct_index.template getPointer<0>() + vct_index.size();
Ti n = vct_data.size()-1;
@@ -815,7 +815,19 @@ namespace openfpm
int off = (*base < x);
id = base - &vct_index.template get<0>(0) + off;
Ti v = (base + off != end)?*(base + off):-1;
return (base + off != end)?*(base + off):-1;
}
/*! \brief search for the element x
*
* \param x element to search
* \param id on return, the position of x in the index vector; if x is not
*        present, id points to the background element (vct_data.size()-1)
*
*/
template<bool prefetch>
inline void _branchfree_search(Ti x, Ti & id) const
{
Ti v = _branchfree_search_nobck<prefetch>(x,id);
id = (x == v)?id:vct_data.size()-1;
}
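A self-contained sketch (plain C++, hypothetical helper name) of the branch-free binary search idea behind _branchfree_search_nobck: the range is halved with a conditional pointer bump instead of a taken/not-taken branch, which the compiler can lower to a cmov.
#include <cstddef>

template<typename Ti>
size_t branchfree_lower_bound(const Ti *arr, size_t n, Ti x)
{
    if (n == 0) {return 0;}
    const Ti *base = arr;
    while (n > 1)
    {
        const size_t half = n / 2;
        base = (base[half] < x) ? base + half : base;  // branch-free step
        n -= half;
    }
    // final adjustment: step over the last element if it is still smaller
    return (base - arr) + (*base < x);
}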
@@ -940,11 +952,11 @@ namespace openfpm
auto ite = vct_add_cont_index.getGPUIterator();
vct_add_data_reord.resize(n_ele);
// Now we reorder the data vector accordingly to the indexes
if (impl2 == VECTOR_SPARSE_STANDARD)
{
vct_add_data_reord.resize(n_ele);
CUDA_LAUNCH(reorder_vector_data,ite,vct_add_cont_index_map.toKernel(),vct_add_data_cont.toKernel(),vct_add_data_reord.toKernel());
}
@@ -996,7 +1008,11 @@ namespace openfpm
int n_ele_unique = vct_index_tmp4.template get<0>(vct_index_tmp4.size()-1);
vct_add_index_unique.resize(n_ele_unique);
vct_add_data_unique.resize(n_ele_unique);
if (impl2 == VECTOR_SPARSE_STANDARD)
{
vct_add_data_unique.resize(n_ele_unique);
}
CUDA_LAUNCH(
(construct_index_unique<0>),
@@ -1027,8 +1043,11 @@ namespace openfpm
// Do not delete this reserve
// Unfortunately all resize with DataBlocks are broken
vct_add_data_cont.reserve(vct_index.size() + vct_add_index_unique.size()+1);
vct_add_data_cont.resize(vct_index.size() + vct_add_index_unique.size());
if (impl2 == VECTOR_SPARSE_STANDARD)
{
vct_add_data_cont.reserve(vct_index.size() + vct_add_index_unique.size()+1);
vct_add_data_cont.resize(vct_index.size() + vct_add_index_unique.size());
}
ite = vct_add_index_unique.getGPUIterator();
vct_index_tmp4.resize(vct_add_index_unique.size());
@@ -1077,7 +1096,7 @@ namespace openfpm
// Now we can do a segmented reduction
scalar_block_implementation_switch<impl2, block_functor>
::template extendSegments<1>(vct_add_index_unique, vct_add_data_reord.size());
::template extendSegments<1>(vct_add_index_unique, vct_add_data_reord_map.size());
if (impl2 == VECTOR_SPARSE_STANDARD)
{
@@ -1547,6 +1566,9 @@ namespace openfpm
/*! \brief It inserts an element in the sparse vector
*
* \tparam p property id
*
* \param ele element id
*
*/
template <unsigned int p>
@@ -1560,6 +1582,62 @@ namespace openfpm
/*! \brief It inserts an element in the sparse vector
*
* \tparam p property id
*
* \param ele element id
*
*/
template <unsigned int p>
auto insertFlush(Ti ele) -> decltype(vct_data.template get<p>(0))
{
Ti di;
// first we have to search if the block exists
Ti v = _branchfree_search_nobck<true>(ele,di);
if (v == ele)
{
// block exists
return vct_data.template get<p>(di);
}
// It does not exist; create it. di contains the index where the new block must be inserted
vct_index.insert(di);
vct_data.insert(di);
vct_index.template get<0>(di) = ele;
return vct_data.template get<p>(di);
}
/*! \brief It inserts an element in the sparse vector
*
* \param ele element id
*
*/
auto insertFlush(Ti ele) -> decltype(vct_data.get(0))
{
Ti di;
// first we have to search if the block exists
Ti v = _branchfree_search_nobck<true>(ele,di);
if (v == ele)
{
// block exists
return vct_data.get(di);
}
// It does not exist; create it. di contains the index where the new block must be inserted
vct_index.insert(di);
vct_data.insert(di);
vct_index.template get<0>(di) = ele;
return vct_data.get(di);
}
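The two insertFlush overloads implement the same search-or-splice pattern; a self-contained sketch of it over std::vector (plain C++, hypothetical names):
#include <vector>
#include <algorithm>

template<typename Ti, typename T>
T & insert_flush(std::vector<Ti> & idx, std::vector<T> & data, Ti ele)
{
    auto it = std::lower_bound(idx.begin(), idx.end(), ele);
    size_t di = it - idx.begin();
    if (it != idx.end() && *it == ele)
    {return data[di];}                    // element exists: return its slot
    idx.insert(idx.begin() + di, ele);    // keep the index vector sorted
    data.insert(data.begin() + di, T());  // data stays aligned with the index
    return data[di];
}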
/*! \brief It inserts an element in the sparse vector
*
* \param ele element id
*
*/
auto insert(Ti ele) -> decltype(vct_data.get(0))
......
@@ -37,7 +37,7 @@ __global__ void construct_index_unique(vector_type2 vd_input, vector_type vd, v
{
int p = threadIdx.x + blockIdx.x * blockDim.x;
if (p >= (int)vd.size()) return;
if (p >= (int)vd.size() - 1) return;
unsigned int id = vd.template get<0>(p);
unsigned int id_p1 = vd.template get<0>(p+1);
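The tightened bound matters because the kernel reads both vd[p] and vd[p+1]: with the old guard the last thread dereferenced one element past the end of vd.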
......
@@ -102,7 +102,8 @@ static bool isFloat(const std::string &someString)
static void StandardXMLPerformanceGraph(std::string file_xml,
std::string file_xml_ref,
GoogleChart & cg)
GoogleChart & cg,
const double deviationMultiplier = 3.0)
{
// Create empty property tree object
boost::property_tree::ptree tree_measure;
@@ -124,7 +125,7 @@ static void StandardXMLPerformanceGraph(std::string file_xml,
// First we check for graphs
try
// try
{
boost::property_tree::ptree childs = tree_measure.get_child("graphs");
@@ -139,12 +140,14 @@ static void StandardXMLPerformanceGraph(std::string file_xml,
{
if (c.second.template get<std::string>("y.data(" + std::to_string(number) + ").title","") == "")
{break;}
yn.add(c.second.template get<std::string>("y.data(" + std::to_string(number) + ").title","line" + std::to_string(number)));
yn.add(c.second.template get<std::string>("y.data(" + std::to_string(number) + ").title",
"line" + std::to_string(number)));
yn.add("interval");
yn.add("interval");
number++;
}
bool is_log_x = c.second.template get<bool>("options.log_x",false);
bool is_log_y = c.second.template get<bool>("options.log_y",false);
// We process the graph
@@ -239,8 +242,8 @@ static void StandardXMLPerformanceGraph(std::string file_xml,
y.last().add(y_val);
x_ref.last().add(x_val_ref);
y_ref_dw.last().add(y_val_ref - 3.0 * y_val_dev_ref);
y_ref_up.last().add(y_val_ref + 3.0 * y_val_dev_ref);
y_ref_dw.last().add(y_val_ref - deviationMultiplier * y_val_dev_ref);
y_ref_up.last().add(y_val_ref + deviationMultiplier * y_val_dev_ref);
warning_set(warning_level,y_val,y_val_ref,y_val_dev_ref);
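With the default deviationMultiplier of 3.0 the band drawn around the reference curve stays the familiar three-sigma interval, so existing callers keep their previous behaviour while tests can now tighten or relax the warning band.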
@@ -255,16 +258,26 @@ static void StandardXMLPerformanceGraph(std::string file_xml,
std::string chart_area;
addchartarea(chart_area,warning_level);
opt.curveType = c.second.template get<std::string>("interpolation","function");
// opt.curveType = c.second.template get<std::string>("interpolation","none");
if (is_log_y == true)
if (is_log_x == true)
{
opt.more = GC_Y_LOG + "," + GC_ZOOM + chart_area;
opt.more = GC_X_LOG + "," + GC_ZOOM + chart_area;
}
else
{
opt.more = GC_ZOOM + chart_area;
}
if (is_log_y == true)
{
opt.more = GC_Y_LOG + "," + GC_ZOOM + chart_area;
}
else
{
opt.more = GC_ZOOM + chart_area;
}
opt.title = title;
opt.xAxis = x_title;
opt.yAxis = y_title;
@@ -354,10 +367,10 @@ static void StandardXMLPerformanceGraph(std::string file_xml,
}
}
}
catch (std::exception e)
{
std::cout << __FILE__ << ":" << __LINE__ << " Error: invalid xml for performance test " << e.what() << std::endl;
}
// catch (std::exception e)
// {
// std::cout << __FILE__ << ":" << __LINE__ << " Error: invalid xml for performance test " << e.what() << std::endl;
// }
}
......
@@ -67,7 +67,7 @@ inline __device__ __host__ void invlin_zid(size_t lin, grid_key_dx<2,T> & key)
key.set_d(1,y);
}
static const size_t B3[] = {0x9249249249249249, 0x30C30C30C30C30C3, 0xF00F00F00F00F00F, 0x0FFF000FFF000FFF,0xFFFF0000FFFFFFFF};
static const size_t S3[] = {2, 4, 8, 16, 32};
template<typename T>
@@ -77,23 +77,23 @@ inline __device__ __host__ size_t lin_zid(const grid_key_dx<3,T> & key)
size_t z = key.get(2);
size_t y = key.get(1);
x = (x | (x << S3[4])) & B3[4];
x = (x | (x << S3[3])) & B3[3];
x = (x | (x << S3[2])) & B3[2];
x = (x | (x << S3[1])) & B3[1];
x = (x | (x << S3[0])) & B3[0];
y = (y | (y << S3[4])) & B3[4];
y = (y | (y << S3[3])) & B3[3];
y = (y | (y << S3[2])) & B3[2];
y = (y | (y << S3[1])) & B3[1];
y = (y | (y << S3[0])) & B3[0];
z = (z | (z << S3[4])) & B3[4];
z = (z | (z << S3[3])) & B3[3];
z = (z | (z << S3[2])) & B3[2];
z = (z | (z << S3[1])) & B3[1];
z = (z | (z << S3[0])) & B3[0];
x = (x | (x << 32)) & 0xFFFF0000FFFFFFFF;
x = (x | (x << 16)) & 0x0FFF000FFF000FFF;
x = (x | (x << 8)) & 0xF00F00F00F00F00F;
x = (x | (x << 4)) & 0x30C30C30C30C30C3;
x = (x | (x << 2)) & 0x9249249249249249;
y = (y | (y << 32)) & 0xFFFF0000FFFFFFFF;
y = (y | (y << 16)) & 0x0FFF000FFF000FFF;
y = (y | (y << 8)) & 0xF00F00F00F00F00F;
y = (y | (y << 4)) & 0x30C30C30C30C30C3;
y = (y | (y << 2)) & 0x9249249249249249;
z = (z | (z << 32)) & 0xFFFF0000FFFFFFFF;
z = (z | (z << 16)) & 0x0FFF000FFF000FFF;
z = (z | (z << 8)) & 0xF00F00F00F00F00F;
z = (z | (z << 4)) & 0x30C30C30C30C30C3;
z = (z | (z << 2)) & 0x9249249249249249;
return x | (y << 1) | (z << 2);
}
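The interleave can be sanity-checked on the host; a minimal standalone sketch (plain C++, hypothetical helper part1by2 using the same masks and shifts as above) checking that x, y and z land on bits 0, 1 and 2 respectively:
#include <cstdint>
#include <cassert>

static uint64_t part1by2(uint64_t v)
{
    // spread the low bits of v so that consecutive bits end up 3 apart
    v = (v | (v << 32)) & 0xFFFF0000FFFFFFFFull;
    v = (v | (v << 16)) & 0x0FFF000FFF000FFFull;
    v = (v | (v << 8))  & 0xF00F00F00F00F00Full;
    v = (v | (v << 4))  & 0x30C30C30C30C30C3ull;
    v = (v | (v << 2))  & 0x9249249249249249ull;
    return v;
}

int main()
{
    // (x,y,z) = (1,0,0) -> bit 0, (0,1,0) -> bit 1, (0,0,1) -> bit 2
    assert((part1by2(1) | (part1by2(0) << 1) | (part1by2(0) << 2)) == 1);
    assert((part1by2(0) | (part1by2(1) << 1) | (part1by2(0) << 2)) == 2);
    assert((part1by2(0) | (part1by2(0) << 1) | (part1by2(1) << 2)) == 4);
}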
......