......@@ -62,7 +62,7 @@ compile
depcomp
missing
src/mem_map
**/performance/
#**/performance/
previous_measure
previous_measureg
previous_measurev
......
......@@ -10,6 +10,7 @@ set(SE_CLASS1 CACHE BOOL "Activate compilation with SE_CLASS1")
set(SE_CLASS2 CACHE BOOL "Activate compilation with SE_CLASS2")
set(SE_CLASS3 CACHE BOOL "Activate compilation with SE_CLASS3")
set(ENABLE_GPU CACHE BOOL "Disable the GPU code independently that a cuda compiler is found")
set(TEST_PERFORMANCE CACHE BOOL "Enable test performance")
set (CMAKE_CXX_STANDARD 11)
set (CMAKE_CUDA_STANDARD 11)
......@@ -29,7 +30,7 @@ if (ENABLE_GPU)
endif()
endif()
find_package(Boost 1.66.0 REQUIRED COMPONENTS unit_test_framework iostreams program_options)
find_package(Boost 1.66.0 REQUIRED COMPONENTS unit_test_framework iostreams program_options system filesystem)
find_package(LibHilbert REQUIRED)
......@@ -51,6 +52,10 @@ if(CUDA_FOUND)
set(DEFINE_CUDA_GPU "#define CUDA_GPU")
endif()
if(TEST_PERFORMANCE)
set(DEFINE_PERFORMANCE_TEST "#define PERFORMANCE_TEST")
endif()
if (Boost_FOUND)
set(DEFINE_HAVE_BOOST "#define HAVE_BOOST")
set(DEFINE_HAVE_BOOST_IOSTREAMS "#define HAVE_BOOST_IOSTREAMS")
......
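The TEST_PERFORMANCE hunk above only sets the CMake variable DEFINE_PERFORMANCE_TEST to the string "#define PERFORMANCE_TEST". A minimal sketch of how such a variable is typically consumed, assuming a configure_file()-generated header; the header name config.h and the guarded function below are illustrative assumptions, not shown in this diff:

// Hypothetical consumer of the generated config header: configure_file() is
// assumed to expand @DEFINE_PERFORMANCE_TEST@ into "#define PERFORMANCE_TEST"
// when TEST_PERFORMANCE=ON, and into an empty line otherwise.
#include "config.h"

void maybe_run_benchmarks()
{
#ifdef PERFORMANCE_TEST
    // benchmark-only code path, compiled only for performance builds
#endif
}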
......@@ -229,6 +229,12 @@ do
test_coverage)
conf_options="$conf_options -DTEST_COVERAGE=ON"
;;
scan_coverty)
conf_options="$conf_options -DSCAN_COVERTY=ON"
;;
test_performance)
conf_options="$conf_options -DTEST_PERFORMANCE=ON"
;;
gpu)
if [ x"$CXX" == x"" ]; then
conf_options="$conf_options"
......@@ -469,6 +475,9 @@ do
boost)
conf_options="$conf_options -DBOOST_ROOT=$ac_optarg"
;;
action_on_error)
conf_options="$conf_options -DACTION_ON_ERROR=$ac_optarg"
;;
mpivendor)
conf_options="$conf_options -DMPI_VENDOR=$ac_optarg"
;;
......
......@@ -2,8 +2,12 @@ cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
########################### Executables
if (TEST_PERFORMANCE)
set(CUDA_SOURCES SparseGridGpu/performance/Stencil_performance_tests.cu)
endif ()
if (CUDA_FOUND)
set(CUDA_SOURCES
set(CUDA_SOURCES ${CUDA_SOURCES}
Vector/vector_gpu_unit_tests.cu
Grid/cuda/cuda_grid_gpu_tests.cu
Vector/cuda/map_vector_cuda_funcs_tests.cu
......@@ -14,11 +18,12 @@ if (CUDA_FOUND)
Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu
Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu
NN/CellList/tests/CellDecomposer_gpu_ker_unit_test.cu
SparseGridGpu/BlockMapGpu.hpp
SparseGridGpu/BlockMapGpu_ker.cuh
SparseGridGpu/tests/BlockMapGpu_tests.cu
SparseGridGpu/DataBlock.cuh SparseGridGpu/BlockMapGpu_kernels.cuh
util/cuda/segreduce_block_cuda.cuh util/cuda/test/segreduce_block_cuda_tests.cu SparseGridGpu/BlockMapGpu_dimensionalityWrappers.cuh SparseGridGpu/tests/BlockMapGpu_kernels_tests.cu SparseGridGpu/SparseGridGpu.hpp SparseGridGpu/tests/SparseGridGpu_tests.cu SparseGridGpu/SparseGridGpu_ker.cuh SparseGridGpu/SparseGridGpu_kernels.cuh SparseGridGpu/BlockCache/BlockCache.cuh SparseGridGpu/tests/Stencil_performance_tests.cu)
util/test/zmorton_unit_tests.cpp
util/cuda/test/segreduce_block_cuda_tests.cu
SparseGridGpu/tests/BlockMapGpu_kernels_tests.cu
SparseGridGpu/tests/SparseGridGpu_tests.cu
)
else ()
set(CUDA_SOURCES)
endif ()
......@@ -34,8 +39,14 @@ add_executable(mem_map ../../openfpm_devices/src/Memleak_check.cpp
Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp
../../openfpm_devices/src/memory/HeapMemory.cpp
../../openfpm_devices/src/memory/PtrMemory.cpp
SparseGridGpu/Geometry/BlockGeometry.hpp
SparseGridGpu/Geometry/tests/BlockGeometry_tests.cpp SparseGridGpu/TemplateUtils/mathUtils.hpp)
SparseGridGpu/Geometry/tests/grid_smb_tests.cpp)
add_executable(isolation
../../openfpm_devices/src/memory/CudaMemory.cu
../../openfpm_devices/src/memory/HeapMemory.cpp
SparseGridGpu/tests/SparseGridGpu_tests.cu
isolation.cpp
)
if (CMAKE_COMPILER_IS_GNUCC)
target_compile_options(mem_map PRIVATE "-Wno-deprecated-declarations")
......@@ -46,11 +57,25 @@ endif ()
###########################
if (TEST_PERFORMANCE)
target_include_directories(mem_map PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../openfpm_io/src/)
target_include_directories(isolation PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../openfpm_io/src/)
endif ()
if (CUDA_FOUND)
# target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} >)
# The below is for better performance on CUDA
# "-rdc=true" is for Dynamic Parallelism (cooperative groups)
target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math -arch=sm_61 -g -lineinfo >)
# target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math -gencode arch=compute_50,code=sm_50 -g -lineinfo >) # For Maxwell
target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math -gencode arch=compute_61,code=sm_61 -g -lineinfo >) # For Pascal
if (TEST_COVERAGE)
target_compile_options(mem_map PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "-fprofile-arcs -ftest-coverage" >)
endif ()
# target_compile_options(mem_map PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} >)
# The below is for better performance on CUDA
# "-rdc=true" is for Dynamic Parallelism (cooperative groups)
# target_compile_options(isolation PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math -gencode arch=compute_50,code=sm_50 -g -lineinfo >) # For Maxwell
target_compile_options(isolation PUBLIC $<$<COMPILE_LANGUAGE:CUDA>: ${WARNING_SUPPRESSION_AND_OPTION_NVCC} -use_fast_math -gencode arch=compute_61,code=sm_61 -g -lineinfo >) # For Pascal
if (TEST_COVERAGE)
target_compile_options(isolation PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "-fprofile-arcs -ftest-coverage" >)
endif ()
......@@ -59,14 +84,24 @@ endif ()
target_include_directories(mem_map PUBLIC ${CUDA_INCLUDE_DIRS})
target_include_directories(mem_map PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_include_directories(mem_map PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../openfpm_devices/src/)
target_include_directories(mem_map PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../openfpm_io/src/)
target_include_directories(mem_map PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/config)
target_include_directories(mem_map PUBLIC ${LIBHILBERT_INCLUDE_DIRS})
target_include_directories(mem_map PUBLIC ${Boost_INCLUDE_DIRS})
target_include_directories(isolation PUBLIC ${CUDA_INCLUDE_DIRS})
target_include_directories(isolation PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_include_directories(isolation PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../openfpm_devices/src/)
target_include_directories(isolation PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/config)
target_include_directories(isolation PUBLIC ${LIBHILBERT_INCLUDE_DIRS})
target_include_directories(isolation PUBLIC ${Boost_INCLUDE_DIRS})
target_link_libraries(mem_map ${Boost_LIBRARIES})
target_link_libraries(mem_map -L${LIBHILBERT_LIBRARY_DIRS} ${LIBHILBERT_LIBRARIES})
target_link_libraries(isolation ${Boost_LIBRARIES})
target_link_libraries(isolation -L${LIBHILBERT_LIBRARY_DIRS} ${LIBHILBERT_LIBRARIES})
if (TEST_COVERAGE)
target_link_libraries(mem_map -lgcov)
endif ()
......@@ -267,6 +302,9 @@ install(FILES util/multi_array_openfpm/array_openfpm.hpp
util/multi_array_openfpm/types.hpp
DESTINATION openfpm_data/include/util/multi_array_openfpm)
install(DIRECTORY util/cuda/cub/
DESTINATION openfpm_data/include/util/cuda/cub )
install(FILES util/cuda/scan_cuda.cuh
util/cuda/ofp_context.hxx
util/cuda/cuda_launch.hpp
......
......@@ -691,7 +691,7 @@ public:
static const int max_prop = T::max_prop;
//! constructor requires a key and the memory data
encapc(typename memory_traits_inte<T>::type & data, size_t k)
__device__ __host__ encapc(typename memory_traits_inte<T>::type & data, size_t k)
:data(data),k(k)
{}
......@@ -702,7 +702,8 @@ public:
* \return The reference of the data
*
*/
template <unsigned int p> __device__ __host__ auto get() -> decltype(boost::fusion::at_c<p>(data).mem_r.operator[](k))
template <unsigned int p>
__device__ __host__ auto get() -> decltype(boost::fusion::at_c<p>(data).mem_r.operator[](k))
{
return boost::fusion::at_c<p>(data).mem_r.operator[](k);
}
......@@ -773,6 +774,23 @@ public:
return *this;
}
__device__ __host__ inline void private_set_data_k(Mem & data_c, size_t k)
{
this->data = data_c;
this->k = k;
}
__device__ __host__ inline Mem & private_get_data()
{
return data;
}
__device__ __host__ inline size_t private_get_k()
{
return k;
}
};
#include "util/common.hpp"
......
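The three private_* accessors added to encapc above expose the underlying storage reference and the linear element index. A hedged usage sketch (the helper below is an illustrative assumption; how the assignment behaves depends on how encapc stores its data member, which this diff does not show):

// Hypothetical helper: copy the storage handle and element index carried by
// one encapsulated object into another through the new private accessors.
template<typename encap_type>
__device__ __host__ void clone_position(encap_type & dst, encap_type & src)
{
    dst.private_set_data_k(src.private_get_data(), src.private_get_k());
}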
......@@ -593,41 +593,6 @@ public:
typedef typename boost::mpl::at<typename grid::value_type::type,boost::mpl::int_<T::value>>::type prp_type;
mp_funct_impl<grid::dims,prp_type>::template process<T::value>(gd_src,gd_dst,bx_src,bx_dst);
/* grid_key_dx<3> zero;
zero.zero();
grid_key_dx<3> one = zero;
one.set_d(1,1);
unsigned char * ptr_final_src = (unsigned char *)(gd_src.template get_address<prp>(one));
unsigned char * ptr_start_src = (unsigned char *)(gd_src.template get_address<prp>(zero));
unsigned char * ptr_final_dst = (unsigned char *)(gd_dst.template get_address<prp>(one));
unsigned char * ptr_start_dst = (unsigned char *)(gd_dst.template get_address<prp>(zero));
unsigned char * ptr_src = (unsigned char *)(gd_src.template get_address<prp>(bx_src.getKP1()));
unsigned char * ptr_dst = (unsigned char *)(gd_dst.template get_address<prp>(bx_dst.getKP1()));
size_t n_cpy = bx_src.getHigh(0) - bx_src.getLow(0) + 1;
size_t tot_y = bx_src.getHigh(1) - bx_src.getLow(1) + 1;
size_t stride_src_x = ptr_final_src - ptr_start_src;
size_t stride_dst_x = ptr_final_dst - ptr_start_dst;
grid_key_dx<3> one2 = zero;
one2.set_d(2,1);
ptr_final_src = (unsigned char *)(gd_src.template get_address<0>(one2));
ptr_start_src = (unsigned char *)(gd_src.template get_address<0>(zero));
ptr_final_dst = (unsigned char *)(gd_dst.template get_address<0>(one2));
ptr_start_dst = (unsigned char *)(gd_dst.template get_address<0>(zero));
size_t stride_src_y = ptr_final_src - ptr_start_src;
size_t stride_dst_y = ptr_final_dst - ptr_start_dst;
copy_ndim_fast_selector<grid::dims>::template call<T::value>(gs_src,gs_dst,bx_src,bx_dst,
gd_src,gd_dst,cnt);*/
}
};
......@@ -682,101 +647,6 @@ struct copy_grid_fast<false,3,grid,ginfo>
grid_key_dx<3> (& cnt)[1] )
{
copy_grid_fast_layout_switch<is_layout_inte<typename grid::layout_base_>::value,3,grid,ginfo>::copy(gs_src,gs_dst,bx_src,bx_dst,gd_src,gd_dst,cnt);
/* grid_key_dx<3> zero;
zero.zero();
grid_key_dx<3> one = zero;
one.set_d(1,1);
unsigned char * ptr_final_src = (unsigned char *)(gd_src.template get_address<0>(one));
unsigned char * ptr_start_src = (unsigned char *)(gd_src.template get_address<0>(zero));
unsigned char * ptr_final_dst = (unsigned char *)(gd_dst.template get_address<0>(one));
unsigned char * ptr_start_dst = (unsigned char *)(gd_dst.template get_address<0>(zero));
unsigned char * ptr_src = (unsigned char *)(gd_src.template get_address<0>(bx_src.getKP1()));
unsigned char * ptr_dst = (unsigned char *)(gd_dst.template get_address<0>(bx_dst.getKP1()));
size_t n_cpy = bx_src.getHigh(0) - bx_src.getLow(0) + 1;
size_t tot_y = bx_src.getHigh(1) - bx_src.getLow(1) + 1;
size_t stride_src_x = ptr_final_src - ptr_start_src;
size_t stride_dst_x = ptr_final_dst - ptr_start_dst;
grid_key_dx<3> one2 = zero;
one2.set_d(2,1);
ptr_final_src = (unsigned char *)(gd_src.template get_address<0>(one2));
ptr_start_src = (unsigned char *)(gd_src.template get_address<0>(zero));
ptr_final_dst = (unsigned char *)(gd_dst.template get_address<0>(one2));
ptr_start_dst = (unsigned char *)(gd_dst.template get_address<0>(zero));
size_t stride_src_y = ptr_final_src - ptr_start_src;
size_t stride_dst_y = ptr_final_dst - ptr_start_dst;
switch (n_cpy)
{
case 1:
copy_grid_fast_shortx_3<grid,1>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x,
stride_src_y,stride_dst_y,
tot_y);
break;
case 2:
copy_grid_fast_shortx_3<grid,2>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x,
stride_src_y,stride_dst_y,
tot_y);
break;
case 3:
copy_grid_fast_shortx_3<grid,3>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x,
stride_src_y,stride_dst_y,
tot_y);
break;
case 4:
copy_grid_fast_shortx_3<grid,4>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x,
stride_src_y,stride_dst_y,
tot_y);
break;
case 5:
copy_grid_fast_shortx_3<grid,5>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x,
stride_src_y,stride_dst_y,
tot_y);
break;
case 6:
copy_grid_fast_shortx_3<grid,6>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x,
stride_src_y,stride_dst_y,
tot_y);
break;
case 7:
copy_grid_fast_shortx_3<grid,7>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x,
stride_src_y,stride_dst_y,
tot_y);
break;
case 8:
copy_grid_fast_shortx_3<grid,8>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x,
stride_src_y,stride_dst_y,
tot_y);
break;
default:
copy_grid_fast_longx_3<grid>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x,
stride_src_y,stride_dst_y,
tot_y,n_cpy);
}*/
}
};
......@@ -800,70 +670,6 @@ struct copy_grid_fast<false,2,grid,ginfo>
grid_key_dx<2> (& cnt)[1] )
{
copy_grid_fast_layout_switch<is_layout_inte<typename grid::layout_base_>::value,2,grid,ginfo>::copy(gs_src,gs_dst,bx_src,bx_dst,gd_src,gd_dst,cnt);
/* grid_key_dx<2> zero;
zero.zero();
grid_key_dx<2> one = zero;
one.set_d(1,1);
unsigned char * ptr_final_src = (unsigned char *)(gd_src.template get_address<0>(one));
unsigned char * ptr_start_src = (unsigned char *)(gd_src.template get_address<0>(zero));
unsigned char * ptr_final_dst = (unsigned char *)(gd_dst.template get_address<0>(one));
unsigned char * ptr_start_dst = (unsigned char *)(gd_dst.template get_address<0>(zero));
unsigned char * ptr_src = (unsigned char *)&(gd_src.template get<0>(bx_src.getKP1()));
unsigned char * ptr_dst = (unsigned char *)&(gd_dst.template get<0>(bx_dst.getKP1()));
size_t n_cpy = bx_src.getHigh(0) - bx_src.getLow(0) + 1;
size_t stride_src_x = ptr_final_src - ptr_start_src;
size_t stride_dst_x = ptr_final_dst - ptr_start_dst;
switch (n_cpy)
{
case 1:
copy_grid_fast_shortx_2<grid,1>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x);
break;
case 2:
copy_grid_fast_shortx_2<grid,2>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x);
break;
case 3:
copy_grid_fast_shortx_2<grid,3>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x);
break;
case 4:
copy_grid_fast_shortx_2<grid,4>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x);
break;
case 5:
copy_grid_fast_shortx_2<grid,5>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x);
break;
case 6:
copy_grid_fast_shortx_2<grid,6>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x);
break;
case 7:
copy_grid_fast_shortx_2<grid,7>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x);
break;
case 8:
copy_grid_fast_shortx_2<grid,8>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x);
break;
default:
copy_grid_fast_longx_2<grid>(bx_src,ptr_dst,ptr_src,
stride_src_x,stride_dst_x,
n_cpy);
}*/
}
};
......@@ -1156,7 +962,7 @@ struct pack_with_iterator_shortx<3,n_cpy,obj_byte,git,grid>
grid_key_dx<3> one = zero;
one.set_d(2,1);
unsigned char * ptr_final = (unsigned char *)(&gr.template get<0>(one));
unsigned char * ptr_final = (unsigned char *)(&gr.template get_unsafe<0>(one));
unsigned char * ptr_start = (unsigned char *)(&gr.template get<0>(zero));
size_t stride_y = ptr_final - ptr_start;
......@@ -1225,7 +1031,7 @@ struct pack_with_iterator<false,dim,grid,encap_src,encap_dst,boost_vct,it,dtype,
grid_key_dx<dim> one = zero;
one.set_d(1,1);
unsigned char * ptr_final = (unsigned char *)(&gr.template get<0>(one));
unsigned char * ptr_final = (unsigned char *)(&gr.template get_unsafe<0>(one));
unsigned char * ptr_start = (unsigned char *)(&gr.template get<0>(zero));
size_t stride = ptr_final - ptr_start;
......
......@@ -19,7 +19,7 @@ struct copy_ndim_grid_impl
{
unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= src.getGrid().size())
if (i >= src.getGrid().size() || i >= dst.getGrid().size())
{return;}
auto key_src = src.getGrid().InvLinId(i);
......@@ -40,6 +40,9 @@ struct copy_ndim_grid_impl<2,grid_type>
if (key_src.get(0) >= src.getGrid().size(0)) {return;}
if (key_src.get(1) >= src.getGrid().size(1)) {return;}
if (key_src.get(0) >= dst.getGrid().size(0)) {return;}
if (key_src.get(1) >= dst.getGrid().size(1)) {return;}
dst.get_o(key_src) = src.get_o(key_src);
}
};
......@@ -58,6 +61,10 @@ struct copy_ndim_grid_impl<3,grid_type>
if (key_src.get(1) >= src.getGrid().size(1)) {return;}
if (key_src.get(2) >= src.getGrid().size(2)) {return;}
if (key_src.get(0) >= dst.getGrid().size(0)) {return;}
if (key_src.get(1) >= dst.getGrid().size(1)) {return;}
if (key_src.get(2) >= dst.getGrid().size(2)) {return;}
dst.get_o(key_src) = src.get_o(key_src);
}
};
......
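The extra checks added above make the element-wise copy kernels bail out when an index is valid in the source but not in the destination, preventing out-of-bounds writes when the two grids have different sizes. A minimal sketch of the guarded pattern (a standalone hypothetical kernel, not the library's copy_ndim_grid_impl itself):

// Guarded element-wise grid copy: a thread proceeds only if its linear index
// is valid in BOTH grids, mirroring the bounds checks introduced above.
template<typename grid_type>
__global__ void copy_guarded(grid_type src, grid_type dst)
{
    unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i >= src.getGrid().size() || i >= dst.getGrid().size())
    {return;}

    auto key = src.getGrid().InvLinId(i);
    dst.get_o(key) = src.get_o(key);
}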
......@@ -12,9 +12,11 @@
#include "util/cuda_util.hpp"
#include "cuda/cuda_grid_gpu_funcs.cuh"
#include "util/create_vmpl_sequence.hpp"
#include "util/cuda/cuda_launch.hpp"
#define DATA_ON_HOST 32
#define DATA_ON_DEVICE 64
constexpr int DATA_ON_HOST = 32;
constexpr int DATA_ON_DEVICE = 64;
constexpr int EXACT_RESIZE = 128;
template<bool np,typename T>
struct skip_init
......@@ -428,7 +430,7 @@ private:
for (size_t i = 0 ; i < dim ; i++)
{
start.set_d(i,0);
stop.set_d(i,g1.size(i)-1);
stop.set_d(i,sz[i]-1);
}
// if (dim == 1)
......@@ -443,7 +445,9 @@ private:
bool has_work = has_work_gpu(ite);
if (has_work == true)
{copy_ndim_grid_device<dim,decltype(grid_new.toKernel())><<<ite.wthr,ite.thr>>>(this->toKernel(),grid_new.toKernel());}
{
CUDA_LAUNCH((copy_ndim_grid_device<dim,decltype(grid_new.toKernel())>),ite,this->toKernel(),grid_new.toKernel());
}
}
else
{
......@@ -459,7 +463,7 @@ private:
auto ite = getGPUIterator_impl<1>(g_sm_copy,start,stop);
copy_ndim_grid_device<dim,decltype(grid_new.toKernel())><<<ite.wthr,ite.thr>>>(this->toKernel(),grid_new.toKernel());
CUDA_LAUNCH((copy_ndim_grid_device<dim,decltype(grid_new.toKernel())>),ite,this->toKernel(),grid_new.toKernel());
}
#else
......
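The raw <<<ite.wthr,ite.thr>>> launches above are replaced by the CUDA_LAUNCH macro pulled in from util/cuda/cuda_launch.hpp. The macro body is not part of this diff; the sketch below shows what a wrapper of this shape typically expands to and is an assumption, not the library's definition (host-side printf additionally assumes <cstdio> is included):

// Hypothetical CUDA_LAUNCH-style wrapper: launch the kernel with the
// block/grid configuration carried by "ite", then surface launch errors.
#define CUDA_LAUNCH_SKETCH(kernel, ite, ...)                               \
    do {                                                                   \
        kernel<<<(ite).wthr, (ite).thr>>>(__VA_ARGS__);                    \
        cudaError_t err_ = cudaGetLastError();                             \
        if (err_ != cudaSuccess)                                           \
        { printf("CUDA launch error: %s\n", cudaGetErrorString(err_)); }   \
    } while (0)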
......@@ -24,7 +24,8 @@ public:
* \param exp grid_key_dx expression
*
*/
template<typename exp1> inline grid_key_dx(const grid_key_dx_expression<dim,exp1> & exp)
template<typename exp1>
__device__ __host__ inline grid_key_dx(const grid_key_dx_expression<dim,exp1> & exp)
{
for (size_t i = 0 ; i < dim ; i++)
this->k[i] = exp.value(i);
......@@ -243,9 +244,10 @@ public:
* \return a grid_key_dx_expression that encapsulate the expression
*
*/
inline grid_key_dx_sum<dim,grid_key_dx<dim>,grid_key_dx<dim>> operator+(const grid_key_dx<dim> & p) const
__device__ __host__ inline grid_key_dx_sum<dim,grid_key_dx<dim,index_type>,grid_key_dx<dim,index_type>>
operator+(const grid_key_dx<dim,index_type> & p) const
{
grid_key_dx_sum<dim,grid_key_dx<dim>,grid_key_dx<dim>> exp_sum(*this,p);
grid_key_dx_sum<dim,grid_key_dx<dim,index_type>,grid_key_dx<dim,index_type>> exp_sum(*this,p);
return exp_sum;
}
......@@ -257,7 +259,8 @@ public:
* \return a grid_key_dx_expression that encapsulate the expression
*
*/
inline grid_key_dx_sum<dim,grid_key_dx<dim>,Point<dim,long int>> operator+(const Point<dim,long int> & p) const
inline grid_key_dx_sum<dim,grid_key_dx<dim>,Point<dim,long int>>
operator+(const Point<dim,long int> & p) const
{
grid_key_dx_sum<dim,grid_key_dx<dim>,Point<dim,long int>> exp_sum(*this,p);
......
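With operator+ now marked __device__ __host__ and carrying the key's index_type, key arithmetic can be used inside device code. A short usage sketch (it assumes the grid_key_dx header is included; the sum is an expression object that the __device__ __host__ expression constructor shown above evaluates component by component):

// Shift a 3D key by an offset on either host or device.
__device__ __host__ grid_key_dx<3> shift_key(const grid_key_dx<3> & k,
                                             const grid_key_dx<3> & offset)
{
    grid_key_dx<3> out = k + offset;   // grid_key_dx_sum expression, then copy
    return out;
}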
/*
* grid_performance_tests.hpp
*
* Created on: Nov 1, 2015
* Author: i-bird
*/
#ifndef OPENFPM_DATA_SRC_GRID_GRID_PERFORMANCE_TESTS_HPP_
#define OPENFPM_DATA_SRC_GRID_GRID_PERFORMANCE_TESTS_HPP_
#include "grid_util_test.hpp"
openfpm::vector<std::string> testsg;
openfpm::vector<float> per_timesg;
BOOST_AUTO_TEST_CASE(grid_performance_set_obj)
{
size_t sz[] = {128,128,128};
grid_cpu<3, Point_test<float> > c3(sz);
c3.setMemory();
fill_grid<3>(c3);
Point_test<float> f __attribute__((aligned(16)));
f.fill();
std::vector<double> times(N_STAT + 1);
times[0] = 1000;
for (size_t j = 0 ; j < 8 ; j++)
{
for (size_t i = 1 ; i < N_STAT+1 ; i++)
{
timer t;
t.start();
auto it = c3.getIterator();
while (it.isNext())
{
c3.set(it.get(),f);
++it;
}
t.stop();
times[i] = t.getwct();
}
std::sort(times.begin(),times.end());
sleep(5);
}
testsg.add("Grid so");
per_timesg.add(times[0]);
}
BOOST_AUTO_TEST_CASE(grid_performance_set_other_grid)
{
size_t sz[] = {128,128,128};
grid_cpu<3, Point_test<float> > c3(sz);
c3.setMemory();
fill_grid<3>(c3);
grid_cpu<3, Point_test<float> > c1(sz);
c1.setMemory();
std::vector<double> times(N_STAT + 1);
times[0] = 1000;
for (size_t j = 0 ; j < 8 ; j++)
{
for (size_t i = 1 ; i < N_STAT+1 ; i++)
{
timer t;
t.start();
auto it = c3.getIterator();
while (it.isNext())
{
c3.set(it.get(),c1,it.get());
++it;
}
t.stop();
times[i] = t.getwct();
}
std::sort(times.begin(),times.end());
sleep(5);
}
testsg.add("Grid sog");
per_timesg.add(times[0]);
}
BOOST_AUTO_TEST_CASE(grid_performance_set_other_grid_encap)
{
size_t sz[] = {128,128,128};
grid_cpu<3, Point_test<float> > c3(sz);
c3.setMemory();
fill_grid<3>(c3);
grid_cpu<3, Point_test<float> > c1(sz);
c1.setMemory();
std::vector<double> times(N_STAT + 1);
times[0] = 1000;
for (size_t j = 0 ; j < 8 ; j++)
{
for (size_t i = 1 ; i < N_STAT+1 ; i++)
{
timer t;
t.start();
auto it = c3.getIterator();
while (it.isNext())
{
c3.set(it.get(),c1.get_o(it.get()));
++it;
}
t.stop();
times[i] = t.getwct();
}
std::sort(times.begin(),times.end());
sleep(5);
}
testsg.add("Grid soge");
per_timesg.add(times[0]);
}
BOOST_AUTO_TEST_CASE(grid_performance_duplicate)
{
size_t sz[] = {128,128,128};
grid_cpu<3, Point_test<float> > c3(sz);
c3.setMemory();
fill_grid<3>(c3);
grid_cpu<3, Point_test<float> > c1;
std::vector<double> times(N_STAT_SMALL + 1);
times[0] = 1000;
for (size_t j = 0 ; j < 8 ; j++)
{
for (size_t i = 1 ; i < N_STAT_SMALL+1 ; i++)
{
timer t;
t.start();
c1 = c3.duplicate();
t.stop();
times[i] = t.getwct();
}
std::sort(times.begin(),times.end());
sleep(5);
}
testsg.add("Grid dup");
per_timesg.add(times[0]);
}
/////// THIS IS NOT A TEST, IT WRITES THE PERFORMANCE RESULT ///////
BOOST_AUTO_TEST_CASE(grid_performance_write_report)
{
openfpm::vector<std::string> yn;
openfpm::vector<openfpm::vector<float>> y;
// Get the directory of the performance test files
std::string per_dir(test_dir);
// Reference time
openfpm::vector<openfpm::vector<float>> y_ref;
y_ref.load(per_dir + std::string("/openfpm_data/ref_timesg"));
load_and_combine(per_dir + std::string("/openfpm_data/previous_measureg"),y,per_timesg);
// Adding the dataset names
if (y.size() != 0)
{
for (size_t j = 0; j < y.get(0).size(); j++)
yn.add("config " + std::to_string(j));
}
// Google charts options
GCoptions options;
options.title = std::string("Grid Performances");
options.yAxis = std::string("Time (seconds)");
options.xAxis = std::string("Benchmark");
options.stype = std::string("bars");
std::stringstream g_test_desc;
g_test_desc << "<h2>Grid performance test</h2>\n";
g_test_desc << "<strong>128x128x128 Grid containing a Point_test<float></strong><br>";
g_test_desc << "<strong>Grid so:</strong> Initialize each element of the grid<br>";
g_test_desc << "<strong>Grid sog:</strong> Manual copy of two grids<br>";
g_test_desc << "<strong>Grid soge:</strong> Manual copy of two grids in a different way<br>";
g_test_desc << "<strong>Grid dup:</strong> Duplication of the grid (Duplication include grid creation time)<br>";
cg.addHTML(g_test_desc.str());
cg.AddHistGraph(testsg,y,yn,options);
// compare the reference times with the actual times
// calculate speed-up
openfpm::vector<openfpm::vector<float>> y_ref_sup;
speedup_calculate(y_ref_sup,y,y_ref,yn);
std::stringstream g_test_spdesc;
g_test_spdesc << "<h2>Grid speedup</h2>\n";
g_test_spdesc << "The previous tests are compared with the best performances ever registered, ";
g_test_spdesc << "the banded area indicate the upper and lower bounds of the best registrered performances.<br>";
g_test_spdesc << "The lines are the latest 5 tests<br>";
g_test_spdesc << "<strong>Line inside the area</strong>: The tested configuration has no improvement or degradation in performance<br>";
g_test_spdesc << "<strong>Line break the upper bound</strong>: The tested configuration has improvement in performance<br>";
g_test_spdesc << "<strong>Line break the lower bound</strong>: The tested configuration has degradation in performance<br>";
g_test_spdesc << "<strong>Y axis:</strong> Performance change in percentage from the average of the best registered performances<br>";
cg.addHTML(g_test_spdesc.str());
cg.AddLinesGraph(testsg,y_ref_sup,yn,options);
}
#endif /* OPENFPM_DATA_SRC_GRID_GRID_PERFORMANCE_TESTS_HPP_ */
......@@ -336,6 +336,30 @@ public:
{
return background;
}
/*! \brief assign operator
*
* \return itself
*
*/
grid_cpu<dim,T,S,typename memory_traits_lin<T>::type> & operator=(const grid_base_impl<dim,T,S,typename memory_traits_lin<T>::type, memory_traits_lin> & base)
{
grid_base_impl<dim,T,S,typename memory_traits_lin<T>::type, memory_traits_lin>::operator=(base);
return *this;
}
/*! \brief assign operator
*
* \return itself
*
*/
grid_cpu<dim,T,S,typename memory_traits_lin<T>::type> & operator=(grid_base_impl<dim,T,S,typename memory_traits_lin<T>::type, memory_traits_lin> && base)
{
grid_base_impl<dim,T,S,typename memory_traits_lin<T>::type, memory_traits_lin>::operator=((grid_base_impl<dim,T,S,typename memory_traits_lin<T>::type, memory_traits_lin> &&)base);
return *this;
}
};
......@@ -610,6 +634,16 @@ public:
{
}
/*! \brief create a grid from another grid
*
* \param g the grid to copy
*
*/
inline grid_cpu(grid_cpu && g) THROW
:grid_base_impl<dim,T,S,layout,memory_traits_inte>(g)
{
}
/*! \brief create a grid of size sz on each direction
*
* \param sz grid size in each direction
......@@ -774,6 +808,30 @@ public:
{
return background;
}
/*! \brief assign operator
*
* \return itself
*
*/
grid_cpu<dim,T,S,typename memory_traits_inte<T>::type> & operator=(const grid_base_impl<dim,T,S,typename memory_traits_inte<T>::type, memory_traits_inte> & base)
{
grid_base_impl<dim,T,S,typename memory_traits_inte<T>::type, memory_traits_inte>::operator=(base);
return *this;
}
/*! \brief assign operator
*
* \return itself
*
*/
grid_cpu<dim,T,S,typename memory_traits_inte<T>::type> & operator=(grid_base_impl<dim,T,S,typename memory_traits_inte<T>::type, memory_traits_inte> && base)
{
grid_base_impl<dim,T,S,typename memory_traits_inte<T>::type, memory_traits_inte>::operator=((grid_base_impl<dim,T,S,typename memory_traits_inte<T>::type, memory_traits_inte> &&)base);
return *this;
}
};
//! short formula for a grid on gpu
......
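The operator= overloads added to both grid_cpu specializations accept a grid_base_impl with the matching layout directly. A hedged usage sketch, following the pattern of the performance tests elsewhere in this diff (the include paths are assumptions; whether duplicate() returns the base implementation type is not shown here, but assigning its result to a grid_cpu is exactly what those tests do):

#include "Grid/map_grid.hpp"   // path assumed; provides grid_cpu
#include "Point_test.hpp"      // path assumed; test aggregate used throughout this diff

void duplicate_and_assign()
{
    size_t sz[3] = {16,16,16};
    grid_cpu<3, Point_test<float>> g1(sz);
    g1.setMemory();

    grid_cpu<3, Point_test<float>> g2;
    g2 = g1.duplicate();   // handled by the operator= overloads added above
}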
/*
* grid_performance_tests.hpp
*
* Created on: Nov 1, 2015
* Author: i-bird
*/
#ifndef OPENFPM_DATA_SRC_GRID_GRID_PERFORMANCE_TESTS_HPP_
#define OPENFPM_DATA_SRC_GRID_GRID_PERFORMANCE_TESTS_HPP_
#include "Grid/grid_util_test.hpp"
#include "util/stat/common_statistics.hpp"
// Property tree
struct report_grid_copy_func_tests
{
boost::property_tree::ptree graphs;
};
report_grid_copy_func_tests report_grid_funcs;
BOOST_AUTO_TEST_SUITE( grid_performance )
BOOST_AUTO_TEST_CASE(grid_performance_set_obj)
{
size_t sz[] = {128,128,128};
report_grid_funcs.graphs.put("performance.grid.set(0).grid.x",sz[0]);
report_grid_funcs.graphs.put("performance.grid.set(0).grid.y",sz[1]);
report_grid_funcs.graphs.put("performance.grid.set(0).grid.z",sz[2]);
grid_cpu<3, Point_test<float> > c3(sz);
c3.setMemory();
fill_grid<3>(c3);
Point_test<float> f __attribute__((aligned(16)));
f.fill();
std::vector<double> times(N_STAT + 1);
for (size_t i = 0 ; i < N_STAT+1 ; i++)
{
timer t;
t.start();
auto it = c3.getIterator();
while (it.isNext())
{
c3.set(it.get(),f);
++it;
}
t.stop();
times[i] = t.getwct();
}
double mean;
double dev;
standard_deviation(times,mean,dev);
report_grid_funcs.graphs.put("performance.grid.set(0).x.data.name","Grid_so");
report_grid_funcs.graphs.put("performance.grid.set(0).y.data.mean",mean);
report_grid_funcs.graphs.put("performance.grid.set(0).y.data.dev",dev);
}
BOOST_AUTO_TEST_CASE(grid_performance_set_other_grid)
{
size_t sz[] = {128,128,128};
report_grid_funcs.graphs.put("performance.grid.set(1).grid.x",sz[0]);
report_grid_funcs.graphs.put("performance.grid.set(1).grid.y",sz[1]);
report_grid_funcs.graphs.put("performance.grid.set(1).grid.z",sz[2]);
grid_cpu<3, Point_test<float> > c3(sz);
c3.setMemory();
fill_grid<3>(c3);
grid_cpu<3, Point_test<float> > c1(sz);
c1.setMemory();
std::vector<double> times(N_STAT + 1);
for (size_t i = 0 ; i < N_STAT+1 ; i++)
{
timer t;
t.start();
auto it = c3.getIterator();
while (it.isNext())
{
c3.set(it.get(),c1,it.get());
++it;
}
t.stop();
times[i] = t.getwct();
}
std::sort(times.begin(),times.end());
double mean;
double dev;
standard_deviation(times,mean,dev);
report_grid_funcs.graphs.put("performance.grid.set(1).x.data.name","Grid_sog");
report_grid_funcs.graphs.put("performance.grid.set(1).y.data.mean",mean);
report_grid_funcs.graphs.put("performance.grid.set(1).y.data.dev",dev);
}
BOOST_AUTO_TEST_CASE(grid_performance_set_other_grid_encap)
{
size_t sz[] = {128,128,128};
report_grid_funcs.graphs.put("performance.grid.set(2).grid.x",sz[0]);
report_grid_funcs.graphs.put("performance.grid.set(2).grid.y",sz[1]);
report_grid_funcs.graphs.put("performance.grid.set(2).grid.z",sz[2]);
grid_cpu<3, Point_test<float> > c3(sz);
c3.setMemory();
fill_grid<3>(c3);
grid_cpu<3, Point_test<float> > c1(sz);
c1.setMemory();
std::vector<double> times(N_STAT + 1);
for (size_t i = 0 ; i < N_STAT+1 ; i++)
{
timer t;
t.start();
auto it = c3.getIterator();
while (it.isNext())
{
c3.set(it.get(),c1.get_o(it.get()));
++it;
}
t.stop();
times[i] = t.getwct();
}
double mean;
double dev;
standard_deviation(times,mean,dev);
report_grid_funcs.graphs.put("performance.grid.set(2).x.data.name","Grid_soge");
report_grid_funcs.graphs.put("performance.grid.set(2).y.data.mean",mean);
report_grid_funcs.graphs.put("performance.grid.set(2).y.data.dev",dev);
}
BOOST_AUTO_TEST_CASE(grid_performance_duplicate)
{
size_t sz[] = {128,128,128};
report_grid_funcs.graphs.put("performance.grid.set(3).grid.x",sz[0]);
report_grid_funcs.graphs.put("performance.grid.set(3).grid.y",sz[1]);
report_grid_funcs.graphs.put("performance.grid.set(3).grid.z",sz[2]);
grid_cpu<3, Point_test<float> > c3(sz);
c3.setMemory();
fill_grid<3>(c3);
grid_cpu<3, Point_test<float> > c1;
std::vector<double> times(N_STAT_SMALL + 1);
for (size_t i = 0 ; i < N_STAT_SMALL+1 ; i++)
{
timer t;
t.start();
c1 = c3.duplicate();
t.stop();
times[i] = t.getwct();
}
double mean;
double dev;
standard_deviation(times,mean,dev);
report_grid_funcs.graphs.put("performance.grid.set(3).x.data.name","Grid_dup");
report_grid_funcs.graphs.put("performance.grid.set(3).y.data.mean",mean);
report_grid_funcs.graphs.put("performance.grid.set(3).y.data.dev",dev);
}
/////// THIS IS NOT A TEST, IT WRITES THE PERFORMANCE RESULT ///////
BOOST_AUTO_TEST_CASE(grid_performance_write_report)
{
// Create a graph
report_grid_funcs.graphs.put("graphs.graph(0).type","line");
report_grid_funcs.graphs.add("graphs.graph(0).title","Grid set functions (so/sog/soge) and duplicate (dup) performance");
report_grid_funcs.graphs.add("graphs.graph(0).x.title","Tests");
report_grid_funcs.graphs.add("graphs.graph(0).y.title","Time seconds");
report_grid_funcs.graphs.add("graphs.graph(0).y.data(0).source","performance.grid.set(#).y.data.mean");
report_grid_funcs.graphs.add("graphs.graph(0).x.data(0).source","performance.grid.set(#).x.data.name");
report_grid_funcs.graphs.add("graphs.graph(0).y.data(0).title","Actual");
report_grid_funcs.graphs.add("graphs.graph(0).interpolation","lines");
boost::property_tree::xml_writer_settings<std::string> settings(' ', 4);
boost::property_tree::write_xml("grid_performance_funcs.xml", report_grid_funcs.graphs,std::locale(),settings);
GoogleChart cg;
std::string file_xml_ref(test_dir);
file_xml_ref += std::string("/openfpm_data/grid_performance_funcs_ref.xml");
StandardXMLPerformanceGraph("grid_performance_funcs.xml",file_xml_ref,cg);
addUpdtateTime(cg,1);
cg.write("grid_performance_funcs.html");
}
BOOST_AUTO_TEST_SUITE_END()
#endif /* OPENFPM_DATA_SRC_GRID_GRID_PERFORMANCE_TESTS_HPP_ */
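The new tests summarise each timing vector with standard_deviation(times,mean,dev) from util/stat/common_statistics.hpp. The helper's body is not part of this diff; the sketch below shows the shape of computation such a helper usually performs and is an assumption (e.g. sample vs population deviation may differ, and it assumes at least two samples):

#include <cmath>
#include <vector>

// Hypothetical mean / standard-deviation helper with the same call shape.
inline void standard_deviation_sketch(const std::vector<double> & v,
                                      double & mean, double & dev)
{
    mean = 0.0;
    for (double x : v) { mean += x; }
    mean /= v.size();

    dev = 0.0;
    for (double x : v) { dev += (x - mean) * (x - mean); }
    dev = std::sqrt(dev / (v.size() - 1));   // sample standard deviation
}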
......@@ -960,7 +960,8 @@ public:
* \return An iterator across the neighborhood particles
*
*/
template<unsigned int impl=NO_CHECK> inline CellNNIterator<dim,CellList<dim,T,Mem_type,transform,vector_pos_type>,(int)FULL,impl> getNNIterator(size_t cell)
template<unsigned int impl=NO_CHECK>
__attribute__((always_inline)) inline CellNNIterator<dim,CellList<dim,T,Mem_type,transform,vector_pos_type>,(int)FULL,impl> getNNIterator(size_t cell)
{
CellNNIterator<dim,CellList<dim,T,Mem_type,transform,vector_pos_type>,(int)FULL,impl> cln(cell,NNc_full,*this);
return cln;
......@@ -977,7 +978,8 @@ public:
* \return An iterator across the neighborhood particles
*
*/
template<unsigned int impl=NO_CHECK> inline CellNNIteratorRadius<dim,CellList<dim,T,Mem_type,transform,vector_pos_type>,impl> getNNIteratorRadius(size_t cell, T r_cut)
template<unsigned int impl=NO_CHECK>
__attribute__((always_inline)) inline CellNNIteratorRadius<dim,CellList<dim,T,Mem_type,transform,vector_pos_type>,impl> getNNIteratorRadius(size_t cell, T r_cut)
{
openfpm::vector<long int> & NNc = rcache[r_cut];
......@@ -1012,7 +1014,7 @@ public:
*
*/
template<unsigned int impl>
inline CellNNIteratorSym<dim,CellList<dim,T,Mem_type,transform,vector_pos_type>,vector_pos_type,(unsigned int)SYM,impl>
__attribute__((always_inline)) inline CellNNIteratorSym<dim,CellList<dim,T,Mem_type,transform,vector_pos_type>,vector_pos_type,(unsigned int)SYM,impl>
getNNIteratorSym(size_t cell, size_t p, const vector_pos_type & v)
{
#ifdef SE_CLASS1
......@@ -1047,7 +1049,7 @@ public:
*
*/
template<unsigned int impl, typename vector_pos_type2>
inline CellNNIteratorSymMP<dim,CellList<dim,T,Mem_type,transform,vector_pos_type>,vector_pos_type2,(unsigned int)SYM,impl>
__attribute__((always_inline)) inline CellNNIteratorSymMP<dim,CellList<dim,T,Mem_type,transform,vector_pos_type>,vector_pos_type2,(unsigned int)SYM,impl>
getNNIteratorSymMP(size_t cell, size_t p, const vector_pos_type2 & v_p1, const vector_pos_type2 & v_p2)
{
#ifdef SE_CLASS1
......@@ -1115,7 +1117,7 @@ public:
* \return the index
*
*/
inline const typename Mem_type::local_index_type & getStartId(typename Mem_type::local_index_type cell_id) const
__attribute__((always_inline)) inline const typename Mem_type::local_index_type & getStartId(typename Mem_type::local_index_type cell_id) const
{
return Mem_type::getStartId(cell_id);
}
......@@ -1127,7 +1129,7 @@ public:
* \return the stop index
*
*/
inline const typename Mem_type::local_index_type & getStopId(typename Mem_type::local_index_type cell_id) const
__attribute__((always_inline)) inline const typename Mem_type::local_index_type & getStopId(typename Mem_type::local_index_type cell_id) const
{
return Mem_type::getStopId(cell_id);
}
......@@ -1139,7 +1141,7 @@ public:
* \return the neighborhood id
*
*/
inline const typename Mem_type::local_index_type & get_lin(const typename Mem_type::local_index_type * part_id) const
__attribute__((always_inline)) inline const typename Mem_type::local_index_type & get_lin(const typename Mem_type::local_index_type * part_id) const
{
return Mem_type::get_lin(part_id);
}
......
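The getNNIterator family above is now forced inline, which matters because these iterators sit in the innermost loop of neighbourhood traversals. A usage sketch of that loop (a hypothetical helper; it only relies on the iterator interface shown in this diff: isNext, get and operator++):

// Count how many particle indices the FULL neighbourhood of "cell" contains.
template<typename CellListT>
size_t count_in_neighbourhood(CellListT & cl, size_t cell)
{
    size_t cnt = 0;
    auto it = cl.getNNIterator(cell);   // impl defaults to NO_CHECK
    while (it.isNext())
    {
        ++cnt;          // it.get() would return the stored particle index
        ++it;
    }
    return cnt;
}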
......@@ -1240,11 +1240,11 @@ struct execute_cl_test<1>
{
auto ite = pl.getGPUIterator();
calc_force_number_rad<decltype(pl.toKernel()),
CUDA_LAUNCH((calc_force_number_rad<decltype(pl.toKernel()),
decltype(s_t_ns.toKernel()),
decltype(cl2.toKernel()),
decltype(n_out.toKernel())>
<<<ite.wthr,ite.thr>>>(pl.toKernel(),
decltype(n_out.toKernel())>),
ite,pl.toKernel(),
s_t_ns.toKernel(),
cl2.toKernel(),
n_out.toKernel());
......@@ -1255,11 +1255,11 @@ struct execute_cl_test<1>
{
auto ite = pl.getGPUIterator();
calc_force_list_rad<decltype(pl.toKernel()),
CUDA_LAUNCH((calc_force_list_rad<decltype(pl.toKernel()),
decltype(s_t_ns.toKernel()),
decltype(cl2.toKernel()),
decltype(nn_list.toKernel())>
<<<ite.wthr,ite.thr>>>(pl.toKernel(),
decltype(nn_list.toKernel())>),
ite,pl.toKernel(),
s_t_ns.toKernel(),
cl2.toKernel(),
n_out_scan.toKernel(),
......@@ -1288,11 +1288,11 @@ struct execute_cl_test<2>
{
auto ite = s_t_ns.getGPUIterator();
calc_force_number_box_noato<decltype(pl.toKernel()),
CUDA_LAUNCH((calc_force_number_box_noato<decltype(pl.toKernel()),
decltype(s_t_ns.toKernel()),
decltype(cl2.toKernel()),
decltype(n_out.toKernel())>
<<<ite.wthr,ite.thr>>>(pl.toKernel(),
decltype(n_out.toKernel())>),
ite,pl.toKernel(),
s_t_ns.toKernel(),
cl2.toKernel(),
n_out.toKernel(),
......@@ -1304,11 +1304,12 @@ struct execute_cl_test<2>
{
auto ite = s_t_ns.getGPUIterator();
calc_force_number_box<decltype(pl.toKernel()),
CUDA_LAUNCH((calc_force_number_box<decltype(pl.toKernel()),
decltype(s_t_ns.toKernel()),
decltype(cl2.toKernel()),
decltype(n_out.toKernel())>
<<<ite.wthr,ite.thr>>>(pl.toKernel(),
decltype(n_out.toKernel())>),
ite,
pl.toKernel(),
s_t_ns.toKernel(),
cl2.toKernel(),
n_out.toKernel(),
......@@ -1320,11 +1321,11 @@ struct execute_cl_test<2>
{
auto ite = s_t_ns.getGPUIterator();
calc_force_list_box<decltype(pl.toKernel()),
CUDA_LAUNCH((calc_force_list_box<decltype(pl.toKernel()),
decltype(s_t_ns.toKernel()),
decltype(cl2.toKernel()),
decltype(nn_list.toKernel())>
<<<ite.wthr,ite.thr>>>(pl.toKernel(),
decltype(nn_list.toKernel())>),
ite,pl.toKernel(),
s_t_ns.toKernel(),
cl2.toKernel(),
n_out_scan.toKernel(),
......@@ -1486,8 +1487,9 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di
openfpm::vector<aggregate<unsigned int>,CudaMemory,typename memory_traits_inte<aggregate<unsigned int>>::type,memory_traits_inte> n_out_scan;
openfpm::vector<aggregate<unsigned int>,CudaMemory,typename memory_traits_inte<aggregate<unsigned int>>::type,memory_traits_inte> nn_list;
scan<unsigned int,unsigned char> sc;
sc.scan_(n_out,n_out_scan);
n_out_scan.resize(pl.size()+1);
mgpu::scan((unsigned int *)n_out.template getDeviceBuffer<0>(),n_out.size(),(unsigned int *)n_out_scan.template getDeviceBuffer<0>(),context);
n_out_scan.template deviceToHost<0>();
if (n_out_scan.template get<0>(pl.size()) == 0)
......@@ -1551,7 +1553,6 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di
}
BOOST_REQUIRE_EQUAL(check,true);
}
}
......
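The change above replaces the scan<...> helper with mgpu::scan over the raw device buffers; n_out_scan is sized pl.size()+1 so that, after deviceToHost, its last entry holds the total number of neighbour entries. A sketch of how such an exclusive-scan output is typically consumed (a hypothetical host-side helper, assuming n_out_scan has already been copied to the host):

// Particle i owns the neighbour-list entries in [scan[i], scan[i+1]).
template<typename vector_scan>
unsigned int neighbours_of(vector_scan & n_out_scan, size_t i)
{
    unsigned int start = n_out_scan.template get<0>(i);
    unsigned int stop  = n_out_scan.template get<0>(i + 1);
    return stop - start;
}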
......@@ -59,7 +59,7 @@ protected:
/*! \brief Select non-empty cell
*
*/
inline void selectValid()
__attribute__((always_inline)) inline void selectValid()
{
while (start_id == stop_id)
{
......@@ -89,7 +89,7 @@ public:
* \param cl Cell structure
*
*/
inline CellNNIterator(size_t cell, const NNc_array<dim,NNc_size> &NNc, Cell & cl)
__attribute__((always_inline)) inline CellNNIterator(size_t cell, const NNc_array<dim,NNc_size> &NNc, Cell & cl)
:NNc_id(0),cell(cell),cell_id(NNc[NNc_id] + cell),cl(cl),NNc(NNc)
{
start_id = &cl.getStartId(cell_id);
......@@ -102,7 +102,7 @@ public:
* \return true if there is the next element
*
*/
inline bool isNext()
__attribute__((always_inline)) inline bool isNext()
{
if (NNc_id >= NNc_size)
return false;
......@@ -114,7 +114,7 @@ public:
* \return itself
*
*/
inline CellNNIterator & operator++()
__attribute__((always_inline)) inline CellNNIterator & operator++()
{
start_id++;
......@@ -128,7 +128,7 @@ public:
* \return the next element object
*
*/
inline const typename Cell::Mem_type_type::local_index_type & get()
__attribute__((always_inline)) inline const typename Cell::Mem_type_type::local_index_type & get()
{
return cl.get_lin(start_id);
}
......@@ -138,7 +138,7 @@ public:
* \return the next element object
*
*/
inline const typename Cell::Mem_type_type::local_index_type & get() const
__attribute__((always_inline)) inline const typename Cell::Mem_type_type::local_index_type & get() const
{
return cl.get_lin(start_id);
}
......@@ -173,7 +173,7 @@ class CellNNIteratorSym : public CellNNIterator<dim,Cell,NNc_size,impl>
/*! Select the next valid element
*
*/
inline void selectValid()
__attribute__((always_inline)) inline void selectValid()
{
if (this->NNc[this->NNc_id] == 0)
{
......@@ -212,7 +212,7 @@ public:
* \param cl Cell structure
*
*/
inline CellNNIteratorSym(size_t cell, size_t p, const NNc_array<dim,NNc_size> &NNc, Cell & cl, const vector_pos_type & v)
__attribute__((always_inline)) inline CellNNIteratorSym(size_t cell, size_t p, const NNc_array<dim,NNc_size> &NNc, Cell & cl, const vector_pos_type & v)
:CellNNIterator<dim,Cell,NNc_size,impl>(cell,NNc,cl),p(p),v(v)
{
selectValid();
......@@ -224,7 +224,7 @@ public:
* \return itself
*
*/
inline CellNNIteratorSym<dim,Cell,vector_pos_type,NNc_size,impl> & operator++()
__attribute__((always_inline)) inline CellNNIteratorSym<dim,Cell,vector_pos_type,NNc_size,impl> & operator++()
{
this->start_id++;
......@@ -265,7 +265,7 @@ class CellNNIteratorSymMP : public CellNNIterator<dim,Cell,NNc_size,impl>
/*! Select the next valid element
*
*/
inline void selectValid()
__attribute__((always_inline)) inline void selectValid()
{
if (this->NNc[this->NNc_id] == 0)
{
......@@ -304,7 +304,7 @@ public:
* \param cl Cell structure
*
*/
inline CellNNIteratorSymMP(size_t cell,
__attribute__((always_inline)) inline CellNNIteratorSymMP(size_t cell,
size_t p,
const NNc_array<dim,NNc_size> &NNc,
Cell & cl,
......@@ -321,7 +321,7 @@ public:
* \return itself
*
*/
inline CellNNIteratorSymMP<dim,Cell,vector_pos_type,NNc_size,impl> & operator++()
__attribute__((always_inline)) inline CellNNIteratorSymMP<dim,Cell,vector_pos_type,NNc_size,impl> & operator++()
{
this->start_id++;
......@@ -357,7 +357,7 @@ public:
* \param cl Cell on which iterate
*
*/
inline CellIterator(const size_t cell, Cell & cl)
__attribute__((always_inline)) inline CellIterator(const size_t cell, Cell & cl)
:cl(cl),ele_id(0),cell(cell)
{
}
......@@ -367,7 +367,7 @@ public:
* \return true if there are still neighborhood particles
*
*/
inline bool isNext()
__attribute__((always_inline)) inline bool isNext()
{
return cl.getNelements(cell) > ele_id;
}
......@@ -377,7 +377,7 @@ public:
* \return itself
*
*/
inline CellIterator & operator++()
__attribute__((always_inline)) inline CellIterator & operator++()
{
ele_id++;
......@@ -389,7 +389,7 @@ public:
* \return the next element object
*
*/
inline typename Cell::value_type & get()
__attribute__((always_inline)) inline typename Cell::value_type & get()
{
return cl.get(cell,ele_id);
}
......@@ -399,7 +399,7 @@ public:
* \return the next element object
*
*/
inline const typename Cell::value_type & get() const
__attribute__((always_inline)) inline const typename Cell::value_type & get() const
{
return cl.get(cell,ele_id);
}
......
......@@ -61,7 +61,7 @@ protected:
/*! \brief Select non-empty cell
*
*/
inline void selectValid()
__attribute__((always_inline)) inline void selectValid()
{
while (start_id == stop_id)
{
......@@ -92,7 +92,7 @@ public:
* \param cl Cell structure
*
*/
inline CellNNIterator(size_t cell, const long int * NNc, size_t NNc_size, Cell & cl)
__attribute__((always_inline)) inline CellNNIterator(size_t cell, const long int * NNc, size_t NNc_size, Cell & cl)
:NNc_id(0),NNc_size(NNc_size),cell(cell),cell_id(NNc[NNc_id] + cell),cl(cl),NNc(NNc)
{
start_id = &cl.getStartId(cell_id);
......@@ -105,7 +105,7 @@ public:
* \return true if there is the next element
*
*/
inline bool isNext()
__attribute__((always_inline)) inline bool isNext()
{
if (NNc_id >= NNc_size)
return false;
......@@ -117,7 +117,7 @@ public:
* \return itself
*
*/
inline CellNNIterator & operator++()
__attribute__((always_inline)) inline CellNNIterator & operator++()
{
start_id++;