...
 
Commits (29)
......@@ -24,7 +24,6 @@ if (ENABLE_GPU)
elseif ( CUDA_VERSION_MAJOR EQUAL 10 AND CUDA_VERSION_MINOR EQUAL 1 )
message("CUDA is compatible")
set(WARNING_SUPPRESSION_AND_OPTION_NVCC -Xcudafe "--display_error_number --diag_suppress=2915 --diag_suppress=2914 --diag_suppress=2912 --diag_suppress=2913 --diag_suppress=111 --diag_suppress=186 --diag_suppress=611 " --expt-extended-lambda)
else()
message(FATAL_ERROR "CUDA is incompatible, version 9.2 is only supported")
endif()
......
......@@ -3,12 +3,30 @@ cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
########################### Executables
if (CUDA_FOUND)
set(CUDA_SOURCES Vector/vector_gpu_unit_tests.cu Grid/cuda/cuda_grid_gpu_tests.cu Vector/cuda/map_vector_cuda_funcs_tests.cu ../../openfpm_devices/src/memory/CudaMemory.cu NN/CellList/CellList_gpu_test.cu util/cuda/scan_cuda_unit_tests.cu Grid/cuda/cuda_grid_unit_tests_func.cu util/cuda/modern_gpu_tests.cu)
set(CUDA_SOURCES Vector/vector_gpu_unit_tests.cu
Grid/cuda/cuda_grid_gpu_tests.cu
Vector/cuda/map_vector_cuda_funcs_tests.cu
../../openfpm_devices/src/memory/CudaMemory.cu
NN/CellList/CellList_gpu_test.cu util/cuda/scan_cuda_unit_tests.cu
Grid/cuda/cuda_grid_unit_tests_func.cu
util/cuda/modern_gpu_tests.cu
Vector/cuda/map_vector_sparse_cuda_ker_unit_tests.cu
Vector/cuda/map_vector_sparse_cuda_kernels_unit_tests.cu
NN/CellList/tests/CellDecomposer_gpu_ker_unit_test.cu )
else()
set(CUDA_SOURCES )
endif()
add_executable(mem_map ../../openfpm_devices/src/Memleak_check.cpp main.cpp util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp memory_ly/memory_conf_unit_tests.cpp Space/tests/SpaceBox_unit_tests.cpp Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp ${CUDA_SOURCES} ../../openfpm_devices/src/memory/HeapMemory.cpp ../../openfpm_devices/src/memory/PtrMemory.cpp)
add_executable(mem_map ../../openfpm_devices/src/Memleak_check.cpp
${CUDA_SOURCES}
main.cpp
Vector/map_vector_sparse_unit_tests.cpp
util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp
memory_ly/memory_conf_unit_tests.cpp
Space/tests/SpaceBox_unit_tests.cpp
Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp
../../openfpm_devices/src/memory/HeapMemory.cpp
../../openfpm_devices/src/memory/PtrMemory.cpp)
if ( CMAKE_COMPILER_IS_GNUCC )
target_compile_options(mem_map PRIVATE "-Wno-deprecated-declarations")
......@@ -214,13 +232,20 @@ install(FILES Vector/map_vector.hpp
Vector/vector_std_pack_unpack.ipp
Vector/vector_pack_unpack.ipp
Vector/vector_map_iterator.hpp
Vector/map_vector_printers.hpp
Vector/map_vector_sparse.hpp
DESTINATION openfpm_data/include/Vector )
install(FILES Vector/cuda/map_vector_cuda_ker.cuh
Vector/cuda/map_vector_std_cuda_ker.cuh
Vector/cuda/map_vector_std_cuda.hpp
Vector/cuda/map_vector_sparse_cuda_ker.cuh
Vector/cuda/map_vector_sparse_cuda_kernels.cuh
DESTINATION openfpm_data/include/Vector/cuda/ )
install(DIRECTORY util/cuda/cub/
DESTINATION openfpm_data/include/util/cuda/cub )
install(FILES util/multi_array_openfpm/array_openfpm.hpp
util/multi_array_openfpm/multi_array_iterator_openfpm.hpp
util/multi_array_openfpm/multi_array_ref_base_openfpm.hpp
......@@ -235,6 +260,7 @@ install(FILES util/cuda/scan_cuda.cuh
util/cuda/ofp_context.hxx
util/cuda/cuda_launch.hpp
util/cuda/cuda_kernel_error_checker.hpp
util/cuda/kernels.cuh
DESTINATION openfpm_data/include/util/cuda )
install(FILES util/cuda/moderngpu/context.hxx
......
......@@ -37,6 +37,9 @@ struct copy_ndim_grid_impl<2,grid_type>
key_src.set_d(0,threadIdx.x + blockIdx.x * blockDim.x);
key_src.set_d(1,threadIdx.y + blockIdx.y * blockDim.y);
if (key_src.get(0) >= src.getGrid().size(0)) {return;}
if (key_src.get(1) >= src.getGrid().size(1)) {return;}
dst.get_o(key_src) = src.get_o(key_src);
}
};
......@@ -51,6 +54,10 @@ struct copy_ndim_grid_impl<3,grid_type>
key_src.set_d(1,threadIdx.y + blockIdx.y * blockDim.y);
key_src.set_d(2,threadIdx.y + blockIdx.y * blockDim.y);
if (key_src.get(0) >= src.getGrid().size(0)) {return;}
if (key_src.get(1) >= src.getGrid().size(1)) {return;}
if (key_src.get(2) >= src.getGrid().size(2)) {return;}
dst.get_o(key_src) = src.get_o(key_src);
}
};
......
......@@ -27,8 +27,6 @@ __global__ void test_launch(vector_pos_type set_points, vector_prop_type prop, B
v[0] = prop.template get<1>(p)[0];
v[1] = prop.template get<1>(p)[1];
v[2] = prop.template get<1>(p)[2];
printf("Point p %f %f %f scalar: %f vector: %f %f %f \n",pos[0],pos[1],pos[2],scalar,v[0],v[1],v[2]);
}
template<typename grid_type>
......
......@@ -64,97 +64,114 @@ struct skip_init<true,T>
if (key.get(0) > ite_gpu.stop.get(0) || key.get(1) > ite_gpu.stop.get(1))\
{return;}
template<unsigned int dim>
struct ite_gpu
#ifdef __NVCC__
template<unsigned int dim, typename ids_type = int>
struct grid_p
{
dim3 thr;
dim3 wthr;
__device__ static inline grid_key_dx<dim,ids_type> get_grid_point(const grid_sm<dim,void> & g)
{
grid_key_dx<dim,ids_type> key;
grid_key_dx<dim> start;
grid_key_dx<dim> stop;
};
key.set_d(0,blockIdx.x * blockDim.x + threadIdx.x);
key.set_d(1,blockIdx.y * blockDim.y + threadIdx.y);
unsigned int bz = blockIdx.z * blockDim.z + threadIdx.z;
key.set_d(2,bz % g.size(2));
template<unsigned int dim, typename T>
ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T> & g1, grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = 1024)
{
size_t tot_work = 1;
for (size_t i = 0 ; i < dim ; i++)
{tot_work *= key2.get(i) - key1.get(i) + 1;}
size_t n = (tot_work <= n_thr)?openfpm::math::round_big_2(tot_work):n_thr;
for (unsigned int i = 3 ; i < dim ; i++)
{
bz /= g.size(i);
key.set_d(i,bz % g.size(i));
}
// Work to do
ite_gpu<dim> ig;
return key;
}
if (tot_work == 0)
__device__ static inline grid_key_dx<dim,ids_type> get_grid_point(const openfpm::array<ids_type,dim,unsigned int> & g)
{
ig.thr.x = 0;
ig.thr.y = 0;
ig.thr.z = 0;
grid_key_dx<dim,ids_type> key;
ig.wthr.x = 0;
ig.wthr.y = 0;
ig.wthr.z = 0;
key.set_d(0,blockIdx.x * blockDim.x + threadIdx.x);
key.set_d(1,blockIdx.y * blockDim.y + threadIdx.y);
return ig;
}
unsigned int bz = blockIdx.z * blockDim.z + threadIdx.z;
key.set_d(2,bz % g[2]);
ig.thr.x = 1;
ig.thr.y = 1;
ig.thr.z = 1;
for (unsigned int i = 3 ; i < dim ; i++)
{
bz /= g[i];
key.set_d(i,bz % g[i]);
}
int dir = 0;
return key;
}
};
while (n != 1)
template<typename ids_type>
struct grid_p<3,ids_type>
{
__device__ static inline grid_key_dx<3,ids_type> get_grid_point(const grid_sm<3,void> & g)
{
if (dir % 3 == 0)
{ig.thr.x = ig.thr.x << 1;}
else if (dir % 3 == 1)
{ig.thr.y = ig.thr.y << 1;}
else if (dir % 3 == 2)
{ig.thr.z = ig.thr.z << 1;}
grid_key_dx<3,unsigned int> key;
n = n >> 1;
key.set_d(0,blockIdx.x * blockDim.x + threadIdx.x);
key.set_d(1,blockIdx.y * blockDim.y + threadIdx.y);
key.set_d(2,blockIdx.z * blockDim.z + threadIdx.z);
dir++;
dir %= dim;
return key;
}
if (dim >= 1)
{ig.wthr.x = (key2.get(0) - key1.get(0) + 1) / ig.thr.x + (((key2.get(0) - key1.get(0) + 1)%ig.thr.x != 0)?1:0);}
__device__ static inline grid_key_dx<3,ids_type> get_grid_point(const openfpm::array<ids_type,3,unsigned int> & g)
{
grid_key_dx<3,ids_type> key;
if (dim >= 2)
{ig.wthr.y = (key2.get(1) - key1.get(1) + 1) / ig.thr.y + (((key2.get(1) - key1.get(1) + 1)%ig.thr.y != 0)?1:0);}
else
{ig.wthr.y = 1;}
key.set_d(0,blockIdx.x * blockDim.x + threadIdx.x);
key.set_d(1,blockIdx.y * blockDim.y + threadIdx.y);
key.set_d(2,blockIdx.z * blockDim.z + threadIdx.z);
if (dim >= 3)
return key;
}
};
template<typename ids_type>
struct grid_p<2,ids_type>
{
__device__ static inline grid_key_dx<2,ids_type> get_grid_point(const grid_sm<2,void> & g)
{
// Roll the other dimensions on z
ig.wthr.z = 1;
for (size_t i = 2 ; i < dim ; i++)
{ig.wthr.z *= (key2.get(i) - key1.get(i) + 1) / ig.thr.z + (((key2.get(i) - key1.get(i) + 1)%ig.thr.z != 0)?1:0);}
grid_key_dx<2,ids_type> key;
key.set_d(0,blockIdx.x * blockDim.x + threadIdx.x);
key.set_d(1,blockIdx.y * blockDim.y + threadIdx.y);
return key;
}
else
{ig.wthr.z = 1;}
// crop if wthr == 1
__device__ static inline grid_key_dx<2,ids_type> get_grid_point(const openfpm::array<ids_type,2,unsigned int> & g)
{
grid_key_dx<2,ids_type> key;
if (dim >= 1 && ig.wthr.x == 1)
{ig.thr.x = (key2.get(0) - key1.get(0) + 1);}
key.set_d(0,blockIdx.x * blockDim.x + threadIdx.x);
key.set_d(1,blockIdx.y * blockDim.y + threadIdx.y);
if (dim >= 2 && ig.wthr.y == 1)
{ig.wthr.y = key2.get(1) - key1.get(1) + 1;}
return key;
}
};
if (dim == 3 && ig.wthr.z == 1)
{ig.wthr.z = key2.get(2) - key1.get(2) + 1;}
template<typename ids_type>
struct grid_p<1,ids_type>
{
__device__ static inline grid_key_dx<1,unsigned int> get_grid_point(const grid_sm<1,void> & g)
{
grid_key_dx<1,unsigned int> key;
ig.start = key1;
ig.stop = key2;
key.set_d(0,blockIdx.x * blockDim.x + threadIdx.x);
return ig;
}
return key;
}
};
#endif
template<unsigned int dim>
bool has_work_gpu(ite_gpu<dim> & ite)
......@@ -405,7 +422,7 @@ private:
grid_sm<1,void> g_sm_copy(sz);
auto ite = getGPUIterator_impl<1,void>(g_sm_copy,start,stop);
auto ite = getGPUIterator_impl<1>(g_sm_copy,start,stop);
copy_ndim_grid_device<dim,decltype(grid_new.toKernel())><<<ite.wthr,ite.thr>>>(this->toKernel(),grid_new.toKernel());
}
......
......@@ -116,6 +116,17 @@ public:
{this->k[i] = k[i];}
}
/*! \brief Constructor from buffer reference
*
* \param k reference buffer
*
*/
__device__ __host__ inline grid_key_dx(const unsigned int (&k)[dim])
{
for (size_t i = 0 ; i < dim ; i++)
{this->k[i] = k[i];}
}
/*! \brief Construct a grid key from a list of numbers
*
* \param cmb combination
......
......@@ -13,9 +13,6 @@
#include "util/mathutil.hpp"
#include "iterators/stencil_type.hpp"
#define PERIODIC 1
#define NON_PERIODIC 0
// Box need the definition of grid_key_dx_r
#define HARDWARE 1
......@@ -67,9 +64,31 @@ public:
}
};
template<unsigned int dim>
struct ite_gpu
{
#ifdef CUDA_GPU
dim3 thr;
dim3 wthr;
grid_key_dx<dim,int> start;
grid_key_dx<dim,int> stop;
size_t nblocks()
{
return wthr.x * wthr.y * wthr.z;
}
#endif
};
//! Declaration grid_sm
template<unsigned int N, typename T> class grid_sm;
template<unsigned int dim, typename T2, typename T>
ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, grid_key_dx<dim,T> & key1, grid_key_dx<dim,T> & key2, size_t n_thr = 1024);
//! Declaration print_warning_on_adjustment
template <unsigned int dim> class print_warning_on_adjustment;
......@@ -691,6 +710,42 @@ public:
return grid_key_dx_iterator_sub<N>(*this,start,stop);
}
#ifdef CUDA_GPU
/*! \brief Get an iterator for the GPU
*
* \param start starting point
* \param stop end point
*
*/
template<typename T2>
struct ite_gpu<N> getGPUIterator(grid_key_dx<N,T2> & key1, grid_key_dx<N,T2> & key2, size_t n_thr = 1024) const
{
return getGPUIterator_impl<N>(*this,key1,key2,n_thr);
}
/*! \brief Get an iterator for the GPU
*
* \param start starting point
* \param stop end point
*
*/
struct ite_gpu<N> getGPUIterator(size_t n_thr = 1024) const
{
grid_key_dx<N> k1;
grid_key_dx<N> k2;
for (size_t i = 0 ; i < N ; i++)
{
k1.set_d(i,0);
k2.set_d(i,size(i));
}
return getGPUIterator_impl<N>(*this,k1,k2,n_thr);
}
#endif
/*! \brief swap the grid_sm informations
*
* \param g grid to swap
......@@ -734,6 +789,89 @@ public:
};
template<unsigned int dim, typename T2, typename T>
ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, grid_key_dx<dim,T> & key1, grid_key_dx<dim,T> & key2, size_t n_thr)
{
size_t tot_work = 1;
for (size_t i = 0 ; i < dim ; i++)
{tot_work *= key2.get(i) - key1.get(i) + 1;}
size_t n = (tot_work <= n_thr)?openfpm::math::round_big_2(tot_work):n_thr;
// Work to do
ite_gpu<dim> ig;
if (tot_work == 0)
{
ig.thr.x = 0;
ig.thr.y = 0;
ig.thr.z = 0;
ig.wthr.x = 0;
ig.wthr.y = 0;
ig.wthr.z = 0;
return ig;
}
ig.thr.x = 1;
ig.thr.y = 1;
ig.thr.z = 1;
int dir = 0;
while (n != 1)
{
if (dir % 3 == 0)
{ig.thr.x = ig.thr.x << 1;}
else if (dir % 3 == 1)
{ig.thr.y = ig.thr.y << 1;}
else if (dir % 3 == 2)
{ig.thr.z = ig.thr.z << 1;}
n = n >> 1;
dir++;
dir %= dim;
}
if (dim >= 1)
{ig.wthr.x = (key2.get(0) - key1.get(0) + 1) / ig.thr.x + (((key2.get(0) - key1.get(0) + 1)%ig.thr.x != 0)?1:0);}
if (dim >= 2)
{ig.wthr.y = (key2.get(1) - key1.get(1) + 1) / ig.thr.y + (((key2.get(1) - key1.get(1) + 1)%ig.thr.y != 0)?1:0);}
else
{ig.wthr.y = 1;}
if (dim >= 3)
{
// Roll the other dimensions on z
ig.wthr.z = 1;
for (size_t i = 2 ; i < dim ; i++)
{ig.wthr.z *= (key2.get(i) - key1.get(i) + 1) / ig.thr.z + (((key2.get(i) - key1.get(i) + 1)%ig.thr.z != 0)?1:0);}
}
else
{ig.wthr.z = 1;}
// crop if wthr == 1
if (dim >= 1 && ig.wthr.x == 1)
{ig.thr.x = (key2.get(0) - key1.get(0) + 1);}
if (dim >= 2 && ig.wthr.y == 1)
{ig.wthr.y = key2.get(1) - key1.get(1) + 1;}
if (dim == 3 && ig.wthr.z == 1)
{ig.wthr.z = key2.get(2) - key1.get(2) + 1;}
for (size_t i = 0 ; i < dim ; i++)
{
ig.start.set_d(i,key1.get(i));
ig.stop.set_d(i,key2.get(i));
}
return ig;
}
/*! \brief Emulate grid_key_dx with runtime dimensionality
......
......@@ -270,6 +270,7 @@ void NNcalc_rad(T r_cut, openfpm::vector<long int> & NNcell, const Box<dim,T> &
cell_zero.setHigh(i,(n_cell_mid[i]+1)*spacing.get(i));
}
NNcell.clear();
while (gkdi.isNext())
{
auto key = gkdi.get();
......@@ -1211,6 +1212,9 @@ public:
/////////////////////////////////////
void re_setBoxNN()
{}
/////////////////////////////////////
/*! \brief Set the n_dec number
......
......@@ -462,6 +462,8 @@ BOOST_AUTO_TEST_CASE( ParticleIt_Cells_iterator )
while (it_cl.isNext())
{
auto i = it_cl.get();
count++;
++it_cl;
}
......
This diff is collapsed.
......@@ -94,6 +94,12 @@ public:
stop = &cli.getStopId(s_cell);
selectValid();
while (*start >= g_m)
{
++start;
selectValid();
}
}
/*! \brief Increment to the next particle
......
/*
 * CellDecomposer_gpu_ker.hpp
 *
 * Created on: Apr 28, 2019
 * Author: i-bird
 */
#ifndef CELLDECOMPOSER_GPU_KER_HPP_
#define CELLDECOMPOSER_GPU_KER_HPP_
#include "util/multi_array_openfpm/array_openfpm.hpp"
#include "Grid/grid_sm.hpp"
#include "NN/CellList/cuda/Cuda_cell_list_util_func.hpp"
#include "NN/CellList/CellDecomposer.hpp"
/*! \brief GPU-kernel-side view of a cell decomposer
 *
 * Lightweight, copy-by-value object passed to CUDA kernels; it carries the
 * decomposition parameters (spacing, divisions, ghost-cell offset and a
 * coordinate transformation) and delegates the actual cell-id math to cid_.
 *
 * \tparam dim dimensionality of the space
 * \tparam T scalar type of the coordinates (e.g. float)
 * \tparam cnt_type type used for counters / linearized cell ids
 * \tparam ids_type type used for per-dimension cell indices
 * \tparam transform coordinate transformation applied before cell lookup
 */
template <unsigned int dim, typename T, typename cnt_type, typename ids_type, typename transform>
class CellDecomposer_gpu_ker
{
//! Spacing
openfpm::array<T,dim,cnt_type> spacing_c;
//! \brief number of sub-divisions in each direction
openfpm::array<ids_type,dim,cnt_type> div_c;
//! \brief cell offset
openfpm::array<ids_type,dim,cnt_type> off;
//! transformation
transform t;
public:
//! Construct from the decomposition parameters (copied into this object)
__device__ __host__ CellDecomposer_gpu_ker(openfpm::array<T,dim,cnt_type> & spacing_c,
openfpm::array<ids_type,dim,cnt_type> & div_c,
openfpm::array<ids_type,dim,cnt_type> & off,
const transform & t)
:spacing_c(spacing_c),div_c(div_c),off(off),t(t)
{}
/*! \brief Return the grid information including the ghost/padding cells
 *
 * Each direction is enlarged by 2*off[i] (off[i] padding cells per side).
 */
__host__ grid_sm<dim,void> getGrid()
{
size_t sz[dim];
for (size_t i = 0 ; i < dim ; i++)
{
sz[i] = div_c[i] + 2*off[i];
}
return grid_sm<dim,void> (sz);
}
/*! \brief Map a point to its (per-dimension) cell key, padding offset included
 *
 * \param xp point to map
 * \return the cell key of the point
 */
__device__ __host__ inline grid_key_dx<dim,ids_type> getCell(const Point<dim,T> & xp) const
{
return cid_<dim,cnt_type,ids_type,transform>::get_cid_key(spacing_c,off,t,xp);
}
/*! \brief Linearize a cell key into a single cell id
 *
 * \param k cell key
 * \return the linearized cell id
 */
__device__ __host__ inline cnt_type LinId(const grid_key_dx<dim,ids_type> & k) const
{
return cid_<dim,cnt_type,ids_type,transform>::get_cid(div_c,k);
}
//! Accessor: cell spacing in each direction
__device__ inline const openfpm::array<T,dim,cnt_type> & get_spacing_c() const
{
return spacing_c;
}
//! Accessor: number of sub-divisions in each direction
__device__ inline const openfpm::array<ids_type,dim,cnt_type> & get_div_c() const
{
return div_c;
}
//! Accessor: padding-cell offset in each direction
__device__ inline const openfpm::array<ids_type,dim,cnt_type> & get_off() const
{
return off;
}
//! Accessor: the coordinate transformation
__device__ inline const transform & get_t() const
{
return t;
}
};
#endif /* CELLDECOMPOSER_GPU_KER_HPP_ */
This diff is collapsed.
This diff is collapsed.
......@@ -9,6 +9,7 @@
#define OPENFPM_DATA_SRC_NN_CELLLIST_CUDA_CUDA_CELL_LIST_UTIL_FUNC_HPP_
#include <boost/integer/integer_mask.hpp>
#include <Vector/map_vector_sparse.hpp>
template<unsigned int dim, typename cnt_type, typename ids_type, typename transform>
struct cid_
......@@ -24,7 +25,7 @@ struct cid_
return id;
}
static inline __device__ __host__ cnt_type get_cid(openfpm::array<ids_type,dim,cnt_type> & div_c , const grid_key_dx<1,cnt_type> & e)
static inline __device__ __host__ cnt_type get_cid(const openfpm::array<ids_type,dim,cnt_type> & div_c , const grid_key_dx<1,cnt_type> & e)
{
cnt_type id = e.get(dim-1);
......@@ -35,8 +36,8 @@ struct cid_
return id;
}
template<typename T> static inline __device__ __host__ cnt_type get_cid(openfpm::array<ids_type,dim,cnt_type> & div_c,
openfpm::array<T,dim,cnt_type> & spacing,
template<typename T> static inline __device__ __host__ cnt_type get_cid(const openfpm::array<ids_type,dim,cnt_type> & div_c,
const openfpm::array<T,dim,cnt_type> & spacing,
const transform & t,
const Point<dim,T> & p)
{
......@@ -53,18 +54,18 @@ struct cid_
template<typename cnt_type, typename ids_type, typename transform>
struct cid_<1,cnt_type,ids_type, transform>
{
static inline __device__ __host__ cnt_type get_cid(openfpm::array<ids_type,1,cnt_type> & div_c, ids_type * e)
static inline __device__ __host__ cnt_type get_cid(const openfpm::array<ids_type,1,cnt_type> & div_c, ids_type * e)
{
return e[0];
}
static inline __device__ __host__ cnt_type get_cid(const openfpm::array<ids_type,1,cnt_type> & div_c, const grid_key_dx<1,cnt_type> & e)
static inline __device__ __host__ cnt_type get_cid(const openfpm::array<ids_type,1,cnt_type> & div_c, const grid_key_dx<1,ids_type> & e)
{
return e.get(0);
}
template<typename T> static inline __device__ __host__ cnt_type get_cid(openfpm::array<ids_type,1,cnt_type> & div_c,
openfpm::array<T,1,cnt_type> & spacing,
template<typename T> static inline __device__ __host__ cnt_type get_cid(const openfpm::array<ids_type,1,cnt_type> & div_c,
const openfpm::array<T,1,cnt_type> & spacing,
const transform & t,
const Point<1,T> & p)
{
......@@ -75,7 +76,7 @@ struct cid_<1,cnt_type,ids_type, transform>
template<typename cnt_type, typename ids_type, typename transform>
struct cid_<2,cnt_type,ids_type,transform>
{
static inline __device__ __host__ cnt_type get_cid(openfpm::array<ids_type,2,cnt_type> & div_c, ids_type * e)
static inline __device__ __host__ cnt_type get_cid(const openfpm::array<ids_type,2,cnt_type> & div_c, ids_type * e)
{
return e[0] + div_c[0] * e[1];
}
......@@ -109,7 +110,8 @@ struct cid_<2,cnt_type,ids_type,transform>
return e[0] + e[1]*div_c[0];
}
template<typename T> static inline __device__ __host__ grid_key_dx<2,ids_type> get_cid_key(const openfpm::array<T,2,cnt_type> & spacing,
template<typename T>
static inline __device__ __host__ grid_key_dx<2,ids_type> get_cid_key(const openfpm::array<T,2,cnt_type> & spacing,
const openfpm::array<ids_type,2,cnt_type> & off,
const transform & t,
const Point<2,T> & p)
......@@ -147,6 +149,13 @@ struct cid_<3,cnt_type,ids_type,transform>
return e.get(0) + (e.get(1) + e.get(2)*div_c[1])*div_c[0];
}
static inline __device__ __host__ cnt_type get_cid(const openfpm::array<ids_type,3,cnt_type> & div_c,
const openfpm::array<ids_type,3,cnt_type> & off,
const grid_key_dx<3,ids_type> & e)
{
return (e.get(0) + off[0]) + ((e.get(1) + off[1]) + (e.get(2) + off[2])*div_c[1])*div_c[0];
}
template<typename T> static inline __device__ __host__ cnt_type get_cid(const openfpm::array<ids_type,3,cnt_type> & div_c,
const openfpm::array<T,3,cnt_type> & spacing,
const openfpm::array<ids_type,3,cnt_type> & off,
......@@ -173,7 +182,8 @@ struct cid_<3,cnt_type,ids_type,transform>
return e[0] + (e[1] + e[2]*div_c[1])*div_c[0];
}
template<typename T> static inline __device__ __host__ grid_key_dx<3,ids_type> get_cid_key(const openfpm::array<T,3,cnt_type> & spacing,
template<typename T>
static inline __device__ __host__ grid_key_dx<3,ids_type> get_cid_key(const openfpm::array<T,3,cnt_type> & spacing,
const openfpm::array<ids_type,3,cnt_type> & off,
const transform & t,
const Point<3,T> & p)
......@@ -214,8 +224,68 @@ __device__ __host__ cnt_type encode_phase_id(cnt_type ph_id,cnt_type pid)
#ifdef __NVCC__
template<unsigned int dim, typename pos_type, typename cnt_type, typename ids_type, typename transform>
/*! \brief Kernel: compute, for each particle, the cell it belongs to
 *
 * One thread per particle. Positions are stored component-major: coordinate k
 * of particle i sits at p_pos[i + k*n_cap] (n_cap is the position-buffer
 * capacity — TODO confirm against caller).
 *
 * Dense mode (is_sparse == false): atomically counts particles per cell in
 * counts[cid] and stores the per-dimension cell key plus the insertion slot
 * e[dim] into p_ids. Sparse mode: stores the cell key in p_ids and the
 * linearized cell id in counts[ins].
 *
 * \param start offset added to the thread index to get the particle index
 *        (ins, without the offset, indexes the output buffers)
 */
template<bool is_sparse,unsigned int dim, typename pos_type,
typename cnt_type, typename ids_type, typename transform>
__global__ void subindex(openfpm::array<ids_type,dim,cnt_type> div,
openfpm::array<pos_type,dim,cnt_type> spacing,
openfpm::array<ids_type,dim,cnt_type> off,
transform t,
int n_cap,
int n_part,
int n_cap2,
cnt_type start,
pos_type * p_pos,
cnt_type *counts,
ids_type * p_ids)
{
cnt_type i, cid, ins;
// e holds the per-dimension cell key plus one extra slot (e[dim])
// used in dense mode for the per-cell insertion index
ids_type e[dim+1];
i = threadIdx.x + blockIdx.x * blockDim.x + start;
ins = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= n_part) return;
pos_type p[dim];
for (size_t k = 0 ; k < dim ; k++)
{p[k] = p_pos[i+k*n_cap];}
// compute the linearized cell id; e is filled with the per-dimension key
cid = cid_<dim,cnt_type,ids_type,transform>::get_cid(div,spacing,off,t,p,e);
if (is_sparse == false)
{
// reserve a slot in cell cid and remember it in e[dim]
e[dim] = atomicAdd(counts + cid, 1);
for (size_t k = 0 ; k <= dim ; k++)
{p_ids[ins+k*(n_cap2)] = e[k];}
}
else
{
for (size_t k = 0 ; k <= dim ; k++)
{p_ids[ins+k*(n_cap2)] = e[k];}
// sparse path: record the cell id itself instead of a count
counts[ins] = cid;
}
}
/*! \brief Kernel: insert each (cell id -> element index) pair into a sparse vector
 *
 * One thread per element of vc. Note init() and flush_block_insert() are
 * called by ALL threads, including those that fail the bounds check — the
 * block-wide insert protocol of vector_sparse presumably requires full-block
 * participation, so the early return sits between them (TODO confirm).
 */
template<typename vector_sparse, typename vector_cell>
__global__ void fill_cells_sparse(vector_sparse vs, vector_cell vc)
{
vs.init();
int p = blockIdx.x*blockDim.x + threadIdx.x;
if (p >= vc.size()) {return;}
// cell id of element p
int c = vc.template get<0>(p);
vs.template insert<0>(c) = p;
vs.flush_block_insert();
}
template<unsigned int dim, typename pos_type, typename cnt_type, typename ids_type, typename transform>
__global__ void subindex_without_count(openfpm::array<ids_type,dim,cnt_type> div,
openfpm::array<pos_type,dim,cnt_type> spacing,
openfpm::array<ids_type,dim,cnt_type> off,
transform t,
......@@ -251,6 +321,7 @@ __global__ void fill_cells(cnt_type phase_id ,
openfpm::array<ids_type,dim,cnt_type> off,
cnt_type n,
cnt_type n_cap,
cnt_type start_p,
const cnt_type *starts,
const ids_type * p_ids,
cnt_type *cells)
......@@ -270,7 +341,7 @@ __global__ void fill_cells(cnt_type phase_id ,
start = starts[cid];
id = start + e[dim];
cells[id] = encode_phase_id<cnt_type,ph>(phase_id,i);
cells[id] = encode_phase_id<cnt_type,ph>(phase_id,i + start_p);
}
......@@ -338,6 +409,64 @@ __global__ void mark_domain_particles(vector_sort_index vsi, vector_out_type vou
}
/*! \brief Kernel: count, for each sparse cell, how many neighborhood offsets
 *         land on a cell that exists in the sparse cell-list
 *
 * One thread per sparse-cell entry. For each offset in nn_to_test the
 * neighbor cell id is probed; existing neighbors (value != -1) increment
 * output[tid].
 *
 * \param cl_sparse sparse cell-list (kernel view)
 * \param output per-cell neighbor count, accumulated into property 0
 * \param nn_to_test list of relative cell offsets to probe
 */
template<typename cl_sparse_type, typename vector_type, typename vector_type2>
__global__ void count_nn_cells(cl_sparse_type cl_sparse, vector_type output, vector_type2 nn_to_test)
{
	typedef typename cl_sparse_type::index_type index_type;

	int tid = threadIdx.x + blockIdx.x * blockDim.x;

	// Launch grids are rounded up to whole blocks: discard excess threads
	// before touching cl_sparse (the sibling kernel fill_nn_cells performs
	// the same guard; it was missing here).
	if (tid >= cl_sparse.size())	{return;}

	openfpm::sparse_index<index_type> id(tid);
	index_type cell = cl_sparse.get_index(id);

	for (int i = 0 ; i < nn_to_test.size() ; i++)
	{
		index_type cell_n = cell + nn_to_test.template get<0>(i);

		index_type start = cl_sparse.template get<0>(cell_n);

		if (start != (index_type)-1)
		{
			// Cell exist
			output.template get<0>(tid) += 1;
		}
	}
}
/*! \brief Kernel: for each sparse cell, emit the [start,stop) ranges of its
 *         existing neighbor cells
 *
 * One thread per sparse-cell entry. For every offset in nn_to_test that hits
 * an existing cell, the neighbor's start index is written to output<0> and
 * its stop index to output<1>, at position starts<0>(tid) + cnt (cnt counts
 * the neighbors found so far for this cell). The stop of the last sparse
 * entry has no successor, so max_stop is used instead.
 */
template<typename cl_sparse_type, typename vector_type, typename vector_type2, typename vector_type3>
__global__ void fill_nn_cells(cl_sparse_type cl_sparse, vector_type starts, vector_type2 nn_to_test, vector_type3 output, typename cl_sparse_type::index_type max_stop)
{
typedef typename cl_sparse_type::index_type index_type;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
openfpm::sparse_index<index_type> id(tid);
// discard excess threads of the rounded-up launch grid
if (tid >= cl_sparse.size()) {return;}
index_type cell = cl_sparse.get_index(id);
// number of existing neighbors found so far (output slot within this cell's range)
int cnt = 0;
for (int i = 0 ; i < nn_to_test.size() ; i++)
{
index_type cell_n = cell + nn_to_test.template get<0>(i);
// sparse lookup: sid.id == -1 means the neighbor cell does not exist
auto sid = cl_sparse.get_sparse(cell_n);
if (sid.id != (index_type)-1)
{
index_type start = cl_sparse.template get<0>(sid);
// Cell exist
output.template get<0>(starts.template get<0>(tid) + cnt) = start;
// stop = start of the next sparse entry, or max_stop for the last one
if (sid.id == cl_sparse.size() - 1)
{output.template get<1>(starts.template get<0>(tid) + cnt) = max_stop;}
else
{output.template get<1>(starts.template get<0>(tid) + cnt) = cl_sparse.template get<0>(decltype(sid)(sid.id+1));}
++cnt;
}
}
};
template<typename T>
struct to_type4
{
......
#include "config.h"
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
#include "NN/CellList/cuda/CellDecomposer_gpu_ker.cuh"
#include "Space/Shape/Point.hpp"
#include "Vector/map_vector.hpp"
/*! \brief Test kernel: store the cell key of point p into vd at position id
 *
 * Launched with a single thread; used to exercise
 * CellDecomposer_gpu_ker::getCell() on the device.
 */
template<typename vector_type, typename celldec>
__global__ void check(vector_type vd, celldec cd, unsigned int id, Point<3,float> p)
{
vd.template get<0>(id) = cd.getCell(p);
}
BOOST_AUTO_TEST_SUITE( CellDecomposer_gpu_test_suite )
// Check that CellDecomposer_gpu_ker maps points to the expected cell keys on
// the device. With spacing 0.1, 10 divisions and offset 2 (padding cells per
// side), the expected per-axis index appears to be floor(x/0.1) + 2 — the
// asserted values below are all consistent with that (e.g. 0.2 -> 4),
// including points outside [0,1] which land in the padding cells.
BOOST_AUTO_TEST_CASE( CellDecomposer_gpu_test_use )
{
//! Spacing
openfpm::array<float,3,unsigned int> spacing_c = {0.1,0.1,0.1};
//! \brief number of sub-divisions in each direction
openfpm::array<unsigned int,3,unsigned int> div_c = {10,10,10};
//! \brief cell offset
openfpm::array<unsigned int,3,unsigned int> off = {2,2,2};
// identity transform with zero shift
Point<3,float> trans({0.0,0.0,0.0});
shift_only<3,float> t(Matrix<3,float>::identity(),trans);
CellDecomposer_gpu_ker<3,float,unsigned int, unsigned int,shift_only<3,float>> clk(spacing_c,div_c,off,t);
openfpm::vector_gpu<aggregate<grid_key_dx<3,unsigned int>>> output(8);
// one single-thread launch per probe point
check<<<1,1>>>(output.toKernel(),clk,0,Point<3,float>({0.2,0.2,0.2}));
check<<<1,1>>>(output.toKernel(),clk,1,Point<3,float>({0.1,0.2,0.3}));
check<<<1,1>>>(output.toKernel(),clk,2,Point<3,float>({0.25,0.55,0.45}));
check<<<1,1>>>(output.toKernel(),clk,3,Point<3,float>({0.15,0.15,0.95}));
check<<<1,1>>>(output.toKernel(),clk,4,Point<3,float>({1.05,1.05,1.05}));
check<<<1,1>>>(output.toKernel(),clk,5,Point<3,float>({1.15,1.15,1.15}));
check<<<1,1>>>(output.toKernel(),clk,6,Point<3,float>({-0.05,-0.05,-0.05}));
check<<<1,1>>>(output.toKernel(),clk,7,Point<3,float>({-0.15,-0.15,-0.15}));
output.template deviceToHost<0>();
// (0.2,0.2,0.2) -> (4,4,4)
grid_key_dx<3,unsigned int> k = output.template get<0>(0);
BOOST_REQUIRE_EQUAL(k.get(0),4);
BOOST_REQUIRE_EQUAL(k.get(1),4);
BOOST_REQUIRE_EQUAL(k.get(2),4);
// (0.1,0.2,0.3) -> (3,4,5)
k = output.template get<0>(1);
BOOST_REQUIRE_EQUAL(k.get(0),3);
BOOST_REQUIRE_EQUAL(k.get(1),4);
BOOST_REQUIRE_EQUAL(k.get(2),5);
// (0.25,0.55,0.45) -> (4,7,6)
k = output.template get<0>(2);
BOOST_REQUIRE_EQUAL(k.get(0),4);
BOOST_REQUIRE_EQUAL(k.get(1),7);
BOOST_REQUIRE_EQUAL(k.get(2),6);
// (0.15,0.15,0.95) -> (3,3,11)
k = output.template get<0>(3);
BOOST_REQUIRE_EQUAL(k.get(0),3);
BOOST_REQUIRE_EQUAL(k.get(1),3);
BOOST_REQUIRE_EQUAL(k.get(2),11);
// points beyond the upper boundary fall in the upper padding cells
k = output.template get<0>(4);
BOOST_REQUIRE_EQUAL(k.get(0),12);
BOOST_REQUIRE_EQUAL(k.get(1),12);
BOOST_REQUIRE_EQUAL(k.get(2),12);
k = output.template get<0>(5);
BOOST_REQUIRE_EQUAL(k.get(0),13);
BOOST_REQUIRE_EQUAL(k.get(1),13);
BOOST_REQUIRE_EQUAL(k.get(2),13);
// negative points fall in the lower padding cells
k = output.template get<0>(6);
BOOST_REQUIRE_EQUAL(k.get(0),1);
BOOST_REQUIRE_EQUAL(k.get(1),1);
BOOST_REQUIRE_EQUAL(k.get(2),1);
k = output.template get<0>(7);
BOOST_REQUIRE_EQUAL(k.get(0),0);
BOOST_REQUIRE_EQUAL(k.get(1),0);
BOOST_REQUIRE_EQUAL(k.get(2),0);
}
BOOST_AUTO_TEST_SUITE_END()
......@@ -356,7 +356,7 @@ template<unsigned int dim ,typename T> class Point
for (size_t i = 0 ; i < dim ; i++)
{
if (p.get(i) != get(i))
return false;
{return false;}
}
return true;
......
......@@ -460,6 +460,11 @@ BOOST_AUTO_TEST_CASE( vector_cuda_copy )
match = v2.template get<2>(p)[2][1] == p + 15000;
match = v2.template get<2>(p)[2][2] == p + 16000;
if (match == false)
{
std::cout << v2.template get<0>(p) << std::endl;
}
++ite;
}
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -39,6 +39,7 @@
#include "util/cuda_util.hpp"
#include "util/cuda/cuda_launch.hpp"
#include "cuda/map_vector_cuda_ker.cuh"
#include "map_vector_printers.hpp"
#include "util/cuda/cuda_launch.hpp"
namespace openfpm
......@@ -145,6 +146,7 @@ namespace openfpm
}
};
/*! \brief Implementation of 1-D std::vector like structure
*
* Stub object look at the various implementations
......@@ -1135,6 +1137,19 @@ namespace openfpm
dup.v_size = v_size;
dup.base.swap(base.duplicate());
// copy the device part
// and device
if (Memory::isDeviceHostSame() == false)
{
#ifdef __NVCC__
if (dup.size() != 0)
{
auto it = dup.getGPUIterator();
CUDA_LAUNCH(copy_two_vectors,it,dup.toKernel(),toKernel());
}
#endif
}
return dup;
}
......@@ -1302,8 +1317,7 @@ namespace openfpm
#endif
v_size = mv.v_size;
size_t rsz[1] = {v_size};
if (base.size() < v_size)
base.resize(rsz);
base.resize(rsz);
// copy the object on cpu
for (size_t i = 0 ; i < v_size ; i++ )
......@@ -1373,6 +1387,18 @@ namespace openfpm
base.set(key,mv.getInternal_base(),key);
}
// and device
if (Memory::isDeviceHostSame() == false && Mem::isDeviceHostSame() == false)
{
#ifdef __NVCC__
if (mv.size() != 0)
{
auto it = mv.getGPUIterator();
CUDA_LAUNCH(copy_two_vectors,it,toKernel(),mv.toKernel());
}
#endif
}
return *this;
}
......@@ -1427,6 +1453,18 @@ namespace openfpm
base.set_general(key,mv.getInternal_base(),key);
}
// and device
if (Memory::isDeviceHostSame() == false && Mem::isDeviceHostSame() == false)
{
#ifdef __NVCC__
if (mv.size() != 0)
{
auto it = mv.getGPUIterator();
CUDA_LAUNCH(copy_two_vectors,it,toKernel(),mv.toKernel());
}
#endif
}
return *this;
}
......@@ -1823,6 +1861,38 @@ namespace openfpm
return v;
}
/*! Convert this vector into a string
*
* \param prefix prefix to use for printing
*
* \return a string showing thid vector
*
*/
template<unsigned int ... prps>
const std::string toString(std::string prefix = std::string())
{
std::stringstream ss;
auto it = getIterator();
while (it.isNext())
{
auto p = it.get();
ss << prefix;
ss << prefix << " element[" << p << "]" << " ";
vector_printer<self_type,prps ...> vp(*this,p,ss);
boost::mpl::for_each_ref<boost::mpl::range_c<int,0,sizeof...(prps)>>(vp);
ss << std::endl;
++it;
}
return ss.str();
}
void * internal_get_size_pointer() {return &v_size;}
void print_size()
......
/*
 * map_vector_printers.hpp
 *
 * Created on: Feb 10, 2019
 * Author: i-bird
 */
#ifndef MAP_VECTOR_PRINTERS_HPP_
#define MAP_VECTOR_PRINTERS_HPP_

// required for std::stringstream: make the header self-contained
#include <sstream>

/*! \brief this class is a functor for "for_each" algorithm
 *
 * This class is a functor for "for_each" algorithm. For each
 * element of the boost::vector the operator() is called.
 * Is mainly used to print the elements of the vector
 *
 * \tparam vector_type vector whose element is printed
 * \tparam prp properties of the element to print
 *
 */
template<typename vector_type, unsigned int ... prp>
struct vector_printer
{
	//! index of the element to print
	size_t & ele;

	//! vector to print
	vector_type & vt;

	//! stringstream the properties are appended to
	std::stringstream & ss;

	//! boost::mpl vector of the properties to print
	typedef typename to_boost_vmpl<prp...>::type vprp;

	/*! \brief constructor
	 *
	 * \param vt vector to print
	 * \param ele index of the element to print
	 * \param ss output stringstream
	 *
	 */
	inline vector_printer(vector_type & vt, size_t & ele, std::stringstream & ss)
	// initializer list follows the member declaration order (ele, vt, ss):
	// members are always initialized in declaration order, and matching it
	// avoids the -Wreorder warning the previous order triggered
	:ele(ele),vt(vt),ss(ss)
	{};

	//! Append property T::value of element ele to ss, followed by a space
	template<typename T>
	inline void operator()(T& t) const
	{
		ss << vt.template get<T::value>(ele) << " ";
	}
};

#endif /* MAP_VECTOR_PRINTERS_HPP_ */
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -19,7 +19,7 @@
* \tparam T type to copy
*
*/
template<typename T, unsigned int agg=2 * is_openfpm_native<T>::value>
template<typename T, unsigned int agg=2 * is_aggregate<T>::value>
struct compare_general
{
/*! \brief Spacialization when there is unknown compare method
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.