Commit cb3566e1 authored by incardon's avatar incardon

GPU moving on

parent 3d995585
SUBDIRS = src images openfpm_data openfpm_io openfpm_devices openfpm_vcluster openfpm_numerics SUBDIRS = src images openfpm_data openfpm_io openfpm_devices openfpm_vcluster openfpm_numerics
ACLOCAL_AMFLAGS = -I m4
bin_PROGRAMS = bin_PROGRAMS =
pdata: pdata:
...@@ -20,6 +22,8 @@ io: ...@@ -20,6 +22,8 @@ io:
numerics: numerics:
cd openfpm_numerics/src && make cd openfpm_numerics/src && make
actual_test:
cd src && make actual_test
......
...@@ -8,6 +8,7 @@ AC_INIT(OpenFPM_pdata, 1.0.0, BUG-REPORT-ADDRESS) ...@@ -8,6 +8,7 @@ AC_INIT(OpenFPM_pdata, 1.0.0, BUG-REPORT-ADDRESS)
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM
AC_CONFIG_SRCDIR([src/main.cpp]) AC_CONFIG_SRCDIR([src/main.cpp])
AC_CONFIG_SUBDIRS([openfpm_data openfpm_devices openfpm_vcluster openfpm_io openfpm_numerics]) AC_CONFIG_SUBDIRS([openfpm_data openfpm_devices openfpm_vcluster openfpm_io openfpm_numerics])
AC_CONFIG_MACRO_DIRS([m4])
#### Adding --with-pdata option and openfpm_pdata to prefix folder #### Adding --with-pdata option and openfpm_pdata to prefix folder
...@@ -22,7 +23,7 @@ ac_configure_args="$ac_configure_args --with-pdata=../../src" ...@@ -22,7 +23,7 @@ ac_configure_args="$ac_configure_args --with-pdata=../../src"
######################## ########################
AM_INIT_AUTOMAKE([subdir-objects]) AM_INIT_AUTOMAKE([subdir-objects foreign])
AC_CONFIG_HEADER([src/config/config.h]) AC_CONFIG_HEADER([src/config/config.h])
m4_ifdef([ACX_PTHREAD],,[m4_include([m4/acx_pthread.m4])]) m4_ifdef([ACX_PTHREAD],,[m4_include([m4/acx_pthread.m4])])
m4_ifdef([ACX_MPI],,[m4_include([m4/acx_mpi.m4])]) m4_ifdef([ACX_MPI],,[m4_include([m4/acx_mpi.m4])])
...@@ -144,7 +145,7 @@ have_quad_head=no ...@@ -144,7 +145,7 @@ have_quad_head=no
AC_CHECK_LIB(quadmath, sinq, [have_quad_lib=yes], []) AC_CHECK_LIB(quadmath, sinq, [have_quad_lib=yes], [])
AC_CHECK_HEADER(quadmath.h,[have_quad_head=yes],[]) AC_CHECK_HEADER(quadmath.h,[have_quad_head=yes],[])
if [x"have_quad_math" == x"yes" $&& x"have_quad_math" == x"yes" ]; then if test x"have_quad_math" == x"yes"; then
AC_DEFINE(HAVE_LIBQUADMATH,[],[Have quad math lib]) AC_DEFINE(HAVE_LIBQUADMATH,[],[Have quad math lib])
LIBQUADMATH=" -lquadmath " LIBQUADMATH=" -lquadmath "
fi fi
......
...@@ -1444,7 +1444,7 @@ int main(int argc, char* argv[]) ...@@ -1444,7 +1444,7 @@ int main(int argc, char* argv[])
// calculate the pressure at the sensor points // calculate the pressure at the sensor points
sensor_pressure(vd,NN,press_t,probes); sensor_pressure(vd,NN,press_t,probes);
vd.write("Geometry",write); vd.write_frame("Geometry",write);
write++; write++;
if (v_cl.getProcessUnitID() == 0) if (v_cl.getProcessUnitID() == 0)
......
...@@ -2,22 +2,22 @@ LINKLIBS = $(HDF5_LDFLAGS) $(HDF5_LIBS) $(OPENMP_LDFLAGS) $(LIBHILBERT_LIB) $(P ...@@ -2,22 +2,22 @@ LINKLIBS = $(HDF5_LDFLAGS) $(HDF5_LIBS) $(OPENMP_LDFLAGS) $(LIBHILBERT_LIB) $(P
noinst_PROGRAMS = cart_dec metis_dec dom_box vector_dist noinst_PROGRAMS = cart_dec metis_dec dom_box vector_dist
cart_dec_SOURCES = CartDecomposition_gen_vtk.cpp ../src/lib/pdata.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp cart_dec_SOURCES = CartDecomposition_gen_vtk.cpp ../src/lib/pdata.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
cart_dec_CXXFLAGS = $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(PETSC_INCLUDE) $(METIS_INCLUDE) $(PARMETIS_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(BOOST_CPPFLAGS) -I../src -Wno-unused-function -Wno-unused-local-typedefs cart_dec_CXXFLAGS = -Wno-unknown-pragmas $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(PETSC_INCLUDE) $(METIS_INCLUDE) $(PARMETIS_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(BOOST_CPPFLAGS) -I../src -Wno-unused-function -Wno-unused-local-typedefs
cart_dec_CFLAGS = $(OPENMP_CFLAGS) $(CUDA_CFLAGS) cart_dec_CFLAGS = $(OPENMP_CFLAGS) $(CUDA_CFLAGS)
cart_dec_LDADD = $(LINKLIBS) -lparmetis -lmetis cart_dec_LDADD = $(LINKLIBS) -lparmetis -lmetis
metis_dec_SOURCES = Metis_gen_vtk.cpp ../src/lib/pdata.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp metis_dec_SOURCES = Metis_gen_vtk.cpp ../src/lib/pdata.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
metis_dec_CXXFLAGS = $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(PETSC_INCLUDE) $(METIS_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(BOOST_CPPFLAGS) -I../src -Wno-unused-function -Wno-unused-local-typedefs metis_dec_CXXFLAGS = -Wno-unknown-pragmas $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(PETSC_INCLUDE) $(METIS_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(BOOST_CPPFLAGS) -I../src -Wno-unused-function -Wno-unused-local-typedefs
metis_dec_CFLAGS = $(OPENMP_CFLAGS) $(CUDA_CFLAGS) metis_dec_CFLAGS = $(OPENMP_CFLAGS) $(CUDA_CFLAGS)
metis_dec_LDADD = $(LINKLIBS) -lmetis metis_dec_LDADD = $(LINKLIBS) -lmetis
dom_box_SOURCES = domain_gen_vtk.cpp ../src/lib/pdata.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp dom_box_SOURCES = domain_gen_vtk.cpp ../src/lib/pdata.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
dom_box_CXXFLAGS = $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(PETSC_INCLUDE) $(METIS_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(BOOST_CPPFLAGS) -I../src -Wno-unused-function -Wno-unused-local-typedefs dom_box_CXXFLAGS = -Wno-unknown-pragmas $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(PETSC_INCLUDE) $(METIS_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(BOOST_CPPFLAGS) -I../src -Wno-unused-function -Wno-unused-local-typedefs
dom_box_CFLAGS = $(OPENMP_CFLAGS) $(CUDA_CFLAGS) dom_box_CFLAGS = $(OPENMP_CFLAGS) $(CUDA_CFLAGS)
dom_box_LDADD = $(LINKLIBS) dom_box_LDADD = $(LINKLIBS)
vector_dist_SOURCES = vector.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_devices/src/Memleak_check.cpp vector_dist_SOURCES = vector.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_devices/src/Memleak_check.cpp
vector_dist_CXXFLAGS = $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(LIBHILBERT_INCLUDE) $(PETSC_INCLUDE) $(PARMETIS_INCLUDE) $(METIS_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(HDF5_CPPFLAGS) $(BOOST_CPPFLAGS) -I../src -Wno-unused-function -Wno-unused-local-typedefs vector_dist_CXXFLAGS = -Wno-unknown-pragmas $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(LIBHILBERT_INCLUDE) $(PETSC_INCLUDE) $(PARMETIS_INCLUDE) $(METIS_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(HDF5_CPPFLAGS) $(BOOST_CPPFLAGS) -I../src -Wno-unused-function -Wno-unused-local-typedefs
vector_dist_CFLAGS = $(OPENMP_CFLAGS) $(CUDA_CFLAGS) vector_dist_CFLAGS = $(OPENMP_CFLAGS) $(CUDA_CFLAGS)
vector_dist_LDADD = $(LINKLIBS) -lparmetis -lmetis vector_dist_LDADD = $(LINKLIBS) -lparmetis -lmetis
......
openfpm_data @ 66fdd0bd
Subproject commit e96170649cc5257d6be5b937372b281e6e5d74a8 Subproject commit 66fdd0bdd308bd78efe3c14a7d2bf9290e35f550
openfpm_vcluster @ ad71d154
Subproject commit 9602917fecc1e84f7a837fcd69a4bd98d572b4ba Subproject commit ad71d154d0a8420816f49b63dfa1f43084b13022
This diff is collapsed.
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
#include "Space/Ghost.hpp" #include "Space/Ghost.hpp"
#include "Decomposition/nn_processor.hpp" #include "Decomposition/nn_processor.hpp"
template<unsigned int dim, typename T, typename Memory = HeapMemory, typename Distribution = ParMetisDistribution<dim, T>> template<unsigned int dim, typename T, typename Memory = HeapMemory, template<typename> class layout_base = memory_traits_lin, typename Distribution = ParMetisDistribution<dim, T>>
class CartDecomposition; class CartDecomposition;
/** /**
...@@ -38,8 +38,8 @@ class CartDecomposition; ...@@ -38,8 +38,8 @@ class CartDecomposition;
* *
*/ */
template<unsigned int dim, typename T, typename Memory = HeapMemory, typename Distribution = ParMetisDistribution<dim, T>> template<unsigned int dim, typename T, typename Memory = HeapMemory, template<typename> class layout_base = memory_traits_lin, typename Distribution = ParMetisDistribution<dim, T>>
class CartDecomposition_ext: public CartDecomposition<dim,T,Memory,Distribution> class CartDecomposition_ext: public CartDecomposition<dim,T,Memory,layout_base,Distribution>
{ {
private: private:
...@@ -51,7 +51,7 @@ private: ...@@ -51,7 +51,7 @@ private:
* \param ext_dom Extended domain * \param ext_dom Extended domain
* *
*/ */
void extend_subdomains(const CartDecomposition<dim,T,Memory,Distribution> & dec, const ::Box<dim,T> & ext_dom) void extend_subdomains(const CartDecomposition<dim,T,Memory,layout_base,Distribution> & dec, const ::Box<dim,T> & ext_dom)
{ {
// Box // Box
typedef ::Box<dim,T> b; typedef ::Box<dim,T> b;
...@@ -168,12 +168,12 @@ public: ...@@ -168,12 +168,12 @@ public:
* *
*/ */
CartDecomposition_ext(Vcluster & v_cl) CartDecomposition_ext(Vcluster & v_cl)
:CartDecomposition<dim,T,Memory,Distribution>(v_cl) :CartDecomposition<dim,T,Memory,layout_base,Distribution>(v_cl)
{ {
} }
//! The non-extended decomposition base class //! The non-extended decomposition base class
typedef CartDecomposition<dim,T,Memory,Distribution> base_type; typedef CartDecomposition<dim,T,Memory,layout_base,Distribution> base_type;
/*! \brief It create another object that contain the same decomposition information but with different ghost boxes and an extended domain /*! \brief It create another object that contain the same decomposition information but with different ghost boxes and an extended domain
* *
...@@ -216,7 +216,7 @@ public: ...@@ -216,7 +216,7 @@ public:
* \return a duplicated decomposition with different ghost boxes and an extended domain * \return a duplicated decomposition with different ghost boxes and an extended domain
* *
*/ */
void setParameters(const CartDecomposition<dim,T,Memory,Distribution> & dec, const Ghost<dim,T> & g, const ::Box<dim,T> & ext_domain) void setParameters(const CartDecomposition<dim,T,Memory,layout_base,Distribution> & dec, const Ghost<dim,T> & g, const ::Box<dim,T> & ext_domain)
{ {
this->box_nn_processor = dec.box_nn_processor; this->box_nn_processor = dec.box_nn_processor;
......
...@@ -115,14 +115,16 @@ struct Box_sub_k ...@@ -115,14 +115,16 @@ struct Box_sub_k
template<unsigned int dim,typename T> template<unsigned int dim,typename T>
struct Box_map struct Box_map
{ {
Box<dim,T> box; typedef boost::fusion::vector<Box<dim,T>,long int> type;
long int prc; type data;
static bool noPointers() static bool noPointers()
{ {
return true; return true;
} }
static const unsigned int max_prop = 2;
}; };
//! Case for local ghost box //! Case for local ghost box
......
/*
* CartDecomposition_gpu.hpp
*
* Created on: Aug 7, 2018
* Author: i-bird
*/
#ifndef CARTDECOMPOSITION_GPU_HPP_
#define CARTDECOMPOSITION_GPU_HPP_
#ifdef __NVCC__
template<typename cartdec_gpu, typename particles_type, typename vector_out>
__global__ void process_id_proc_each_part(cartdec_gpu cdg, particles_type parts, vector_out output , int rank)
{
int p = threadIdx.x + blockIdx.x * blockDim.x;
if (p >= parts.size()) return;
Point<3,float> xp = parts.template get<0>(p);
int pr = cdg.processorIDBC(xp);
output.template get<1>(p) = (pr == rank)?-1:pr;
output.template get<0>(p) = p;
}
#endif
template<typename T2, typename fine_s_type, typename vsub_domain_type>
__device__ __host__ inline int processorID_impl(T2 & p, fine_s_type & fine_s, vsub_domain_type & sub_domains_global)
{
// Get the number of elements in the cell
int e = -1;
int cl = fine_s.getCell(p);
int n_ele = fine_s.getNelements(cl);
for (int i = 0 ; i < n_ele ; i++)
{
e = fine_s.get(cl,i);
if (sub_domains_global.template get<0>(e).isInsideNP(p) == true)
{
break;
}
}
#if defined(SE_CLASS1) && !defined(__NVCC__)
if (n_ele == 0)
{
std::cout << __FILE__ << ":" << __LINE__ << " I cannot detect in which processor this particle go" << std::endl;
return -1;
}
#endif
return sub_domains_global.template get<1>(e);
}
template<unsigned int dim, typename T, typename Memory, template <typename> class layout_base>
class CartDecomposition_gpu
{
CellList_cpu_ker<dim,T,Mem_fast_ker<Memory,memory_traits_lin,int>,shift<dim,T>> clk;
Box<dim,T> domain;
int bc[dim];
openfpm::vector_gpu_ker<Box_map<dim, T>,layout_base> sub_domains_global;
/*! \brief Apply boundary condition to the point
*
* If the particle go out to the right, bring back the particle on the left
* in case of periodic, nothing in case of non periodic
*
* \param pt Point to apply the boundary conditions.(it's coordinated are changed according the
* the explanation before)
*
*/
__device__ void applyPointBC(Point<dim,T> & pt) const
{
for (int i = 0 ; i < dim ; i++)
{
if (bc[i] == PERIODIC)
{pt.get(i) = openfpm::math::periodic_l(pt.get(i),domain.getHigh(i),domain.getLow(i));}
}
}
public:
CartDecomposition_gpu(CellList_cpu_ker<dim,T,Mem_fast_ker<Memory,memory_traits_lin,int>,shift<dim,T>> clk,
openfpm::vector_gpu_ker<Box_map<dim, T>,layout_base> sub_domains_global,
const Box<dim,T> & domain,
const int (& bc)[dim])
:clk(clk),domain(domain),sub_domains_global(sub_domains_global)
{
for (int s = 0 ; s < dim ; s++)
{this->bc[s] = bc[s];}
}
CartDecomposition_gpu(const CartDecomposition_gpu<dim,T,Memory,layout_base> & dec)
:clk(dec.clk),domain(dec.domain)
{
for (int s = 0 ; s < dim ; s++)
{this->bc[s] = dec.bc[s];}
}
/*! \brief Given a point return in which processor the point/particle should go
*
* Boundary conditions are considered
*
* \param p point
*
* \return processorID
*
*/
__device__ int inline processorIDBC(const Point<dim,T> & p)
{
Point<dim,T> pt = p;
this->applyPointBC(pt);
return processorID_impl(pt,clk,sub_domains_global);
}
};
#endif /* CARTDECOMPOSITION_GPU_HPP_ */
...@@ -8,13 +8,15 @@ ...@@ -8,13 +8,15 @@
BOOST_AUTO_TEST_SUITE( decomposition_to_gpu_test ) BOOST_AUTO_TEST_SUITE( decomposition_to_gpu_test )
BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use ) BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use )
{ {
auto & v_cl = create_vcluster();
// Vcluster // Vcluster
Vcluster & vcl = create_vcluster(); Vcluster & vcl = create_vcluster();
//! [Create CartDecomposition] CartDecomposition<3, float, CudaMemory, memory_traits_inte> dec(vcl);
CartDecomposition<3, float> dec(vcl);
// Physical domain // Physical domain
Box<3, float> box( { 0.0, 0.0, 0.0 }, { 1.0, 1.0, 1.0 }); Box<3, float> box( { 0.0, 0.0, 0.0 }, { 1.0, 1.0, 1.0 });
...@@ -39,7 +41,38 @@ BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use ) ...@@ -39,7 +41,38 @@ BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use )
dec.setParameters(div,box,bc,g); dec.setParameters(div,box,bc,g);
dec.decompose(); dec.decompose();
dec.toKernel() openfpm::vector_gpu<Point<3,float>> vg;
vg.resize(10000);
for (size_t i = 0 ; i < 10000 ; i++)
{
vg.template get<0>(i)[0] = (float)rand()/RAND_MAX;
vg.template get<0>(i)[1] = (float)rand()/RAND_MAX;
vg.template get<0>(i)[2] = (float)rand()/RAND_MAX;
}
vg.hostToDevice<0>();
// process on GPU the processor ID for each particles
auto ite = vg.getGPUIterator();
openfpm::vector_gpu<aggregate<int,int>> proc_id_out;
proc_id_out.resize(vg.size());
process_id_proc_each_part<decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel())>
<<<ite.wthr,ite.thr>>>
(dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),v_cl.rank());
proc_id_out.deviceToHost<0>();
bool match = true;
for (size_t i = 0 ; i < proc_id_out.size() ; i++)
{
Point<3,float> xp = vg.template get<0>(i);
match &= proc_id_out.template get<0>(i) == dec.processorIDBC(xp);
}
} }
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()
LINKLIBS = $(HDF5_LDFLAGS) $(HDF5_LIBS) $(OPENMP_LDFLAGS) $(LIBHILBERT_LIB) $(METIS_LIB) $(PTHREAD_LIBS) $(OPT_LIBS) $(BOOST_LDFLAGS) $(BOOST_IOSTREAMS_LIB) $(CUDA_LIBS) $(PETSC_LIB) $(SUITESPARSE_LIBS) $(LAPACK_LIBS) $(BLAS_LIBS) $(PARMETIS_LIB) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(BOOST_CHRONO_LIB) $(BOOST_TIMER_LIB) $(BOOST_SYSTEM_LIB) $(LIBIFCORE) LINKLIBS = $(HDF5_LDFLAGS) $(HDF5_LIBS) $(OPENMP_LDFLAGS) $(LIBHILBERT_LIB) $(METIS_LIB) $(PTHREAD_LIBS) $(OPT_LIBS) $(BOOST_LDFLAGS) $(BOOST_IOSTREAMS_LIB) $(CUDA_LIBS) $(PETSC_LIB) $(SUITESPARSE_LIBS) $(LAPACK_LIBS) $(BLAS_LIBS) $(PARMETIS_LIB) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(BOOST_CHRONO_LIB) $(BOOST_TIMER_LIB) $(BOOST_SYSTEM_LIB) $(LIBIFCORE)
FLAGS_NVCC = $(NVCCFLAGS) $(INCLUDES_PATH) $(HDF5_CPPFLAGS) $(BOOST_CPPFLAGS) $(MPI_INC_PATH) $(PETSC_INCLUDE) $(LIBHILBERT_INCLUDE) $(PARMETIS_INCLUDE) $(METIS_INCLUDE) -g --expt-extended-lambda FLAGS_NVCC = -Xcudafe "--display_error_number --diag_suppress=2885 --diag_suppress=2887 --diag_suppress=2888 --diag_suppress=186 --diag_suppress=111" $(NVCCFLAGS) $(INCLUDES_PATH) $(HDF5_CPPFLAGS) $(BOOST_CPPFLAGS) $(MPI_INC_PATH) $(PETSC_INCLUDE) $(LIBHILBERT_INCLUDE) $(PARMETIS_INCLUDE) $(METIS_INCLUDE) -g --expt-extended-lambda
noinst_PROGRAMS = pdata noinst_PROGRAMS = pdata actual_test
pdata_SOURCES = main.cpp Decomposition/cuda/decomposition_cuda_tests.cpp Vector/vector_dist_gpu_unit_tests.cu Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp Grid/tests/grid_dist_id_unit_test.cpp Grid/tests/staggered_grid_dist_unit_test.cpp Vector/tests/vector_dist_cell_list_tests.cpp Vector/tests/vector_dist_complex_prp_unit_test.cpp Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp Vector/tests/vector_dist_MP_unit_tests.cpp Vector/tests/vector_dist_NN_tests.cpp Vector/tests/vector_dist_unit_test.cpp pdata_performance.cpp Decomposition/tests/CartDecomposition_unit_test.cpp Decomposition/tests/shift_vect_converter_tests.cpp Vector/performance/vector_dist_performance_util.cpp lib/pdata.cpp test_multiple_o.cpp ../openfpm_devices/src/memory/CudaMemory.cu ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp pdata_SOURCES = main.cpp Vector/cuda/vector_dist_cuda_func_test.cu Decomposition/cuda/decomposition_cuda_tests.cu Vector/vector_dist_gpu_unit_tests.cu Grid/tests/grid_dist_id_HDF5_chckpnt_restart_test.cpp Grid/tests/grid_dist_id_unit_test.cpp Grid/tests/staggered_grid_dist_unit_test.cpp Vector/tests/vector_dist_cell_list_tests.cpp Vector/tests/vector_dist_complex_prp_unit_test.cpp Vector/tests/vector_dist_HDF5_chckpnt_restart_test.cpp Vector/tests/vector_dist_MP_unit_tests.cpp Vector/tests/vector_dist_NN_tests.cpp Vector/tests/vector_dist_unit_test.cpp pdata_performance.cpp Decomposition/tests/CartDecomposition_unit_test.cpp Decomposition/tests/shift_vect_converter_tests.cpp Vector/performance/vector_dist_performance_util.cpp lib/pdata.cpp test_multiple_o.cpp ../openfpm_devices/src/memory/CudaMemory.cu ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
pdata_CXXFLAGS = $(BOOST_CPPFLAGS) $(HDF5_CPPFLAGS) $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(LIBHILBERT_INCLUDE) $(PETSC_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(PARMETIS_INCLUDE) $(METIS_INCLUDE) $(H5PART_INCLUDE) -DPARALLEL_IO -Wno-unused-local-typedefs pdata_CXXFLAGS = -Wno-unknown-pragmas $(BOOST_CPPFLAGS) $(HDF5_CPPFLAGS) $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(LIBHILBERT_INCLUDE) $(PETSC_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(PARMETIS_INCLUDE) $(METIS_INCLUDE) $(H5PART_INCLUDE) -DPARALLEL_IO -Wno-unused-local-typedefs
pdata_CFLAGS = $(CUDA_CFLAGS) pdata_CFLAGS = $(CUDA_CFLAGS)
pdata_LDADD = $(LINKLIBS) -lparmetis -lmetis pdata_LDADD = $(LINKLIBS) -lparmetis -lmetis
actual_test_SOURCES = Vector/cuda/vector_dist_cuda_func_test.cu Vector/vector_dist_gpu_unit_tests.cu vector_ main_single.cpp lib/pdata.cpp test_multiple_o.cpp ../openfpm_devices/src/memory/CudaMemory.cu ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
actual_test_CXXFLAGS = -Wno-unknown-pragmas $(BOOST_CPPFLAGS) $(HDF5_CPPFLAGS) $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(LIBHILBERT_INCLUDE) $(PETSC_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(PARMETIS_INCLUDE) $(METIS_INCLUDE) $(H5PART_INCLUDE) -DPARALLEL_IO -Wno-unused-local-typedefs
actual_test_CFLAGS = $(CUDA_CFLAGS)
actual_test_LDADD = $(LINKLIBS) -lparmetis -lmetis
nobase_include_HEADERS = Decomposition/CartDecomposition.hpp Decomposition/shift_vect_converter.hpp Decomposition/CartDecomposition_ext.hpp Decomposition/common.hpp Decomposition/Decomposition.hpp Decomposition/ie_ghost.hpp \ nobase_include_HEADERS = Decomposition/CartDecomposition.hpp Decomposition/shift_vect_converter.hpp Decomposition/CartDecomposition_ext.hpp Decomposition/common.hpp Decomposition/Decomposition.hpp Decomposition/ie_ghost.hpp \
Decomposition/Domain_NN_calculator_cart.hpp Decomposition/nn_processor.hpp Decomposition/ie_loc_ghost.hpp Decomposition/ORB.hpp \ Decomposition/Domain_NN_calculator_cart.hpp Decomposition/nn_processor.hpp Decomposition/ie_loc_ghost.hpp Decomposition/ORB.hpp \
Graph/CartesianGraphFactory.hpp \ Graph/CartesianGraphFactory.hpp \
......
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
#include "VCluster/VCluster.hpp"
#include "Vector/map_vector.hpp"
#include "Vector/cuda/vector_dist_cuda_funcs.cuh"
#include "Vector/util/vector_dist_funcs.hpp"
BOOST_AUTO_TEST_SUITE( vector_dist_gpu_util_func_test )
BOOST_AUTO_TEST_CASE( vector_dist_gpu_find_buffer_offsets_test )
{
openfpm::vector_gpu<aggregate<int>> vgp;
openfpm::vector_gpu<aggregate<int,int>> offs;
vgp.resize(200000);
for (size_t k = 0 ; k < vgp.size() ; k++)
{vgp.template get<0>(k) = k / 1000;}
offs.resize(220);
CudaMemory mem;
mem.allocate(sizeof(int));
mem.fill(0);
auto ite = vgp.getGPUIterator();
vgp.hostToDevice<0>();
find_buffer_offsets<decltype(vgp.toKernel()),decltype(offs.toKernel())><<<ite.wthr,ite.thr>>>(vgp.toKernel(),(int *)mem.getDevicePointer(),offs.toKernel());
offs.template deviceToHost<0,1>();
openfpm::vector<int> ofv;
openfpm::vector<int> ofv2;
for (size_t i = 0 ; i < ofv.size() ; i++)
{
ofv.add(offs.template get<0>(i));
ofv2.add(offs.template get<1>(i));
}
ofv.sort();
ofv2.sort();
for (size_t i = 0 ; i < ofv.size() ; i++)
{
BOOST_REQUIRE_EQUAL(ofv.get(i),(i+1)*1000);
BOOST_REQUIRE_EQUAL(ofv2.get(i),i);
}
}
BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test)
{
openfpm::vector_gpu<aggregate<int,int>> m_opart;
openfpm::vector<openfpm::vector<Point<3,float>,CudaMemory,typename memory_traits_inte<Point<3,float>>::type,memory_traits_inte,openfpm::grow_policy_identity>> m_pos;
openfpm::vector<openfpm::vector<aggregate<float,float[2],float[3][3]>,CudaMemory,typename memory_traits_inte<aggregate<float,float[2],float[3][3]>>::type,memory_traits_inte,openfpm::grow_policy_identity>> m_prp;
openfpm::vector_gpu<Point<3,float>> v_pos;
openfpm::vector_gpu<aggregate<float,float[2],float[3][3]>> v_prp;
unsigned int offset = 0;
v_pos.resize(100000);
v_prp.resize(v_pos.size());
m_opart.resize(v_pos.size());
for (size_t i = 0 ; i < v_pos.size() ; i++)
{
v_pos.template get<0>(i)[0] = (float)rand()/RAND_MAX;
v_pos.template get<0>(i)[1] = (float)rand()/RAND_MAX;
v_pos.template get<0>(i)[2] = (float)rand()/RAND_MAX;
v_prp.template get<0>(i) = 5.0 + (float)rand()/RAND_MAX;
v_prp.template get<1>(i)[0] = 10.0 + (float)rand()/RAND_MAX;
v_prp.template get<1>(i)[1] = 11.0 + (float)rand()/RAND_MAX;
v_prp.template get<2>(i)[0][0] = 40.0 + (float)rand()/RAND_MAX;
v_prp.template get<2>(i)[0][1] = 50.0 + (float)rand()/RAND_MAX;
v_prp.template get<2>(i)[0][2] = 60.0 + (float)rand()/RAND_MAX;
v_prp.template get<2>(i)[1][0] = 70.0 + (float)rand()/RAND_MAX;
v_prp.template get<2>(i)[1][1] = 80.0 + (float)rand()/RAND_MAX;
v_prp.template get<2>(i)[1][2] = 150.0 + (float)rand()/RAND_MAX;
v_prp.template get<2>(i)[2][0] = 160.0 + (float)rand()/RAND_MAX;
v_prp.template get<2>(i)[2][1] = 170.0 + (float)rand()/RAND_MAX;
v_prp.template get<2>(i)[2][2] = 340.0 + (float)rand()/RAND_MAX;
int seg = i / 10000;
m_opart.template get<0>(i) = seg;
m_opart.template get<1>(i) = (9999 - i%10000) + seg * 10000;
}
m_pos.resize(10);
m_prp.resize(10);
for (size_t i = 0 ; i < m_pos.size() ; i++)
{
m_pos.get(i).resize(10000);
m_prp.get(i).resize(10000);
}
v_pos.hostToDevice<0>();
v_prp.hostToDevice<0,1,2>();
m_opart.hostToDevice<0,1>();
for (size_t i = 0 ; i < m_pos.size() ; i++)
{
auto ite = m_pos.get(i).getGPUIterator();
process_map_particles<decltype(m_opart.toKernel()),decltype(m_pos.get(i).toKernel()),decltype(m_prp.get(i).toKernel()),
decltype(v_pos.toKernel()),decltype(v_prp.toKernel())>
<<<ite.wthr,ite.thr>>>
(m_opart.toKernel(),m_pos.get(i).toKernel(), m_prp.get(i).toKernel(),
v_pos.toKernel(),v_prp.toKernel(),offset);
m_pos.get(i).deviceToHost<0>();
m_prp.get(i).deviceToHost<0,1,2>();
bool match = true;
for (size_t j = 0 ; j < m_pos.get(i).size() ; j++)
{
match &= (m_pos.get(i).template get<0>(j)[0] == v_pos.template get<0>(m_opart.template get<1>(offset+j))[0]);
match &= (m_pos.get(i).template get<0>(j)[1] == v_pos.template get<0>(m_opart.template get<1>(offset+j))[1]);
match &= (m_pos.get(i).template get<0>(j)[2] == v_pos.template get<0>(m_opart.template get<1>(offset+j))[2]);
match &= (m_prp.get(i).template get<0>(j) == v_prp.template get<0>(m_opart.template get<1>(offset+j)));
match &= (m_prp.get(i).template get<1>(j)[0] == v_prp.template get<1>(m_opart.template get<1>(offset+j))[0]);
match &= (m_prp.get(i).template get<1>(j)[1] == v_prp.template get<1>(m_opart.template get<1>(offset+j))[1]);
match &= (m_prp.get(i).template get<2>(j)[0][0] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[0][0]);
match &= (m_prp.get(i).template get<2>(j)[0][1] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[0][1]);
match &= (m_prp.get(i).template get<2>(j)[0][2] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[0][2]);
match &= (m_prp.get(i).template get<2>(j)[1][0] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[1][0]);
match &= (m_prp.get(i).template get<2>(j)[1][1] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[1][1]);
match &= (m_prp.get(i).template get<2>(j)[1][2] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[1][2]);
match &= (m_prp.get(i).template get<2>(j)[2][0] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[2][0]);
match &= (m_prp.get(i).template get<2>(j)[2][1] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[2][1]);
match &= (m_prp.get(i).template get<2>(j)[2][2] == v_prp.template get<2>(m_opart.template get<1>(offset+j))[2][2]);
}
BOOST_REQUIRE_EQUAL(match,true);
offset += m_pos.get(i).size();
}
}
BOOST_AUTO_TEST_SUITE_END()
/*
* vector_dist_cuda_funcs.cuh
*
* Created on: Aug 14, 2018
* Author: i-bird
*/
#ifndef VECTOR_DIST_CUDA_FUNCS_CUH_
#define VECTOR_DIST_CUDA_FUNCS_CUH_
#include "Vector/util/vector_dist_funcs.hpp"
template<typename vector_type,typename vector_type_offs>
__global__ void find_buffer_offsets(vector_type vd, int * cnt, vector_type_offs offs)
{
int p = threadIdx.x + blockIdx.x * blockDim.x;
if (p >= vd.size() - 1) return;
if (vd.template get<0>(p) != vd.template get<0>(p+1))
{
int i = atomicAdd(cnt, 1);
offs.template get<0>(i) = p+1;
offs.template get<1>(i) = vd.template get<0>(p);
}
}
template<typename vector_m_opart_type, typename vector_pos_type_out, typename vector_prp_type_out,
typename vector_pos_type_in, typename vector_prp_type_in>
__global__ void process_map_particles(vector_m_opart_type m_opart, vector_pos_type_out m_pos, vector_prp_type_out m_prp,
vector_pos_type_in v_pos, vector_prp_type_in v_prp, unsigned int offset)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= m_pos.size()) return;
process_map_device_particle<proc_without_prp_device>(i,offset,m_opart,m_pos,m_prp,v_pos,v_prp);
}
#endif /* VECTOR_DIST_CUDA_FUNCS_CUH_ */
...@@ -2127,8 +2127,8 @@ BOOST_AUTO_TEST_CASE( vector_dist_particle_NN_MP_iteration ) ...@@ -2127,8 +2127,8 @@ BOOST_AUTO_TEST_CASE( vector_dist_particle_NN_MP_iteration )
size_t pah = p.getKey()/4; size_t pah = p.getKey()/4;
ret &= phases.get(ph).getPropRead<1>(pah) == vd.getPropRead<0>(p); ret &= phases.get(ph).getPropRead<1>(pah) == vd.getPropRead<0>(p);
vd.getPropRead<3>(p).sort(); vd.getPropWrite<3>(p).sort();
phases.get(ph).getPropRead<4>(pah).sort(); phases.get(ph).getPropWrite<4>(pah).sort();
ret &= vd.getPropRead<3>(p).size() == phases.get(ph).getPropRead<4>(pah).size(); ret &= vd.getPropRead<3>(p).size() == phases.get(ph).getPropRead<4>(pah).size();
......
...@@ -57,7 +57,8 @@ long int decrement(long int k, long int step) ...@@ -57,7 +57,8 @@ long int decrement(long int k, long int step)
* \param bc boundary conditions * \param bc boundary conditions
* *
*/ */
template<unsigned int dim, template <typename> class layout> size_t total_n_part_lc(vector_dist<dim,float, Point_test<float>,typename layout<Point_test<float>>::type, layout, CartDecomposition<dim,float> > & vd, size_t (& bc)[dim]) template<unsigned int dim, template <typename> class layout>
size_t total_n_part_lc(vector_dist<dim,float, Point_test<float>, CartDecomposition<dim,float>, HeapMemory, layout > & vd, size_t (& bc)[dim])
{ {
Vcluster & v_cl = vd.getVC();