Commit d518b0dc authored by incardon

Distributed GPU code working

parent 9fab4db5
......@@ -406,7 +406,7 @@ if test x"$NVCC_EXIST" = x"yes"; then
AC_MSG_RESULT($gpu_support)
if test x"$gpu_support" = x"yes"; then
AC_DEFINE([GPU],[],[GPU support])
AC_DEFINE([CUDA_GPU],[],[CUDA GPU support])
else
CUDA_LIBS=""
CUDA_CFLAGS=""
......
openfpm_data @ 6238d107
Subproject commit 64447c56ae67ef2fa9ed0332262ed46963aa913d
Subproject commit 6238d10785c35dcc711dc4e5fd0c2d3a0ea8882e
......@@ -326,9 +326,9 @@ class DistGraph_CSR
{
// If v1 and v2 does not satisfy some criteria return
if (CheckPolicy::valid(v1, v.size()) == false)
return NO_EDGE;
{return (size_t)NO_EDGE;}
if (CheckPolicy::valid(v2, v.size()) == false)
return NO_EDGE;
{return (size_t)NO_EDGE;}
// get the number of adjacent vertex
size_t id_x_end = v_l.template get<0>(v1);
......
......@@ -3,7 +3,7 @@ LINKLIBS = $(HDF5_LDFLAGS) $(HDF5_LIBS) $(OPENMP_LDFLAGS) $(LIBHILBERT_LIB) $(
FLAGS_NVCC = $(NVCCFLAGS) $(INCLUDES_PATH) $(HDF5_CPPFLAGS) $(BOOST_CPPFLAGS) $(MPI_INC_PATH) $(PETSC_INCLUDE) $(LIBHILBERT_INCLUDE) $(PARMETIS_INCLUDE) $(METIS_INCLUDE) -g --expt-extended-lambda
noinst_PROGRAMS = pdata
pdata_SOURCES = main.cpp pdata_performance.cpp Vector/vector_dist_gpu_unit_tests.cu Grid/grid_dist_id_unit_test.cpp lib/pdata.cpp test_multiple_o.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
pdata_SOURCES = main.cpp pdata_performance.cpp Vector/vector_dist_gpu_unit_tests.cu Grid/grid_dist_id_unit_test.cpp lib/pdata.cpp test_multiple_o.cpp ../openfpm_devices/src/memory/HeapMemory.cpp ../openfpm_devices/src/memory/CudaMemory.cu ../openfpm_devices/src/memory/PtrMemory.cpp ../openfpm_vcluster/src/VCluster/VCluster.cpp ../openfpm_devices/src/Memleak_check.cpp
pdata_CXXFLAGS = $(HDF5_CPPFLAGS) $(OPENMP_CFLAGS) $(AM_CXXFLAGS) $(LIBHILBERT_INCLUDE) $(PETSC_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(PARMETIS_INCLUDE) $(METIS_INCLUDE) $(BOOST_CPPFLAGS) $(H5PART_INCLUDE) -DPARALLEL_IO -Wno-unused-local-typedefs
pdata_CFLAGS = $(CUDA_CFLAGS)
pdata_LDADD = $(LINKLIBS) -lparmetis -lmetis
......
......@@ -8,6 +8,7 @@
#ifndef VECTOR_HPP_
#define VECTOR_HPP_
#include "config.h"
#include "HDF5_wr/HDF5_wr.hpp"
#include "VCluster/VCluster.hpp"
#include "Space/Shape/Point.hpp"
......@@ -34,7 +35,8 @@
#include "Vector/vector_map_iterator.hpp"
#include "NN/CellList/ParticleIt_Cells.hpp"
#include "NN/CellList/ProcKeys.hpp"
#include "Vector/vector_dist_gpu.hpp"
#include "Vector/vector_dist_kernel.hpp"
#include "NN/CellList/cuda/CellList_gpu.hpp"
#define VECTOR_DIST_ERROR_OBJECT std::runtime_error("Runtime vector distributed error");
......@@ -146,7 +148,7 @@ class vector_dist : public vector_dist_comm<dim,St,prop,Decomposition,Memory,lay
public:
//! Self type
typedef vector_dist<dim,St,prop,Decomposition,Memory> self;
typedef vector_dist<dim,St,prop,Decomposition,Memory,layout_base> self;
//! property object
typedef prop value_type;
......@@ -164,6 +166,14 @@ private:
//! the second element contains unassigned particles
openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> v_prp;
#ifdef CUDA_GPU
//! property buffer reordered by the GPU cell-list, backing toKernel_sorted()
openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> v_prp_out;
//! position buffer reordered by the GPU cell-list, backing toKernel_sorted()
openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> v_pos_out;
#endif
//! Virtual cluster
Vcluster & v_cl;
......@@ -915,6 +925,76 @@ public:
return getCellList<CellL>(r_cut, g,no_se3);
}
#ifdef CUDA_GPU
/*! \brief Construct a cell list starting from the stored particles. This version of the cell-list
* is built for the GPU and can be passed to a kernel through its toKernel() method
*
* \param r_cut interaction radius, or size of each cell
*
* \return the cell list
*
*/
CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>> getCellListGPU(St r_cut, bool no_se3 = false)
{
#ifdef SE_CLASS3
if (no_se3 == false)
{se3.getNN();}
#endif
#ifdef SE_CLASS1
check_ghost_compatible_rcut(r_cut);
#endif
// Get the ghost and enlarge it by 1.3%
Ghost<dim,St> g = getDecomposition().getGhost();
g.magnify(1.013);
return getCellListGPU(r_cut, g,no_se3);
}
/*! \brief Construct a cell list starting from the stored particles
*
* It differs from getCellList by an additional parameter: in case the
* domain + ghost is not big enough to contain additional padding particles, a cell list
* covering a bigger space can be created
* (padding particles are, in general, particles added by the user outside of the domain)
*
* \param r_cut interaction radius, or size of each cell
* \param enlarge in case of padding particles the cell list must be enlarged; like a ghost, this parameter says by how much
*
* \return the cell list
*
*/
CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>> getCellListGPU(St r_cut, const Ghost<dim, St> & enlarge, bool no_se3 = false)
{
#ifdef SE_CLASS3
if (no_se3 == false)
{se3.getNN();}
#endif
// Division array
size_t div[dim];
// get the processor bounding box
Box<dim, St> pbox = getDecomposition().getProcessorBounds();
// Processor bounding box
cl_param_calculate(pbox, div, r_cut, enlarge);
CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>> cell_list(pbox,div);
v_prp_out.resize(v_pos.size());
v_pos_out.resize(v_pos.size());
cell_list.template construct<decltype(v_pos),decltype(v_prp)>(v_pos,v_pos_out,v_prp,v_prp_out);
return cell_list;
}
#endif
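For orientation, a minimal host-side sketch of how the two overloads above are meant to be used; the cut-off, the padding ghost and the vector type below are illustrative assumptions taken from the unit test later in this commit, not part of the diff:
// hypothetical usage sketch (not part of the commit)
vector_dist_gpu<3,float,aggregate<float,float[3],float[3]>> vd(1000,domain,bc,g);
auto NN1 = vd.getCellListGPU(0.1);          // cut-off only, ghost enlarged internally
Ghost<3,float> pad(0.05);                   // hypothetical padding for out-of-domain particles
auto NN2 = vd.getCellListGPU(0.1,pad);      // explicit enlargement
// construct() has also filled the internal v_pos_out/v_prp_out buffers,
// so toKernel_sorted() can now be passed to a kernel together with NN1.toKernel()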
/*! \brief Construct an hilbert cell list starting from the stored particles
*
* \tparam CellL CellList type to construct
......@@ -1544,6 +1624,24 @@ public:
return vector_dist_iterator(0, g_m);
}
#ifdef CUDA_GPU
/*! \brief Get an iterator (GPU launch configuration) that traverses the particles in the domain
*
* \param n_thr maximum number of threads per block
*
* \return a GPU launch configuration holding the grid and block sizes (wthr and thr)
*
*/
ite_gpu<1> getDomainIteratorGPU(size_t n_thr = 1024) const
{
#ifdef SE_CLASS3
se3.getIterator();
#endif
return v_pos.getGPUIteratorTo(n_thr);
}
#endif
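A sketch of how the returned object is consumed; my_kernel is a hypothetical kernel, while the wthr/thr launch syntax follows the unit test later in this commit:
auto ite = vd.getDomainIteratorGPU();        // default: up to 1024 threads per block
// ite.wthr and ite.thr hold the CUDA grid and block dimensions covering the domain particles
my_kernel<<<ite.wthr,ite.thr>>>(vd.toKernel());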
/*! \brief Get an iterator that traverses the particles in the domain
*
* \return an iterator
......@@ -1824,8 +1922,6 @@ public:
// Write the VTK file
return vtk_writer.write(output,prp_names,"particles",ft);
}
return false;
}
/*! \brief Delete the particles on the ghost
......@@ -2114,6 +2210,80 @@ public:
return key;
}
#ifdef CUDA_GPU
/*! \brief Convert the vector into a data-structure that can be used inside a GPU kernel
*
* The object created can be considered like a reference to the original
*
* \return a vector usable inside a kernel
*
*/
template<unsigned int ... prp> vector_dist_ker<dim,St,prop> toKernel()
{
vector_dist_ker<dim,St,prop> v(v_pos.toKernel(), v_prp.toKernel());
return v;
}
/*! \brief Convert the vector into a data-structure that can be used inside a GPU kernel
*
* In comparison with toKernel(), it returns the copy of the data reordered by the GPU cell-list
* for better memory coalescing
*
* \return a vector usable inside a kernel
*
*/
template<unsigned int ... prp> vector_dist_ker<dim,St,prop> toKernel_sorted()
{
vector_dist_ker<dim,St,prop> v(v_pos_out.toKernel(), v_prp_out.toKernel());
return v;
}
/*! \brief Move the selected properties from the device to the host memory
*
* \tparam prp properties to move
*
*/
template<unsigned int ... prp> void deviceToHostProp()
{
v_prp.template deviceToHost<prp ...>();
}
/*! \brief Move the particle positions from the device to the host memory
*
*/
void deviceToHostPos()
{
v_pos.template deviceToHost<0>();
}
/*! \brief Move the selected properties from the host to the device memory
*
* \tparam prp properties to move
*
*/
template<unsigned int ... prp> void hostToDeviceProp()
{
v_prp.template hostToDevice<prp ...>();
}
/*! \brief Move the particle positions from the host to the device memory
*
*/
void hostToDevicePos()
{
v_pos.template hostToDevice<0>();
}
#endif
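Putting the members above together, a minimal sketch of the intended offload cycle, assuming positions and property 0 have been filled on the host and that init_kernel is a user-provided kernel (both are assumptions, not part of this commit):
vd.hostToDevicePos();                              // positions: host -> device
vd.template hostToDeviceProp<0>();                 // property 0: host -> device
auto ite = vd.getDomainIteratorGPU();
init_kernel<<<ite.wthr,ite.thr>>>(vd.toKernel());  // compute on the device
vd.deviceToHostPos();                              // positions: device -> host
vd.template deviceToHostProp<0>();                 // property 0: device -> host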
#ifdef SE_CLASS3
se_class3_vector<prop::max_prop,dim,St,Decomposition,self> & get_se_class3()
......
......@@ -13,6 +13,71 @@ void print_test(std::string test, size_t sz)
std::cout << test << " " << sz << "\n";
}
// initialize the scalar property 0 and the vector property 1 of each particle from its position
__global__ void initialize_props(vector_dist_ker<3, float, aggregate<float, float [3], float[3]>> vd)
{
auto p = GET_PARTICLE(vd);
vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];
vd.template getProp<1>(p)[0] = vd.getPos(p)[0] + vd.getPos(p)[1];
vd.template getProp<1>(p)[1] = vd.getPos(p)[0] + vd.getPos(p)[2];
vd.template getProp<1>(p)[2] = vd.getPos(p)[1] + vd.getPos(p)[2];
}
// compute for each particle the same pair-interaction sum twice: once through the sorted
// (coalesced) view vd_sort and once through the original ordering vd, so the test below can compare them
template<typename CellList_type>
__global__ void calculate_force(vector_dist_ker<3, float, aggregate<float, float[3], float [3]>> vd,
vector_dist_ker<3, float, aggregate<float, float[3], float [3]>> vd_sort,
CellList_type cl)
{
auto p = GET_PARTICLE(vd);
Point<3,float> xp = vd.getPos(p);
auto it = cl.getNNIterator(cl.getCell(xp));
auto cell = cl.getCell(xp);
int s1 = cell.get(0);
int s2 = cell.get(1);
int s3 = cell.get(2);
Point<3,float> force1({0.0,0.0,0.0});
Point<3,float> force2({0.0,0.0,0.0});
while (it.isNext())
{
auto q1 = it.get();       // neighbour index in the cell-list sorted ordering
auto q2 = it.get_orig();  // index of the same neighbour in the original particle ordering
if (q2 == p) {++it; continue;}
Point<3,float> xq_1 = vd_sort.getPos(q1);
Point<3,float> xq_2 = vd.getPos(q2);
Point<3,float> r1 = xq_1 - xp;
Point<3,float> r2 = xq_2 - xp;
// Normalize
r1 /= r1.norm();
r2 /= r2.norm();
force1 += vd_sort.template getProp<0>(q1)*r1;
force2 += vd.template getProp<0>(q2)*r2;
++it;
}
vd.template getProp<1>(p)[0] = force1.get(0);
vd.template getProp<1>(p)[1] = force1.get(1);
vd.template getProp<1>(p)[2] = force1.get(2);
vd.template getProp<2>(p)[0] = force2.get(0);
vd.template getProp<2>(p)[1] = force2.get(1);
vd.template getProp<2>(p)[2] = force2.get(2);
}
BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
{
auto & v_cl = create_vcluster();
......@@ -23,12 +88,12 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
// set the ghost based on the radius cut off (make just a little bit smaller than the spacing)
Ghost<3,float> g(0.01);
Ghost<3,float> g(0.1);
// Boundary conditions
size_t bc[3]={NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
vector_dist_gpu<3,float,aggregate<float,float[3]>> vd(1000,domain,bc,g);
vector_dist_gpu<3,float,aggregate<float,float[3],float[3]>> vd(1000,domain,bc,g);
auto it = vd.getDomainIterator();
......@@ -40,12 +105,6 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
vd.getPos(p)[1] = (float)rand() / RAND_MAX;
vd.getPos(p)[2] = (float)rand() / RAND_MAX;
vd.template getProp<0>(p) = vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2];
vd.template getProp<1>(p)[0] = vd.getPos(p)[0] + vd.getPos(p)[1];
vd.template getProp<1>(p)[1] = vd.getPos(p)[1] + vd.getPos(p)[2];
vd.template getProp<1>(p)[2] = vd.getPos(p)[2] + vd.getPos(p)[3];
++it;
}
......@@ -84,6 +143,85 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
// now we offload the positions and initialize the properties on the device
auto it3 = vd.getDomainIteratorGPU();
vd.hostToDevicePos();
initialize_props<<<it3.wthr,it3.thr>>>(vd.toKernel());
// now we check what we initialized
vd.deviceToHostProp<0,1>();
auto it4 = vd.getDomainIterator();
while (it4.isNext())
{
auto p = it4.get();
BOOST_REQUIRE_CLOSE(vd.template getProp<0>(p),vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2],0.01);
BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[0],vd.getPos(p)[0] + vd.getPos(p)[1],0.01);
BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[1],vd.getPos(p)[0] + vd.getPos(p)[2],0.01);
BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[2],vd.getPos(p)[1] + vd.getPos(p)[2],0.01);
//std::cout << "PROP 0 " << vd.template getProp<0>(p) << " " << vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2] << std::endl;
++it4;
}
auto NN = vd.getCellListGPU(0.1);
auto NN_cpu = vd.getCellList(0.1);
auto it5 = vd.getDomainIteratorGPU();
calculate_force<decltype(NN.toKernel())><<<it5.wthr,it5.thr>>>(vd.toKernel(),vd.toKernel_sorted(),NN.toKernel());
vd.template deviceToHostProp<1,2>();
auto it6 = vd.getDomainIterator();
bool match = true;
while (it6.isNext())
{
auto p = it6.get();
Point<3,float> xp = vd.getPos(p);
// Calculate on CPU
Point<3,float> force({0.0,0.0,0.0});
auto NNc = NN_cpu.getNNIterator(NN_cpu.getCell(xp));
while (NNc.isNext())
{
auto q = NNc.get();
if (q == p.getKey()) {++NNc; continue;}
Point<3,float> xq_2 = vd.getPos(q);
Point<3,float> r2 = xq_2 - xp;
// Normalize
r2 /= r2.norm();
force += vd.template getProp<0>(q)*r2;
++NNc;
}
BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[0],vd.template getProp<2>(p)[0],0.001);
BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[1],vd.template getProp<2>(p)[1],0.001);
BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[2],vd.template getProp<2>(p)[2],0.001);
BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[0],force.get(0),0.01);
BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[1],force.get(1),0.01);
BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[2],force.get(2),0.01);
++it6;
}
}
......