Commit 0f7b39e1 authored by incardon's avatar incardon

GPU_branch_start

parent 61e2d5e0
......@@ -73,8 +73,6 @@ __global__ void calc_force_gpu(vector_dist_type vd, NN_type NN, real_number sigm
vd.template getProp<force>(p)[1] = 0.0;
vd.template getProp<force>(p)[2] = 0.0;
// Get an iterator over the neighborhood particles of p
auto Np = NN.getNNIterator(NN.getCell(vd.getPos(p)));
......@@ -331,7 +329,7 @@ int main(int argc, char* argv[])
openfpm_init(&argc,&argv);
real_number sigma = 0.01;
real_number r_cut = 3.0*sigma;
real_number r_cut =3.0*sigma;
// we will use it do place particles on a 10x10x10 Grid like
size_t sz[3] = {100,100,100};
......@@ -480,7 +478,7 @@ int main(int argc, char* argv[])
unsigned long int f = 0;
// MD time stepping
for (size_t i = 0; i < 100 ; i++)
for (size_t i = 0; i < 1000 ; i++)
{
// Get the iterator
auto it3 = vd.getDomainIteratorGPU();
......@@ -494,6 +492,7 @@ int main(int argc, char* argv[])
// calculate forces or a(tn + 1) Step 2
calc_forces(vd,NN,sigma12,sigma6,r_cut*r_cut);
std::cout << "STEP " << std::endl;
// Integrate the velocity Step 3
auto it4 = vd.getDomainIteratorGPU();
......
openfpm_data @ 7eb370cf
Subproject commit e7f888c11045bfad0f927a86f0fc9c5f3efa8053
Subproject commit 7eb370cf88baa62a2c5ce0678edccc57da543392
......@@ -13,6 +13,25 @@
#include "Vector/map_vector.hpp"
/*! \brief Boundary conditions
 *
 * Per-dimension boundary-condition flags (e.g. PERIODIC / NON_PERIODIC),
 * one size_t entry for each of the dim dimensions
 *
 */
template<unsigned int dim> struct periodicity
{
size_t bc[dim];
};
/*! \brief Boundary conditions
 *
 * int-typed variant of the periodicity struct: per-dimension
 * boundary-condition flags (e.g. PERIODIC / NON_PERIODIC), one int entry
 * per dimension; this is the form consumed by the device-side
 * boundary-condition code (see applyPointBC_no_dec)
 *
 */
template<unsigned int dim> struct periodicity_int
{
int bc[dim];
};
/*! \brief for each sub-domain box sub contain the real the sub-domain id
*
* When we apply boundary conditions real sub-domains are copied along the border
......
......@@ -44,6 +44,24 @@ __device__ __host__ inline int processorID_impl(T2 & p, fine_s_type & fine_s, vs
return sub_domains_global.template get<1>(e);
}
/*! \brief Apply boundary conditions to the point
 *
 * If the particle goes out to the right, bring the particle back on the left
 * in case of a periodic boundary condition; do nothing in case of a
 * non-periodic one
 *
 * \param domain simulation domain box
 * \param bc per-dimension boundary-condition flags
 * \param pt encapsulated point object (its coordinates are changed according
 *        to the explanation above)
 *
 */
template<unsigned int dim, typename St, typename Mem> inline __device__ void applyPointBC_no_dec(Box<dim,St> & domain, periodicity_int<dim> & bc, encapc<1,Point<dim,St>,Mem> && pt)
{
// for each periodic dimension remap the coordinate via periodic_l
// (presumably wraps it into [getLow(i),getHigh(i)) — periodic_l is defined
// elsewhere, confirm its exact range convention)
for (size_t i = 0 ; i < dim ; i++)
{
if (bc.bc[i] == PERIODIC)
{pt.template get<0>()[i] = openfpm::math::periodic_l(pt.template get<0>()[i],domain.getHigh(i),domain.getLow(i));}
}
}
template<unsigned int dim, typename T, typename Memory, template <typename> class layout_base>
class CartDecomposition_gpu : public ie_ghost_gpu<dim,T,Memory,layout_base>
{
......
......@@ -6,7 +6,147 @@
#define SUB_UNIT_FACTOR 1024
/*! \brief Test kernel: store the boundary-condition-corrected processor id
 *         of p1 and p2 into pr_id[0] and pr_id[1]
 *
 * Launched with a single thread (<<<1,1>>>).
 */
template<typename dec_type>
__global__ void test_proc_idbc(Point<3,double> p1 ,Point<3,double> p2 , dec_type dec, unsigned int * pr_id)
{
	unsigned int id_first = dec.processorIDBC(p1);
	unsigned int id_second = dec.processorIDBC(p2);

	pr_id[0] = id_first;
	pr_id[1] = id_second;
}
/*! \brief Test kernel: store the number of ghost processors of p1 and p2
 *         into ng_id[0] and ng_id[1]
 *
 * Launched with a single thread (<<<1,1>>>).
 */
template<typename dec_type>
__global__ void test_ghost_n(Point<3,double> p1 ,Point<3,double> p2 , dec_type dec, unsigned int * ng_id)
{
	unsigned int n_first = dec.ghost_processorID_N(p1);
	unsigned int n_second = dec.ghost_processorID_N(p2);

	ng_id[0] = n_first;
	ng_id[1] = n_second;
}
/*! \brief Test kernel: fill g_id with the ghost-processor ids of p1 and p2
 *
 * The first ng_id[0] entries of g_id receive the ghost processors of p1;
 * the following ng_id[1] entries (starting at offset ng_id[0]) receive the
 * ghost processors of p2 — matching the host side, which sizes the output
 * as ng_id[0] + ng_id[1]. Launched with a single thread (<<<1,1>>>).
 */
template<typename dec_type, typename output_type>
__global__ void test_ghost(Point<3,double> p1 ,Point<3,double> p2 , dec_type dec, unsigned int * ng_id , output_type g_id)
{
	// ghost processors of p1, written at offset 0
	for (unsigned int i = 0 ; i < ng_id[0] ; i++)
	{
		dec.ghost_processor_ID(p1,g_id,0,i);
	}

	// BUG FIX: the second loop must query p2 (not p1 again) and iterate over
	// p2's count ng_id[1] (not ng_id[0]); otherwise the slots reserved for
	// p2 are filled with a duplicate of p1's list and may over/under-run
	// the ng_id[0]+ng_id[1]-sized output
	for (unsigned int i = 0 ; i < ng_id[1] ; i++)
	{
		dec.ghost_processor_ID(p2,g_id,ng_id[0],i);
	}
}
BOOST_AUTO_TEST_SUITE( decomposition_to_gpu_test )
BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idbc_and_ghost2_gpu )
{
	// Vcluster
	Vcluster<> & vcl = create_vcluster();

	CartDecomposition<3, double> dec(vcl);

	// fully periodic boundary conditions
	size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};

	// Physical domain
	Box<3, double> box( { -0.01, -0.01, 0.0 }, { 0.01, 0.01, 0.003 });

	Ghost<3,double> g(0.0015);

	dec.setGoodParameters(box, bc, g, 512);
	dec.decompose();

	// Check, for points on (and just outside) every sub-domain border along
	// each dimension j, that processorIDBC and the ghost-processor lists are
	// mutually consistent
	for (size_t j = 0 ; j < 3 ; j++ )
	{
		for (size_t i = 0 ; i < dec.getNSubDomain() ; i++)
		{
			Point<3,double> p1;
			Point<3,double> p2;

			// p1 = lower corner of sub-domain i; p2 = p1 pushed just below
			// the lower border along dimension j
			p1.get(0) = SpaceBox<3,double>(dec.getSubDomains().get(i)).getLow(0);
			p1.get(1) = SpaceBox<3,double>(dec.getSubDomains().get(i)).getLow(1);
			p1.get(2) = SpaceBox<3,double>(dec.getSubDomains().get(i)).getLow(2);
			p2 = p1;
			p2.get(j) = std::nextafter(SpaceBox<3,double>(dec.getSubDomains().get(i)).getLow(j),-1.0);

			auto gpudec = dec.toKernel();

			CudaMemory mem;
			mem.allocate(2*sizeof(unsigned int));

			test_proc_idbc<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem.getDevicePointer());

			mem.deviceToHost();

			BOOST_REQUIRE(((unsigned int *)mem.getPointer())[0] < vcl.size());
			BOOST_REQUIRE(((unsigned int *)mem.getPointer())[1] < vcl.size());

			CudaMemory mem2;
			mem2.allocate(2*sizeof(unsigned int));

			test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());

			// BUG FIX: the counters are produced on the device, they must be
			// transferred back before being read on the host below
			mem2.deviceToHost();

			unsigned int tot = ((unsigned int *)mem2.getPointer())[0] + ((unsigned int *)mem2.getPointer())[1];
			openfpm::vector_gpu<aggregate<int,int>> vd;
			vd.resize(tot);

			test_ghost<decltype(gpudec),decltype(vd.toKernel())><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer(),vd.toKernel());

			if (((unsigned int *)mem.getPointer())[0] != ((unsigned int *)mem.getPointer())[1])
			{
				if (vcl.rank() == ((unsigned int *)mem.getPointer())[1] )
				{
					BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[1] != 0);
					BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[0] == 0);
				}

				if (vcl.rank() == ((unsigned int *)mem.getPointer())[0])
				{
					BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[1] == 0 );
					BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[0] != 0 );
				}
			}

			// Same check on the upper border: p1 = just inside the upper
			// corner; p2 = just outside along dimension j
			p1.get(0) = std::nextafter(SpaceBox<3,double>(dec.getSubDomains().get(i)).getHigh(0),SpaceBox<3,double>(dec.getSubDomains().get(i)).getLow(0));
			p1.get(1) = std::nextafter(SpaceBox<3,double>(dec.getSubDomains().get(i)).getHigh(1),SpaceBox<3,double>(dec.getSubDomains().get(i)).getLow(1));
			p1.get(2) = std::nextafter(SpaceBox<3,double>(dec.getSubDomains().get(i)).getHigh(2),SpaceBox<3,double>(dec.getSubDomains().get(i)).getLow(2));
			p2 = p1;
			p2.get(j) = std::nextafter(SpaceBox<3,double>(dec.getSubDomains().get(i)).getHigh(j),1.0);

			test_proc_idbc<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem.getDevicePointer());

			// BUG FIX: missing device-to-host transfer before reading mem
			mem.deviceToHost();

			BOOST_REQUIRE(((unsigned int *)mem.getPointer())[0] < vcl.size());
			BOOST_REQUIRE(((unsigned int *)mem.getPointer())[1] < vcl.size());

			mem2.allocate(2*sizeof(unsigned int));
			test_ghost_n<decltype(gpudec)><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer());

			// BUG FIX: missing device-to-host transfer before reading mem2
			mem2.deviceToHost();

			tot = ((unsigned int *)mem2.getPointer())[0] + ((unsigned int *)mem2.getPointer())[1];
			vd.resize(tot);

			// BUG FIX: the kernel expects a toKernel() device view, not the
			// host-side vector itself (consistent with the first call above)
			test_ghost<decltype(gpudec),decltype(vd.toKernel())><<<1,1>>>(p1,p2,gpudec,(unsigned int *)mem2.getDevicePointer(),vd.toKernel());

			if (((unsigned int *)mem.getPointer())[0] != ((unsigned int *)mem.getPointer())[1])
			{
				// BUG FIX: mem holds only two entries, index [2] was an
				// out-of-bounds read; the second processor id is at [1],
				// mirroring the lower-border branch above
				if (vcl.rank() == ((unsigned int *)mem.getPointer())[1])
				{
					BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[1] != 0);
					BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[0] == 0);
				}

				if (vcl.rank() == ((unsigned int *)mem.getPointer())[0])
				{
					BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[1] == 0 );
					BOOST_REQUIRE(((unsigned int *)mem2.getPointer())[0] != 0 );
				}
			}
		}
	}
}
BOOST_AUTO_TEST_SUITE_END()
......@@ -618,6 +618,11 @@ public:
*/
const openfpm::vector<Point<dim,T>,Memory,typename layout_base<Point<dim,T>>::type,layout_base> & getShiftVectors()
{
	// Lazily mirror the shift vectors onto the device before handing them
	// out (host_dev_transfer presumably records whether the upload already
	// happened — set elsewhere; confirm)
	if (!host_dev_transfer)
	{shifts.template hostToDevice<0>();}

	return shifts;
}
......
......@@ -485,6 +485,8 @@ BOOST_AUTO_TEST_CASE( CartDecomposition_check_cross_consistency_between_proc_idb
}
}
BOOST_AUTO_TEST_CASE( CartDecomposition_non_periodic_test_dist_grid)
{
// Vcluster
......
......@@ -9,6 +9,7 @@
#define SRC_GRID_GRID_DIST_UTIL_HPP_
#include "NN/CellList/CellDecomposer.hpp"
#include "Decomposition/common.hpp"
/*! \brief get cellDecomposer parameters
*
......@@ -29,14 +30,6 @@ template<unsigned int dim> void getCellDecomposerPar(size_t (& c_g)[dim], const
}
}
/*! \brief
*
*
*/
template<unsigned int dim> struct periodicity
{
size_t bc[dim];
};
/*! \brief Create NON_PERIODIC data structure
*
......
......@@ -70,6 +70,8 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,sca
{
#if defined(CUDA_GPU) && defined(__NVCC__)
if (v_cl.size() == 1)
{return;}
proc_id_out.resize(v_pos.size()+1);
proc_id_out.template get<0>(proc_id_out.size()-1) = 0;
......@@ -113,12 +115,22 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,sca
// Trasfer the number of offsets on CPU
mem.deviceToHost();
prc_offset.template deviceToHost<0,1>();
g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);
if (g_opart_device.size() != 0)
{g_opart_device.template deviceToHost<0>(g_opart_device.size()-1,g_opart_device.size()-1);}
int noff = *(int *)mem.getPointer();
// In this case we do not have communications at all
if (g_opart_device.size() == 0)
{noff = -1;}
prc_offset.resize(noff+1);
prc_offset.template get<0>(prc_offset.size()-1) = g_opart_device.size();
prc_offset.template get<1>(prc_offset.size()-1) = g_opart_device.template get<0>(g_opart_device.size()-1);
if (g_opart_device.size() != 0)
{prc_offset.template get<1>(prc_offset.size()-1) = g_opart_device.template get<0>(g_opart_device.size()-1);}
else
{prc_offset.template get<1>(prc_offset.size()-1) = 0;}
prc.resize(noff+1);
prc_sz.resize(noff+1);
......@@ -228,7 +240,6 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
<<<ite.wthr,ite.thr>>>
(box_f_dev.toKernel(),v_pos.toKernel(),o_part_loc.toKernel());
// openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> starts;
starts.resize(o_part_loc.size());
mgpu::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
......
......@@ -729,12 +729,14 @@ BOOST_AUTO_TEST_CASE( decomposition_to_gpu_test_use )
auto ite = vg.getGPUIterator();
openfpm::vector_gpu<aggregate<int,int>> proc_id_out;
openfpm::vector_gpu<aggregate<int,int,int>> proc_id_out;
proc_id_out.resize(vg.size());
process_id_proc_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel())>
openfpm::vector_gpu<aggregate<int,int,int>> dev_counter;
process_id_proc_each_part<3,float,decltype(dec.toKernel()),decltype(vg.toKernel()),decltype(proc_id_out.toKernel()),decltype(dev_counter.toKernel())>
<<<ite.wthr,ite.thr>>>
(dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),v_cl.rank());
(dec.toKernel(),vg.toKernel(),proc_id_out.toKernel(),dev_counter.toKernel(),v_cl.rank());
proc_id_out.deviceToHost<0>();
......@@ -793,6 +795,44 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_find_buffer_offsets_test )
}
}
BOOST_AUTO_TEST_CASE(vector_dist_reorder_lbl)
{
	openfpm::vector_gpu<aggregate<int,int,int>> lbl_p;
	openfpm::vector_gpu<aggregate<int>> starts;

	// 10 particles x 10 processors: element (pid*10 + proc) carries
	// property <2> = particle id and property <1> = processor id;
	// starts[proc-group] gives the base offset of each group of 10
	lbl_p.resize(100);
	starts.resize(10);

	for (int pid = 0 ; pid < 10 ; pid++) // <------ particle id
	{
		starts.template get<0>(pid) = pid*10;

		for (int proc = 0 ; proc < 10 ; proc++) // <----- processor
		{
			lbl_p.template get<2>(pid*10+proc) = pid;
			lbl_p.template get<1>(pid*10+proc) = proc;
		}
	}

	// move lbl and starts to gpu
	starts.template hostToDevice<0>();
	lbl_p.template hostToDevice<1,2>();

	auto ite = lbl_p.getGPUIterator();

	reorder_lbl<decltype(lbl_p.toKernel()),decltype(starts.toKernel())><<<ite.wthr,ite.thr>>>(lbl_p.toKernel(),starts.toKernel());

	starts.template deviceToHost<0>();
	lbl_p.template deviceToHost<0,1,2>();

	// after the kernel, slot (proc*10 + pid) of property <0> must hold the
	// original index pid*10 + proc
	for (int pid = 0 ; pid < 10 ; pid++) // <------ particle id
	{
		for (int proc = 0 ; proc < 10 ; proc++) // <----- processor
		{
			BOOST_REQUIRE_EQUAL(lbl_p.template get<0>(proc*10+pid),pid*10+proc);
		}
	}
}
BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test)
{
openfpm::vector_gpu<aggregate<int,int>> m_opart;
......
......@@ -10,6 +10,7 @@
#include "Vector/util/vector_dist_funcs.hpp"
#include "util/cuda/moderngpu/kernel_reduce.hxx"
#include "Decomposition/common.hpp"
template<unsigned int dim, typename St, typename decomposition_type, typename vector_type, typename start_type, typename output_type>
__global__ void proc_label_id_ghost(decomposition_type dec,vector_type vd, start_type starts, output_type out)
......@@ -37,8 +38,30 @@ __global__ void num_proc_ghost_each_part(decomposition_type dec, vector_type vd,
out.template get<0>(p) = dec.ghost_processorID_N(xp);
}
template<unsigned int dim, typename St, typename cartdec_gpu, typename particles_type, typename vector_out>
__global__ void process_id_proc_each_part(cartdec_gpu cdg, particles_type parts, vector_out output , int rank)
/*! \brief Kernel: apply the boundary conditions to every particle position
 *
 * One thread per particle; out-of-range threads at the grid tail do nothing.
 */
template<unsigned int dim, typename St, typename particles_type>
__global__ void apply_bc_each_part(Box<dim,St> domain, periodicity_int<dim> bc, particles_type parts)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x;

	if (tid < parts.size())
	{applyPointBC_no_dec(domain,bc,parts.get(tid));}
}
/*! \brief Kernel: merge properties prp... computed on the sorted vector back
 *         into the original (unsorted) property vector
 *
 * For each particle p, the properties prp... are copied into vd_prp at p
 * from vd_prp_ord at the index given by nss.get<0>(p) (the mapping from the
 * original ordering to the sorted one).
 *
 * NOTE(review): vd_pos is only used for the size guard and v_pos_ord is not
 * used at all — positions themselves are not merged here; confirm this is
 * intended.
 */
template<typename vector_pos_type, typename vector_prp_type, typename stns_type, unsigned int ... prp>
__global__ void merge_sort_part(vector_pos_type vd_pos, vector_prp_type vd_prp,
vector_pos_type v_pos_ord, vector_prp_type vd_prp_ord,
stns_type nss)
{
// one thread per particle, guard the grid tail
int p = threadIdx.x + blockIdx.x * blockDim.x;
if (p >= vd_pos.size()) return;
vd_prp.template set<prp...>(p,vd_prp_ord,nss.template get<0>(p));
}
template<unsigned int dim, typename St, typename cartdec_gpu, typename particles_type, typename vector_out, typename prc_sz_type>
__global__ void process_id_proc_each_part(cartdec_gpu cdg, particles_type parts, vector_out output, prc_sz_type prc_sz , int rank)
{
int p = threadIdx.x + blockIdx.x * blockDim.x;
......@@ -49,8 +72,14 @@ __global__ void process_id_proc_each_part(cartdec_gpu cdg, particles_type parts,
int pr = cdg.processorID(xp);
#ifndef TEST1
output.template get<1>(p) = (pr == rank)?-1:pr;
output.template get<0>(p) = p;
#else
output.template get<1>(p) = pr;
int nl = atomicAdd(&prc_sz.template get<0>(pr), 1);
output.template get<2>(p) = nl;
#endif
}
template<unsigned int prp_off, typename vector_type,typename vector_type_offs>
......@@ -58,7 +87,7 @@ __global__ void find_buffer_offsets(vector_type vd, int * cnt, vector_type_offs
{
int p = threadIdx.x + blockIdx.x * blockDim.x;
if (p >= vd.size() - 1) return;
if (p >= (int)vd.size() - 1) return;
if (vd.template get<prp_off>(p) != vd.template get<prp_off>(p+1))
{
......@@ -201,6 +230,18 @@ __global__ void shift_ghost_each_part(vector_of_box box_f, vector_of_shifts box_
}
}
/*! \brief Kernel: scatter each label index into its processor-ordered slot
 *
 * For entry i, the destination slot is starts[processor of i] plus the
 * per-processor offset stored in property <2>; property <0> of that slot
 * receives the original index i. One thread per entry.
 */
template<typename vector_lbl_type, typename starts_type>
__global__ void reorder_lbl(vector_lbl_type m_opart, starts_type starts)
{
	int i = threadIdx.x + blockIdx.x * blockDim.x;

	if (i >= m_opart.size()) return;

	const int proc = m_opart.template get<1>(i);
	const int slot = starts.template get<0>(proc) + m_opart.template get<2>(i);

	m_opart.template get<0>(slot) = i;
}
template<unsigned int prp, typename vector_type>
auto reduce(vector_type & vd) -> typename std::remove_reference<decltype(vd.template getProp<prp>(0))>::type
{
......
......@@ -274,6 +274,8 @@ void check_cell_list_cpu_and_gpu(vector_type & vd, CellList_type & NN, CellList_
test = check_force(NN_cpu,vd);
BOOST_REQUIRE_EQUAL(test,true);
vd.template merge_sort<1>(NN);
}
BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
......@@ -291,7 +293,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
// Boundary conditions
size_t bc[3]={NON_PERIODIC,NON_PERIODIC,NON_PERIODIC};
vector_dist_gpu<3,float,aggregate<float,float[3],float[3]>> vd(1000,domain,bc,g);
vector_dist_gpu<3,float,aggregate<float,float[3],float[3]>> vd(10000,domain,bc,g);
auto it = vd.getDomainIterator();
......@@ -314,7 +316,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
v_cl.sum(size_l);
v_cl.execute();
BOOST_REQUIRE_EQUAL(size_l,1000);
BOOST_REQUIRE_EQUAL(size_l,10000);
auto & ct = vd.getDecomposition();
......@@ -370,7 +372,7 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
// here we do a ghost_get
vd.ghost_get<0>();
// Doube ghost get to check crashes
// Double ghost get to check crashes
vd.ghost_get<0>();
// we re-offload what we received
......@@ -385,23 +387,6 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
NN_up.clear();
vd.updateCellList(NN_up);
check_cell_list_cpu_and_gpu(vd,NN_up,NN_cpu);
// We check if we obtain the same result from updateCellList
// check
// Now we do a ghost_get from CPU
// Than we offload on GPU
// We construct a Cell-list
// We calculate force on CPU and GPU to check if they match
}
template<typename St>
......@@ -533,6 +518,8 @@ void vdist_calc_gpu_test()
{
vd.map(RUN_ON_DEVICE);
CUDA_SAFE(cudaGetLastError());
vd.deviceToHostPos();
vd.template deviceToHostProp<0,1,2>();
......
......@@ -1840,7 +1840,25 @@ public:
se3.getIterator();
#endif
return v_pos.getGPUIteratorTo(n_thr);
return v_pos.getGPUIteratorTo(g_m,n_thr);
}
/*! \brief Merge the properties prp... calculated on the sorted vector back into the original vector
 *
 * \param cl Cell-list from which the sorted vector has been constructed
 * \param n_thr number of GPU threads per block to use for the launch
 *
 */
template<unsigned int ... prp> void merge_sort(CellList_gpu<dim,St,CudaMemory,shift_only<dim, St>> & cl, size_t n_thr = 1024)
{
#if defined(__NVCC__)
// launch one thread per particle up to g_m (presumably the boundary between
// real and ghost particles — confirm)
auto ite = v_pos.getGPUIteratorTo(g_m,n_thr);
// copy prp... from the sorted buffers back into v_prp using the
// non-sorted-to-sorted mapping kept by the cell list
merge_sort_part<decltype(v_pos.toKernel()),decltype(v_prp.toKernel()),decltype(cl.getNonSortedToSorted().toKernel()),prp...>
<<<ite.wthr,ite.thr>>>
(v_pos.toKernel(),v_prp.toKernel(),v_pos_out.toKernel(),v_prp_out.toKernel(),cl.getNonSortedToSorted().toKernel());
#endif
}
#endif
......
......@@ -8,6 +8,8 @@
#ifndef SRC_VECTOR_VECTOR_DIST_COMM_HPP_
#define SRC_VECTOR_VECTOR_DIST_COMM_HPP_
//#define TEST1
#if defined(CUDA_GPU) && defined(__NVCC__)
#include "util/cuda/moderngpu/kernel_mergesort.hxx"
#include "Vector/cuda/vector_dist_cuda_funcs.cuh"
......@@ -203,6 +205,7 @@ class vector_dist_comm
{
if (opt & RUN_ON_DEVICE)
{
#ifndef TEST1
size_t prev_off = 0;
for (size_t i = 0; i < prc_sz.size() ; i++)
{
......@@ -213,6 +216,21 @@ class vector_dist_comm
}
prev_off = prc_sz.template get<0>(i);
}