Commit 7edb0b29 authored by incardon's avatar incardon

dlb gpu tested

parent 7b73d120
Pipeline #945 failed with stages
in 1 minute and 54 seconds
......@@ -8,5 +8,5 @@ It can happen in compilation phase. The installation system in general should de
If you show the hidden folders in your "holy" home, you will see how many programs actually "violate" your home folder. They are constantly creating/reading/writing such folders every time you open them; they are just hidden. OpenFPM creates such a text file only once, at installation, and reports it; moving this file somewhere else after installation will leave the home folder "virgin" from OpenFPM forever.
* The examples do not scale on my 4 - x core PC.
Not all example are made for scalability. In particular example that do not have computation inside like the 0_simple_ ... . If instead is an example that has computation inside, when you are benchmarking in particular using all the cores of your PC, close all applications, like Browser, IDE, pdf reader, ..., unusefull shell running command (anything that could consume resources, consider that the Desktop environment consume resources, in particular if you have OpenGL/3D effects, disable them. The best would be close the X-server/Desktop environment). Check also that no program are running in background using resources, use top/Task manager to check this. Consider also that 99.9% of Laptop/Desktop today can adjust their frequency dynamically. In particular it is common that the system increase the CPU frequency at higher level when only one-core is used compared to N-core, and this can significantly affect scalability.
Not all examples are made for scalability, in particular examples that do not have computation inside, like the 0_simple_ ... . If instead it is an example that has computation inside, when you are benchmarking, in particular using all the cores of your PC, close all applications, like Browser, IDE, pdf reader, ..., and any useless shell running commands (anything that could consume resources; consider that the Desktop environment consumes resources, in particular if you have OpenGL/3D effects, so disable them. The best would be to close the X-server/Desktop environment). Check also that no programs are running in the background using resources; use top/Task manager to check this. Consider also that 99.9% of Laptops/Desktops today can adjust their frequency dynamically. In particular it is common that the system increases the CPU frequency to a higher level when only one core is used compared to N cores, and this can significantly affect scalability. Another potential scalability problem is given by the memory bandwidth. Some of the examples can exhaust your memory bandwidth, and once it is saturated, increasing the core count does not improve performance. Consider that (in late 2018) main-stream CPUs have two memory channels for 4 or 6 cores, while high-end CPUs have 4 memory channels but can have 16 or more cores. High-end server-grade CPUs like Xeon have 4 or 6 memory channels but they can have 20 or more cores, while 8 channels are probably in the near future.
\ No newline at end of file
......@@ -647,6 +647,10 @@ public:
ie_ghost<dim, T,Memory,layout_base>::create_box_nn_processor_int(v_cl, ghost, sub_domains, box_nn_processor, *this);
ie_loc_ghost<dim,T>::create(sub_domains,domain,ghost,bc);
// Ghost box information must be re-offloaded
host_dev_transfer = false;
ie_ghost<dim, T,Memory,layout_base>::reset_host_dev_transfer();
}
......
......@@ -1339,6 +1339,15 @@ public:
return igg;
}
/*! \brief Notify that the next toKernel() data-structures must be re-offloaded
 *
 * Sets the host_dev_transfer flag to false. It takes no arguments and
 * returns nothing; the flag itself is declared outside this block.
 *
 */
void reset_host_dev_transfer()
{
host_dev_transfer = false;
}
};
......
......@@ -1197,5 +1197,6 @@ BOOST_AUTO_TEST_CASE(vector_dist_remove_marked)
vector_dist_remove_marked_type<3>();
}
BOOST_AUTO_TEST_SUITE_END()
......@@ -757,7 +757,169 @@ BOOST_AUTO_TEST_CASE(vector_dist_reduce)
BOOST_REQUIRE_EQUAL(reds2,vd.size_local());
}
/*! \brief Dynamic load balancing test for vector_dist_gpu
 *
 * Processor 0 seeds 50000 particles in the [0,0.3]^3 corner of a periodic unit
 * box. The number of Verlet neighbours of each particle is stored in property 0
 * and the domain is decomposed using ModelSquare computational costs.
 * The particles are then shifted for 25 steps. At every step the test checks
 * that the stored neighbour counts still match a freshly built Verlet list
 * (i.e. properties travel with the particles) and, after redecompose(200),
 * that the load of every processor is close to the local one.
 *
 */
BOOST_AUTO_TEST_CASE(vector_dist_dlb_on_cuda)
{
typedef vector_dist_gpu<3,double,aggregate<double>> vector_type;
Vcluster<> & v_cl = create_vcluster();
// The test is skipped when running on more than 8 processing units
if (v_cl.getProcessingUnits() > 8)
return;
// Unit box, ghost of 0.1 and periodic boundary conditions in all three directions
Box<3,double> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
Ghost<3,double> g(0.1);
size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};
// The vector starts with 0 particles
// NOTE(review): DEC_GRAN(2048) presumably sets the decomposition granularity - confirm
vector_type vd(0,domain,bc,g,DEC_GRAN(2048));
// Only processor 0 initially adds particles, all of them inside the [0,0.3]^3 corner of the domain
if (v_cl.getProcessUnitID() == 0)
{
for(size_t i = 0 ; i < 50000 ; i++)
{
vd.add();
// rand() is never seeded in this test
vd.getLastPos()[0] = ((double)rand())/RAND_MAX * 0.3;
vd.getLastPos()[1] = ((double)rand())/RAND_MAX * 0.3;
vd.getLastPos()[2] = ((double)rand())/RAND_MAX * 0.3;
}
}
// Move positions and property 0 to GPU, then run map() and ghost_get() on device
vd.hostToDevicePos();
vd.template hostToDeviceProp<0>();
vd.map(RUN_ON_DEVICE);
vd.template ghost_get<>(RUN_ON_DEVICE);
// Now move back to CPU: the Verlet list below is built from host data
vd.deviceToHostPos();
vd.template deviceToHostProp<0>();
// Get the neighborhood of each particle (cut-off radius 0.01)
auto VV = vd.getVerlet(0.01);
// Store the number of neighbors of each particle in property 0
auto it = vd.getDomainIterator();
while (it.isNext())
{
auto p = it.get();
vd.template getProp<0>(p) = VV.getNNPart(p.getKey());
++it;
}
// Move the freshly written property 0 to GPU
vd.template hostToDeviceProp<0>();
// Add the computational costs using the ModelSquare model (factor 10), decompose and map on device
ModelSquare md;
md.factor = 10;
vd.addComputationCosts(md);
vd.getDecomposition().decompose();
vd.map(RUN_ON_DEVICE);
// Move positions to CPU before calling addComputationCosts again
vd.deviceToHostPos();
vd.addComputationCosts(md);
// Gather the load of every processor and require each one to be close
// to the local load (tolerance argument 7.0)
openfpm::vector<size_t> loads;
size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
v_cl.allGather(load,loads);
v_cl.execute();
for (size_t i = 0 ; i < loads.size() ; i++)
{
double load_f = load;
double load_fc = loads.get(i);
BOOST_REQUIRE_CLOSE(load_f,load_fc,7.0);
}
// After the decomposition every processor must own at least one particle
BOOST_REQUIRE(vd.size_local() != 0);
// Displacement direction, scaled by 0.09 at each step below
Point<3,double> v({1.0,1.0,1.0});
for (size_t i = 0 ; i < 25 ; i++)
{
// Move positions to CPU and shift every particle by 0.09 along each axis
vd.deviceToHostPos();
auto it = vd.getDomainIterator();
while (it.isNext())
{
auto p = it.get();
vd.getPos(p)[0] += v.get(0) * 0.09;
vd.getPos(p)[1] += v.get(1) * 0.09;
vd.getPos(p)[2] += v.get(2) * 0.09;
++it;
}
// Back to GPU: upload the new positions, then map() and ghost_get() on device
vd.hostToDevicePos();
vd.map(RUN_ON_DEVICE);
vd.template ghost_get<>(RUN_ON_DEVICE);
// Bring positions and property 0 to CPU for the check below
vd.deviceToHostPos();
vd.template deviceToHostProp<0>();
// Rebuild the Verlet list and check that the neighbor count stored in
// property 0 still matches for every particle
auto VV2 = vd.getVerlet(0.01);
auto it2 = vd.getDomainIterator();
bool match = true;
while (it2.isNext())
{
auto p = it2.get();
match &= vd.template getProp<0>(p) == VV2.getNNPart(p.getKey());
if (match == false)
{
// Print the first mismatching pair (stored count, recomputed count) and stop
std::cout << vd.template getProp<0>(p) << " " << VV2.getNNPart(p.getKey()) << std::endl;
break;
}
++it2;
}
BOOST_REQUIRE_EQUAL(match,true);
// NOTE(review): this md shadows the outer md and its factor is not set to 10 here - confirm this is intended
ModelSquare md;
vd.addComputationCosts(md);
vd.getDecomposition().redecompose(200);
vd.map(RUN_ON_DEVICE);
// Every processor must still own at least one particle after rebalancing
BOOST_REQUIRE(vd.size_local() != 0);
// Here ghost_get is called with property 0, unlike the ghost_get<>() calls above
vd.template ghost_get<0>(RUN_ON_DEVICE);
vd.deviceToHostPos();
vd.template deviceToHostProp<0>();
vd.addComputationCosts(md);
// Same load check as before the loop, with tolerance argument 10.0
// (loads and load shadow the variables declared before the loop)
openfpm::vector<size_t> loads;
size_t load = vd.getDecomposition().getDistribution().getProcessorLoad();
v_cl.allGather(load,loads);
v_cl.execute();
for (size_t i = 0 ; i < loads.size() ; i++)
{
double load_f = load;
double load_fc = loads.get(i);
BOOST_REQUIRE_CLOSE(load_f,load_fc,10.0);
}
}
}
BOOST_AUTO_TEST_SUITE_END()
......@@ -1018,7 +1018,7 @@ public:
* \return the Cell list
*
*/
template<typename CellL = CellList<dim, St, Mem_fast<>, shift<dim, St> > > CellL getCellListSym(St r_cut)
template<typename CellL = CellList<dim, St, Mem_fast<>, shift<dim, St>,decltype(v_pos) > > CellL getCellListSym(St r_cut)
{
#ifdef SE_CLASS1
if (!(opt & BIND_DEC_TO_GHOST))
......@@ -1108,7 +1108,7 @@ public:
* \return the Cell list
*
*/
template<typename CellL = CellList_gen<dim, St, Process_keys_lin, Mem_fast<>, shift<dim, St> > >
template<typename CellL = CellList_gen<dim, St, Process_keys_lin, Mem_fast<>, shift<dim, St>, decltype(v_pos) > >
CellL getCellList(St r_cut, bool no_se3 = false)
{
#ifdef SE_CLASS3
......@@ -1471,7 +1471,7 @@ public:
* \return a VerletList object
*
*/
template <typename VerletL = VerletList<dim,St,Mem_fast<>,shift<dim,St> >>
template <typename VerletL = VerletList<dim,St,Mem_fast<>,shift<dim,St>,decltype(v_pos) >>
VerletL getVerlet(St r_cut)
{
#ifdef SE_CLASS3
......@@ -1526,7 +1526,7 @@ public:
VerletList<dim,St,Mem_type,shift<dim,St> > ver_tmp;
ver_tmp = getVerlet<VerletList<dim,St,Mem_type,shift<dim,St> >>(r_cut);
ver.swap(ver);
ver.swap(ver_tmp);
}
}
else if (opt == VL_CRS_SYMMETRIC)
......@@ -2514,7 +2514,7 @@ public:
* \return Particle iterator
*
*/
template<typename cli> ParticleItCRS_Cells<dim,cli> getParticleIteratorCRS_Cell(cli & NN)
template<typename cli> ParticleItCRS_Cells<dim,cli,decltype(v_pos)> getParticleIteratorCRS_Cell(cli & NN)
{
#ifdef SE_CLASS3
se3.getIterator();
......@@ -2540,7 +2540,7 @@ public:
getDecomposition().setNNParameters(shift,gs);
// First we check that
return ParticleItCRS_Cells<dim,cli>(NN,getDecomposition().getCRSDomainCells(),
return ParticleItCRS_Cells<dim,cli,decltype(v_pos)>(NN,getDecomposition().getCRSDomainCells(),
getDecomposition().getCRSAnomDomainCells(),
NN.getNNc_sym());
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment