From ad776bb4df886aad9ada9d613f492cda5d40808f Mon Sep 17 00:00:00 2001
From: Pietro Incardona <incardon@mpi-cbg.de>
Date: Mon, 24 Dec 2018 00:10:15 +0100
Subject: [PATCH] Adding missing files

---
 .../3_molecular_dynamic_gpu_opt/main_gpu.cu   | 476 ++++++++++++++++++
 1 file changed, 476 insertions(+)
 create mode 100644 example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu

diff --git a/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu b/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu
new file mode 100644
index 000000000..2d22c56cc
--- /dev/null
+++ b/example/Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu
@@ -0,0 +1,476 @@
+/*!
+ * \page Vector_3_md_dyn_gpu_opt Vector 3 molecular dynamic on GPU (optimized version)
+ *
+ * [TOC]
+ *
+ * # Molecular Dynamic with Lennard-Jones potential on GPU (Optimized) {#e3_md_gpu_opt}
+ *
+ * \htmlonly
+ * <img src="http://openfpm.mpi-cbg.de/web/images/examples/3_md_gpu/md_4_GPU.png"/>
+ * \endhtmlonly
+ *
+ * In this optimized version of the example \ref e3_md_gpu, we apply two optimizations:
+ *
+ * * We make the access coalesced
+ * * We use half radius cell-list spacing
+ *
+ * ## Coalesced access {#e3_gpu_opt_ca}
+ *
+ * On GPU, to get the maximum performance it is very important to access memory in a coalesced way. Coalesced access means that
+ *  if thread 1 accesses address 0x000, thread 2 (in the same Streaming Multiprocessor) should ideally access 0x004 or, more
+ *  generally, an address in the same cache line. Another factor that contributes to speed is that the threads in
+ *  the same SM should overall work on a limited number of cache lines, so that the L1 cache of each SM can optimally
+ *  speed up the access to global memory.
+ *
+ *  Unfortunately particles can by nature be randomly distributed in space and memory, and reaching the ideal situation
+ * when accessing the neighborhood of each particle can be challenging. Suppose that thread 1 takes particle 0 and thread 2
+ * takes particle 1: if 0 and 1 are far apart in space, the neighborhood of 0 does not overlap the neighborhood of 1. This means
+ * that most probably the accesses to the neighborhood of 0 will be scattered in memory, and the same for 1, giving an extremely
+ * low probability that 2 threads in the SM hit the same cache line and increasing the number of cache lines the SM has to fetch.
+ *  The situation improves if we instead reorder the particles in memory by their spatial position using a cell list, as in the figure.
+ *
+ * \htmlonly
+  <table style="width:100%">
+  <tr>
+    <td><img src="http://openfpm.mpi-cbg.de/web/images/examples/3_md_gpu/vector_sorted.jpg"/></td>
+    <td><img src="http://openfpm.mpi-cbg.de/web/images/examples/3_md_gpu/vector_unsorted.jpg"/></td>
+  </tr>
+  <tr>
+    <td><strong>Fig1: Sorted vector</strong></td>
+    <td><strong>Fig2: Unsorted vector</strong></td>
+  </tr>
+</table>
+  \endhtmlonly
+ *
+ * We can see that now the neighborhood of particle 0 and particle 1 overlap, increasing the chance of a cache hit; additionally, if
+ * all particles processed by one SM stay in one cell or in a few neighboring cells, the number of cache lines that an SM has to read
+ * is reduced, with a significant speed-up.
+ *
+ * In OpenFPM, getting a Cell-list produces a re-ordered version of the original vector by default. It is possible to offload the
+ *  sorted version of vector_dist_gpu instead of the normal one using the function \b toKernel_sorted() \b instead of \b toKernel() \b.
+ *
+ * \snippet Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu calc_force_sorted
+ *
+ * The rest remains mainly the same, with the exception that we now use the macro GET_PARTICLE_SORT. This macro is similar to GET_PARTICLE
+ * but with a substantial difference. While in the normal unsorted vector the particles in the ghost area are always added at the end,
+ * in the sorted one domain + ghost particles are reordered together, with no clear separation between them. This means we need a list of
+ * all the domain particles if we want to iterate across them; GET_PARTICLE_SORT uses such a list to convert a thread index into a domain
+ * particle index. Additionally, when we get a neighborhood iterator from the Cell-list we must use \b get_sort() \b instead of \b get() \b
+ *
+ * \snippet Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu get_sorted_index
+ *
+ * After we launched the kernel all the data are written in the sorted vector. In order to merge back the data to the unsorted one
+ * we have to use the function \b vd.merge_sort<force>(NN) \b. Where vd is the vector_dist_gpu where we want to merge the
+ * data from sorted to non sorted. \b force \b is the property we want to merge and \b NN \b is the Cell-list that produced the
+ * sorted distribution.
+ *
+ * \snippet Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu merge_sort
+ *
+ * \note it is possible to launch multiple kernel on the sorted version, but consider that at some point the data must be merged
+ * back because functions like map and ghost_get work on the unsorted version
+ *
+ * ## Half radius cell-list spacing {#e3_gpu_opt_hr}
+ *
+ * Using Cell-lists with spacing equal to the radius in general requires fetching all the 9 cells in 2D and 27 cells in 3D. The
+ * particles in such cells include particles within radius r as well as others more distant than r. This means that we have to filter
+ * the particles by checking the radius. It is possible to filter the particles further using finer cell-list cells. Suppose that you
+ * use cell-lists with spacing half of the radius: we just have to check the 25 cells in 2D and the 125 cells in 3D. While we have more
+ * cells, the overall volume spanned by the 25/125 cells is just a fraction. In fact the area of the 25 cells is given by
+ *
+ * \f$ (5\frac{h}{2})^2 = \frac{25}{4} h^2 \f$
+ * \f$ (5\frac{h}{2})^3 = \frac{125}{8} h^3 \f$
+ *
+ * while for the normal cell-list is
+ *
+ * \f$ (3h)^2 = 9h^2 \f$
+ * \f$ (3h)^3 = 27h^3 \f$
+ *
+ * This means that the finer cell-list, in order to find the neighborhood particles, scans a smaller region: precisely 69% of
+ * the normal cell-list in 2D, and 58% of the normal cell-list in 3D. In terms of particles this means that the normal cell-list
+ * returns on average 44% more particles in 2D and 73% more in 3D.
+ *
+ * Constructing an half spacing cell-list is standard. In the function \b getCellListGPU \b we specify half radius
+ *
+ * \snippet Vector/3_molecular_dynamic_gpu_opt/main_gpu.cu get_half_cl
+ *
+ * while to use it, instead of the \b getNNIteratorBox \b we use
+ *
+ * \note \b getNNIteratorBox \b has a template parameter (default = 2) that indicates how many layers of neighboring cells the NN
+ *       iterator spans. For example \b getNNIteratorBox<1> \b is the standard 9/27 neighborhood cell-list, \b getNNIteratorBox<2> \b
+ *        is the 25/125 neighborhood, and so on.
+ *
+ */
+
+#ifdef __NVCC__
+
+#include "Vector/vector_dist.hpp"
+#include "Plot/GoogleChart.hpp"
+#include "Plot/util.hpp"
+#include "timer.hpp"
+
+#ifdef TEST_RUN
+size_t nstep = 100;	// shortened run for automated testing
+#else
+size_t nstep = 1000;	// number of MD time steps
+#endif
+
+typedef float real_number;	// single precision throughout (positions, forces, energies)
+
+constexpr int velocity = 0;	// property 0 of the aggregate: particle velocity, real_number[3]
+constexpr int force = 1;	// property 1 of the aggregate: force accumulator, real_number[3]
+constexpr int energy = 2;	// property 2 of the aggregate: per-particle energy, real_number
+
+template<typename vector_dist_type,typename NN_type>
+__global__ void calc_force_gpu(vector_dist_type vd, NN_type NN, real_number sigma12, real_number sigma6, real_number r_cut2)
+{
+	unsigned int p;
+	GET_PARTICLE_SORT(p,NN);	// thread index -> domain particle index in the SORTED vector
+
+	// Get the position xp of the particle p
+	Point<3,real_number> xp = vd.getPos(p);
+
+	// Reset the force property stored on the particle
+	vd.template getProp<force>(p)[0] = 0.0;
+	vd.template getProp<force>(p)[1] = 0.0;
+	vd.template getProp<force>(p)[2] = 0.0;
+
+	// Local accumulator: written back to global memory only once, after the loop
+	Point<3,real_number> force_;
+	force_.get(0) = 0.0;
+	force_.get(1) = 0.0;
+	force_.get(2) = 0.0;
+
+	// Get an iterator over the neighborhood particles of p
+	auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p)));
+
+	// For each neighborhood particle ...
+	while (Np.isNext())
+	{
+		//! \cond [get_sorted_index] \endcond
+
+		// ... get the neighbor index q (an index into the sorted vector)
+		auto q = Np.get_sort();
+
+		//! \cond [get_sorted_index] \endcond
+
+		// if (p == q) skip this particle
+		if (q == p)	{++Np; continue;};
+
+		// Get the position of q
+		Point<3,real_number> xq = vd.getPos(q);
+
+		// Get the distance vector between p and q
+		Point<3,real_number> r = xp - xq;
+
+		// take the SQUARED norm of this vector (norm2 returns |r|^2)
+		real_number rn = norm2(r);
+
+		// cutoff test: both sides are squared distances
+		if (rn > r_cut2)
+		{++Np; continue;};
+
+		// Lennard-Jones force 24*(2*sigma^12/|r|^14 - sigma^6/|r|^8)*r, written with rn = |r|^2 (pow would be slower)
+		Point<3,real_number> f = 24.0*(2.0 *sigma12 / (rn*rn*rn*rn*rn*rn*rn) -  sigma6 / (rn*rn*rn*rn)) * r;
+		force_ += f;
+
+		// Next neighborhood
+		++Np;
+	}
+
+	// write the accumulated force that all neighbors q exert on p
+	vd.template getProp<force>(p)[0] = force_.get(0);
+	vd.template getProp<force>(p)[1] = force_.get(1);
+	vd.template getProp<force>(p)[2] = force_.get(2);
+}
+
+template<typename vector_dist_type>
+__global__ void update_velocity_position(vector_dist_type vd, real_number dt)
+{
+	auto p = GET_PARTICLE(vd);	// thread index -> particle index in the (unsorted) vector
+
+	// first velocity half step: v(tn + 0.5) = v(tn) + 0.5*dt*a(tn)   (force used as acceleration, unit mass assumed)
+	vd.template getProp<velocity>(p)[0] += 0.5*dt*vd.template getProp<force>(p)[0];
+	vd.template getProp<velocity>(p)[1] += 0.5*dt*vd.template getProp<force>(p)[1];
+	vd.template getProp<velocity>(p)[2] += 0.5*dt*vd.template getProp<force>(p)[2];
+
+	// position update: x(tn + 1) = x(tn) + v(tn + 0.5)*dt
+	vd.getPos(p)[0] += vd.template getProp<velocity>(p)[0]*dt;
+	vd.getPos(p)[1] += vd.template getProp<velocity>(p)[1]*dt;
+	vd.getPos(p)[2] += vd.template getProp<velocity>(p)[2]*dt;
+}
+
+template<typename vector_dist_type>
+__global__ void update_velocity(vector_dist_type vd, real_number dt)
+{
+	auto p = GET_PARTICLE(vd);	// thread index -> particle index in the (unsorted) vector
+
+	// second velocity half step: v(tn + 1) = v(tn + 0.5) + 0.5*dt*a(tn + 1), using the freshly recomputed forces
+	vd.template getProp<velocity>(p)[0] += 0.5*dt*vd.template getProp<force>(p)[0];
+	vd.template getProp<velocity>(p)[1] += 0.5*dt*vd.template getProp<force>(p)[1];
+	vd.template getProp<velocity>(p)[2] += 0.5*dt*vd.template getProp<force>(p)[2];
+}
+
+template<typename vector_dist_type,typename NN_type>
+__global__ void particle_energy(vector_dist_type vd, NN_type NN, real_number sigma12, real_number sigma6, real_number shift, real_number r_cut2)
+{
+	unsigned int p;
+	GET_PARTICLE_SORT(p,NN);	// thread index -> domain particle index in the SORTED vector
+
+	// Get the position of the particle p
+	Point<3,real_number> xp = vd.getPos(p);
+
+	// Get an iterator over the neighborhood of the particle p
+	auto Np = NN.getNNIteratorBox(NN.getCell(vd.getPos(p)));
+
+	real_number E = 0;	// accumulated pair-potential energy of p
+
+	// For each neighborhood particle of p
+	while (Np.isNext())
+	{
+		// Neighborhood particle q (index into the sorted vector)
+		auto q = Np.get_sort();
+
+		// if p == q skip this particle
+		if (q == p)	{++Np; continue;};
+
+		// Get position of the particle q
+		Point<3,real_number> xq = vd.getPos(q);
+
+		// squared distance |xp - xq|^2 (norm2 returns the squared norm)
+		real_number rn = norm2(xp - xq);
+
+		// cutoff test on squared distances
+		if (rn > r_cut2)
+		{++Np;continue;}
+
+		// pair potential 2*(sigma^12/|r|^12 - sigma^6/|r|^6) - shift, with rn = |r|^2; each pair is visited from both
+		// sides, so the pairwise total carries the usual factor 4 (pow would be slower)
+		E += 2.0 * ( sigma12 / (rn*rn*rn*rn*rn*rn) - sigma6 / ( rn*rn*rn) ) - shift;
+
+		// Next neighborhood
+		++Np;
+	}
+
+	// store total per-particle energy: pair potential E plus kinetic energy |v|^2 / 2 (unit mass assumed)
+	vd.template getProp<energy>(p) = E + (vd.template getProp<velocity>(p)[0]*vd.template getProp<velocity>(p)[0] +
+			vd.template getProp<velocity>(p)[1]*vd.template getProp<velocity>(p)[1] +
+			vd.template getProp<velocity>(p)[2]*vd.template getProp<velocity>(p)[2]) / 2;
+}
+
+template<typename CellList> void calc_forces(vector_dist_gpu<3,real_number, aggregate<real_number[3],real_number[3],real_number> > & vd, CellList & NN, real_number sigma12, real_number sigma6, real_number r_cut2)
+{
+	vd.updateCellList(NN);	// rebuild the cell-list (this also regenerates the sorted particle ordering)
+
+	// Get an iterator over particles; it provides the kernel launch configuration (wthr/thr)
+	auto it2 = vd.getDomainIteratorGPU();
+
+	//! \cond [calc_force_sorted] \endcond
+
+	calc_force_gpu<<<it2.wthr,it2.thr>>>(vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,r_cut2);
+
+	//! \cond [calc_force_sorted] \endcond
+
+	//! \cond [merge_sort] \endcond
+
+	vd.merge_sort<force>(NN);	// copy the force property from the sorted vector back to the unsorted one
+
+	//! \cond [merge_sort] \endcond
+}
+
+template<typename CellList> real_number calc_energy(vector_dist_gpu<3,real_number, aggregate<real_number[3],real_number[3],real_number> > & vd, CellList & NN, real_number sigma12, real_number sigma6, real_number r_cut2)
+{
+	real_number rc = r_cut2;	// NOTE: rc is the SQUARED cutoff, matching the rn = |r|^2 form used in the kernels
+	real_number shift = 2.0 * ( sigma12 / (rc*rc*rc*rc*rc*rc) - sigma6 / ( rc*rc*rc) );	// potential at the cutoff, subtracted so the energy is continuous at r_cut
+
+	vd.updateCellList(NN);	// rebuild the cell-list (also regenerates the sorted ordering)
+
+	auto it2 = vd.getDomainIteratorGPU();	// provides the kernel launch configuration (wthr/thr)
+
+	particle_energy<<<it2.wthr,it2.thr>>>(vd.toKernel_sorted(),NN.toKernel(),sigma12,sigma6,shift,r_cut2);
+
+	vd.merge_sort<energy>(NN);	// copy the energy property from the sorted vector back to the unsorted one
+
+	// sum the per-particle energy over all particles local to this processor
+	return reduce_local<energy,_add_>(vd);
+}
+
+int main(int argc, char* argv[])
+{
+	openfpm_init(&argc,&argv);
+
+	real_number sigma = 0.01;
+	real_number r_cut =3.0*sigma;	// interaction cutoff radius
+
+	// we will use it to place particles on a 100x100x100 grid-like arrangement
+	size_t sz[3] = {100,100,100};
+
+	// domain: the unit cube
+	Box<3,float> box({0.0,0.0,0.0},{1.0,1.0,1.0});
+
+	// Boundary conditions
+	size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
+
+	// ghost, big enough to contain the interaction radius
+	Ghost<3,float> ghost(r_cut);
+
+	real_number dt = 0.00005;	// integration time step
+	real_number sigma12 = pow(sigma,12);	// precomputed powers used by the force/energy kernels
+	real_number sigma6 = pow(sigma,6);
+
+	openfpm::vector<real_number> x;	// sampled time steps (X axis of the energy plot)
+	openfpm::vector<openfpm::vector<real_number>> y;	// sampled total energy (Y axis of the energy plot)
+
+	vector_dist_gpu<3,real_number, aggregate<real_number[3],real_number[3],real_number> > vd(0,box,bc,ghost);
+
+	// We create the grid iterator
+	auto it = vd.getGridIterator(sz);
+
+	while (it.isNext())
+	{
+		// Create a new particle
+		vd.add();
+
+		// key contains the (i,j,k) index of the grid
+		auto key = it.get();
+
+		// The index of the grid can be accessed with key.get(0) == i, key.get(1) == j ...
+		// We use getLastPos to set the position of the last particle added
+		vd.getLastPos()[0] = key.get(0) * it.getSpacing(0);
+		vd.getLastPos()[1] = key.get(1) * it.getSpacing(1);
+		vd.getLastPos()[2] = key.get(2) * it.getSpacing(2);
+
+		// We use getLastProp to set the property value of the last particle we added
+		vd.template getLastProp<velocity>()[0] = 0.0;
+		vd.template getLastProp<velocity>()[1] = 0.0;
+		vd.template getLastProp<velocity>()[2] = 0.0;
+
+		vd.template getLastProp<force>()[0] = 0.0;
+		vd.template getLastProp<force>()[1] = 0.0;
+		vd.template getLastProp<force>()[2] = 0.0;
+
+		++it;
+	}
+
+	vd.hostToDevicePos();	// copy the initial positions to the GPU
+	vd.hostToDeviceProp<velocity,force>();	// copy the initial velocity and force properties to the GPU
+
+	vd.map(RUN_ON_DEVICE);	// redistribute the particles across processors (on the device)
+	vd.ghost_get<>(RUN_ON_DEVICE);	// synchronize the ghost layer (on the device)
+
+	timer tsim;
+	tsim.start();
+
+	//! \cond [md steps] \endcond
+
+	//! \cond [get_half_cl] \endcond
+
+	// Get the Cell list structure, using HALF the interaction radius as cell spacing
+	auto NN = vd.getCellListGPU(r_cut / 2.0);
+
+	//! \cond [get_half_cl] \endcond
+
+	// The standard full-radius cell-list would instead be:
+	// auto NN = vd.getCellList(r_cut);
+
+	// calculate the initial forces a(t0), Step 1
+	calc_forces(vd,NN,sigma12,sigma6,r_cut*r_cut);
+	unsigned long int f = 0;	// frame counter for the particle output files
+
+	// MD time stepping
+	for (size_t i = 0; i < nstep ; i++)
+	{
+		// Get the iterator
+		auto it3 = vd.getDomainIteratorGPU();
+
+		update_velocity_position<<<it3.wthr,it3.thr>>>(vd.toKernel(),dt);
+
+		// Because we moved the particles in space we have to map them and re-sync the ghost
+		vd.map(RUN_ON_DEVICE);
+		vd.template ghost_get<>(RUN_ON_DEVICE);
+
+		// calculate forces or a(tn + 1) Step 2
+		calc_forces(vd,NN,sigma12,sigma6,r_cut*r_cut);
+
+		// Integrate the velocity Step 3
+		auto it4 = vd.getDomainIteratorGPU();
+
+		update_velocity<<<it4.wthr,it4.thr>>>(vd.toKernel(),dt);
+
+		// Every 1000 iterations collect statistics (NOTE(review): with nstep <= 1000 this fires only at i == 0 — confirm intended period)
+		if (i % 1000 == 0)
+		{
+			vd.deviceToHostPos();	// bring positions back to the host for writing
+			vd.deviceToHostProp<0,1,2>();	// bring all three properties back to the host
+
+			// We write the particle position for visualization (Without ghost)
+			vd.deleteGhost();
+			vd.write_frame("particles_",f);
+
+			// we resync the ghost
+			vd.ghost_get<>(RUN_ON_DEVICE);
+
+			// We calculate the energy
+			real_number energy = calc_energy(vd,NN,sigma12,sigma6,r_cut*r_cut);
+			auto & vcl = create_vcluster();
+			vcl.sum(energy);	// reduce the per-processor energies over all processors
+			vcl.execute();
+
+			// we save the energy calculated at time step i; x contains the time-step, y contains the energy
+			x.add(i);
+			y.add({energy});
+
+			// We also print on terminal the value of the energy;
+			// only one processor (master) writes on terminal
+			if (vcl.getProcessUnitID() == 0)
+				std::cout << "Energy: " << energy << std::endl;
+
+			f++;
+		}
+	}
+
+	tsim.stop();
+	std::cout << "Time: " << tsim.getwct() << std::endl;
+
+	// Google charts options: stores the settings used to draw the X Y graph
+	GCoptions options;
+
+	// Title of the graph
+	options.title = std::string("Energy with time");
+
+	// Y axis name
+	options.yAxis = std::string("Energy");
+
+	// X axis name
+	options.xAxis = std::string("iteration");
+
+	// width of the line
+	options.lineWidth = 1.0;
+
+	// Resolution in x
+	options.width = 1280;
+
+	// Resolution in y ("heigh" is the field name as declared by GCoptions)
+	options.heigh = 720;
+
+	// Add zoom capability
+	options.more = GC_ZOOM;
+
+	// Object that draws the X Y graph
+	GoogleChart cg;
+
+	// Add the graph;
+	// the produced graph is in svg format and can be opened in a browser
+	cg.AddLinesGraph(x,y,options);
+
+	// Write into html format
+	cg.write("gc_plot2_out.html");
+
+	openfpm_finalize();
+}
+
+#else
+
+int main(int argc, char* argv[])
+{
+        return 0;	// built without CUDA (__NVCC__ not defined): nothing to run
+}
+
+#endif
+
+
-- 
GitLab