Commit 61e2d5e0 authored by incardon's avatar incardon

Fixing Allocations Free of memory

parent e9155785
Pipeline #462 failed with stages
in 4 seconds
# Change Log
All notable changes to this project will be documented in this file.
## [2.0.0] December 2018
### Added
- Adding GPU support (see example 3_molecular_dynamic_gpu)
### Changed
- The type Vcluster now is templated and the standard Vcluster is Vcluster<>
## [1.1.0] February 2018
### Added
......
......@@ -388,6 +388,9 @@ int main(int argc, char* argv[])
++it;
}
vd.map();
vd.ghost_get<>();
//! \cond [vect grid] \endcond
/*!
......
......@@ -53,7 +53,7 @@ int main(int argc, char* argv[])
openfpm_init(&argc,&argv);
Vcluster & vcl = create_vcluster();
Vcluster<> & vcl = create_vcluster();
size_t sz[3] = {10,10,10};
Box<3,float> box({0.0,0.0,0.0},{1.0,1.0,1.0});
......
......@@ -272,7 +272,7 @@ int main(int argc, char* argv[])
openfpm::vector<openfpm::vector<double>> y;
openfpm_init(&argc,&argv);
Vcluster & v_cl = create_vcluster();
Vcluster<> & v_cl = create_vcluster();
// we will use it do place particles on a 10x10x10 Grid like
size_t sz[3] = {10,10,10};
......
# Makefile for the GPU molecular-dynamics example, compiled through the
# Score-P measurement wrapper so MPI + CUDA activity can be profiled.
include ../../example.mk
# Wrap nvcc in Score-P: CUDA and MPI adapters enabled, compiler-level
# instrumentation disabled (--nocompiler).
CC_SCOREP=scorep --nocompiler --cuda --mpp=mpi nvcc
CC=${CC_SCOREP}
# Extra library directory (currently unused).
LDIR =
# Object files that make up the executable.
OBJ = main.o
all: md_dyn
# Pattern rule: compile each CUDA translation unit.
# NOTE(review): the MPI include path is hard-coded to a developer machine
# (/home/i-bird/MPI) -- confirm it matches the build host.
%.o: %.cu
$(CC) -O3 -g -c -isystem=/home/i-bird/MPI/include --std=c++11 -o $@ $< $(INCLUDE_PATH_NVCC)
# Link step: CUDA runtime, MPI, CUPTI (required by Score-P's CUDA
# tracing) and HDF5. Library paths are hard-coded like the include above.
md_dyn: $(OBJ)
$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH_NVCC) $(LIBS) -L/home/i-bird/MPI/lib -L/usr/local/cuda/lib64 -lcudart -lmpi -L/usr/local/cuda/extras/CUPTI/lib64 -lhdf5
# Run all three example variants on 3 MPI ranks.
# NOTE(review): md_dyn_expr and md_dyn_vl are cleaned below but no rule
# here builds them -- presumably built elsewhere; verify before 'run'.
run: all
mpirun -np 3 ./md_dyn && mpirun -np 3 ./md_dyn_expr && mpirun -np 3 ./md_dyn_vl;
.PHONY: clean all run
clean:
rm -f *.o *~ core md_dyn md_dyn_expr md_dyn_vl
This diff is collapsed.
openfpm_data @ e7f888c1
Subproject commit e7010bdc474736c4211947ad8f08dd29d6e4e59f
Subproject commit e7f888c11045bfad0f927a86f0fc9c5f3efa8053
openfpm_devices @ 2f254618
Subproject commit e5d5d31a0af09312118f72c0818a824443ba80fb
Subproject commit 2f2546180dc070a0db59b359250defcc2862f96c
......@@ -8,14 +8,24 @@
#ifndef VECTOR_DIST_COMM_UTIL_FUNCS_HPP_
#define VECTOR_DIST_COMM_UTIL_FUNCS_HPP_
template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition, bool is_ok_cuda>
template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition, typename scan_type, bool is_ok_cuda>
struct labelParticlesGhost_impl
{
static void run(Decomposition & dec,
openfpm::vector<aggregate<unsigned int,unsigned long int>,
CudaMemory,
typename memory_traits_inte<aggregate<unsigned int,unsigned long int>>::type,
memory_traits_inte> & g_opart_device,
static void run(CudaMemory & mem,
scan_type & sc,
Decomposition & dec,
openfpm::vector<aggregate<unsigned int,unsigned long int>,
CudaMemory,
typename memory_traits_inte<aggregate<unsigned int,unsigned long int>>::type,
memory_traits_inte> & g_opart_device,
openfpm::vector<aggregate<unsigned int>,
Memory,
typename layout_base<aggregate<unsigned int>>::type,
layout_base> & proc_id_out,
openfpm::vector<aggregate<unsigned int>,
Memory,
typename layout_base<aggregate<unsigned int>>::type,
layout_base> & starts,
Vcluster<Memory> & v_cl,
openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
......@@ -31,14 +41,24 @@ struct labelParticlesGhost_impl
template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition>
struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,true>
template<unsigned int dim, typename St, typename prop, typename Memory, template<typename> class layout_base, typename Decomposition, typename scan_type>
struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,scan_type,true>
{
static void run(Decomposition & dec,
static void run(CudaMemory & mem,
scan_type & sc,
Decomposition & dec,
openfpm::vector<aggregate<unsigned int,unsigned long int>,
CudaMemory,
typename memory_traits_inte<aggregate<unsigned int,unsigned long int>>::type,
memory_traits_inte> & g_opart_device,
openfpm::vector<aggregate<unsigned int>,
Memory,
typename layout_base<aggregate<unsigned int>>::type,
layout_base> & proc_id_out,
openfpm::vector<aggregate<unsigned int>,
Memory,
typename layout_base<aggregate<unsigned int>>::type,
layout_base> & starts,
Vcluster<Memory> & v_cl,
openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
......@@ -50,10 +70,6 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru
{
#if defined(CUDA_GPU) && defined(__NVCC__)
openfpm::vector<aggregate<unsigned int>,
Memory,
typename layout_base<aggregate<unsigned int>>::type,
layout_base> proc_id_out;
proc_id_out.resize(v_pos.size()+1);
proc_id_out.template get<0>(proc_id_out.size()-1) = 0;
......@@ -62,17 +78,12 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru
auto ite = v_pos.getGPUIterator();
// First we have to see how many entry each particle produce
num_proc_ghost_each_part<3,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>
num_proc_ghost_each_part<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(proc_id_out.toKernel())>
<<<ite.wthr,ite.thr>>>
(dec.toKernel(),v_pos.toKernel(),proc_id_out.toKernel());
openfpm::vector<aggregate<unsigned int>,
Memory,
typename layout_base<aggregate<unsigned int>>::type,
layout_base> starts;
// scan
scan<unsigned int,unsigned int>(proc_id_out,starts);
sc.scan_(proc_id_out,starts);
starts.resize(proc_id_out.size());
starts.template deviceToHost<0>(starts.size()-1,starts.size()-1);
size_t sz = starts.template get<0>(starts.size()-1);
......@@ -84,14 +95,13 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru
ite = v_pos.getGPUIterator();
// we compute processor id for each particle
proc_label_id_ghost<3,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(starts.toKernel()),decltype(g_opart_device.toKernel())>
proc_label_id_ghost<dim,St,decltype(dec.toKernel()),decltype(v_pos.toKernel()),decltype(starts.toKernel()),decltype(g_opart_device.toKernel())>
<<<ite.wthr,ite.thr>>>
(dec.toKernel(),v_pos.toKernel(),starts.toKernel(),g_opart_device.toKernel());
// sort particles
mergesort((int *)g_opart_device.template getDeviceBuffer<0>(),(long unsigned int *)g_opart_device.template getDeviceBuffer<1>(), g_opart_device.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());
CudaMemory mem;
mem.allocate(sizeof(int));
mem.fill(0);
prc_offset.resize(v_cl.size());
......@@ -179,6 +189,7 @@ struct local_ghost_from_dec_impl
openfpm::vector<Box<dim, St>,Memory,typename layout_base<Box<dim,St>>::type,layout_base> & box_f_dev,
openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & box_f_sv,
Vcluster<Memory> & v_cl,
openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & starts,
openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
size_t & g_m,
......@@ -197,6 +208,7 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
openfpm::vector<Box<dim, St>,Memory,typename layout_base<Box<dim,St>>::type,layout_base> & box_f_dev,
openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & box_f_sv,
Vcluster<Memory> & v_cl,
openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> & starts,
openfpm::vector<Point<dim, St>,Memory,typename layout_base<Point<dim,St>>::type,layout_base> & v_pos,
openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp,
size_t & g_m,
......@@ -216,7 +228,7 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
<<<ite.wthr,ite.thr>>>
(box_f_dev.toKernel(),v_pos.toKernel(),o_part_loc.toKernel());
openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> starts;
// openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> starts;
starts.resize(o_part_loc.size());
mgpu::scan((unsigned int *)o_part_loc.template getDeviceBuffer<0>(), o_part_loc.size(), (unsigned int *)starts.template getDeviceBuffer<0>() , v_cl.getmgpuContext());
......
......@@ -600,8 +600,10 @@ BOOST_AUTO_TEST_CASE( decomposition_ie_ghost_gpu_test_use )
starts.resize(proc_id_out.size());
scan<unsigned int,unsigned int> sc;
// scan
scan<unsigned int,unsigned int>(proc_id_out,starts);
sc.scan_(proc_id_out,starts);
starts.deviceToHost<0>(starts.size()-1,starts.size()-1);
size_t sz = starts.template get<0>(starts.size()-1);
......
......@@ -57,8 +57,8 @@ __global__ void calculate_force(vector_dist_ker<3, float, aggregate<float, floa
while (it.isNext())
{
auto q1 = it.get();
auto q2 = it.get_orig();
auto q1 = it.get_sort();
auto q2 = it.get();
if (q2 == p) {++it; continue;}
......@@ -104,7 +104,7 @@ __global__ void calculate_force_full_sort(vector_dist_ker<3, float, aggregate<f
while (it.isNext())
{
auto q1 = it.get();
auto q1 = it.get_sort();
if (q1 == p) {++it; continue;}
......@@ -364,14 +364,15 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[1],vd.getPos(p)[0] + vd.getPos(p)[2],0.01);
BOOST_REQUIRE_CLOSE(vd.template getProp<1>(p)[2],vd.getPos(p)[1] + vd.getPos(p)[2],0.01);
//std::cout << "PROP 0 " << vd.template getProp<0>(p) << " " << vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2] << std::endl;
++it4;
}
// here we do a ghost_get
vd.ghost_get<0>();
// Doube ghost get to check crashes
vd.ghost_get<0>();
// we re-offload what we received
vd.hostToDevicePos();
vd.template hostToDeviceProp<0>();
......@@ -419,7 +420,7 @@ void vdist_calc_gpu_test()
// Boundary conditions
size_t bc[3]={PERIODIC,PERIODIC,PERIODIC};
vector_dist_gpu<3,St,aggregate<float,float[3],float[3]>> vd(1000,domain,bc,g);
vector_dist_gpu<3,St,aggregate<St,St[3],St[3]>> vd(1000,domain,bc,g);
auto it = vd.getDomainIterator();
......@@ -540,7 +541,7 @@ void vdist_calc_gpu_test()
// To test we copy on a cpu distributed vector and we do a map
vector_dist<3,St,aggregate<float,float[3],float[3]>> vd_cpu(vd.getDecomposition().template duplicate_convert<HeapMemory,memory_traits_lin>(),0);
vector_dist<3,St,aggregate<St,St[3],St[3]>> vd_cpu(vd.getDecomposition().template duplicate_convert<HeapMemory,memory_traits_lin>(),0);
auto itc = vd.getDomainIterator();
......@@ -581,9 +582,10 @@ void vdist_calc_gpu_test()
{
Point<3,St> xp;
float prp0;
float prp1[3];
float prp2[3];
St prp0;
St prp1[3];
St prp2[3];
bool operator<(const part & tmp) const
{
......
......@@ -11,10 +11,10 @@
#if defined(CUDA_GPU) && defined(__NVCC__)
#include "util/cuda/moderngpu/kernel_mergesort.hxx"
#include "Vector/cuda/vector_dist_cuda_funcs.cuh"
#include "util/cuda/scan_cuda.cuh"
#include "util/cuda/moderngpu/kernel_scan.hxx"
#endif
#include "util/cuda/scan_cuda.cuh"
#include "Vector/util/vector_dist_funcs.hpp"
#include "cuda/vector_dist_comm_util_funcs.cuh"
......@@ -88,6 +88,9 @@ class vector_dist_comm
//! It map the processor id with the communication request into map procedure
openfpm::vector<size_t> p_map_req;
//! scan functionality required for gpu
scan<unsigned int,unsigned int> sc;
//! For each near processor, outgoing particle id
//! \warning opart is assumed to be an ordered list
//! first id particle id
......@@ -144,16 +147,31 @@ class vector_dist_comm
//! The same as recv_sz_get but for map
openfpm::vector<size_t> recv_sz_map;
//! temporary buffer to processors ids
openfpm::vector<aggregate<unsigned int>,
Memory,
typename layout_base<aggregate<unsigned int>>::type,
layout_base> proc_id_out;
//! temporary buffer for the scan result
openfpm::vector<aggregate<unsigned int>,
Memory,
typename layout_base<aggregate<unsigned int>>::type,
layout_base> starts;
//! Processor communication size
openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,typename layout_base<aggregate<unsigned int, unsigned int>>::type,layout_base> prc_offset;
//! Temporary CudaMemory to do stuff
CudaMemory mem;
//! Local ghost marker (across the ghost particles it mark from where we have the)
//! replicated ghost particles that are local
size_t lg_m;
//! Sending buffer
openfpm::vector<Memory> hsmem;
//! Receiving buffer
openfpm::vector<HeapMemory> hrmem;
openfpm::vector_fr<Memory> hsmem;
//! process the particle with properties
template<typename prp_object, int ... prp>
......@@ -234,6 +252,9 @@ class vector_dist_comm
//! Id of the local particle to replicate for ghost_get
openfpm::vector<aggregate<unsigned int,unsigned int>,Memory,typename layout_base<aggregate<unsigned int,unsigned int>>::type,layout_base> o_part_loc;
//! Processor communication size
openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,typename layout_base<aggregate<unsigned int, unsigned int>>::type,layout_base> prc_sz;
/*! \brief For every internal ghost box we create a structure that order such internal local ghost box in
* shift vectors
*
......@@ -369,7 +390,7 @@ class vector_dist_comm
if (opt & RUN_ON_DEVICE)
{
local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,std::is_same<Memory,CudaMemory>::value>
::run(o_part_loc,shifts,box_f_dev,box_f_sv,v_cl,v_pos,v_prp,g_m,opt);
::run(o_part_loc,shifts,box_f_dev,box_f_sv,v_cl,starts,v_pos,v_prp,g_m,opt);
}
else
{
......@@ -623,7 +644,7 @@ class vector_dist_comm
*
*
*/
void resize_retained_buffer(openfpm::vector<Memory> & rt_buf, size_t nbf)
void resize_retained_buffer(openfpm::vector_fr<Memory> & rt_buf, size_t nbf)
{
// Release all the buffer that are going to be deleted
for (size_t i = nbf ; i < rt_buf.size() ; i++)
......@@ -645,12 +666,12 @@ class vector_dist_comm
size_t i;
openfpm::vector<Memory> & hsmem;
openfpm::vector_fr<Memory> & hsmem;
size_t j;
set_mem_retained_buffers_inte(openfpm::vector<send_vector> & g_send_prp, size_t i ,
openfpm::vector<Memory> & hsmem, size_t j)
openfpm::vector_fr<Memory> & hsmem, size_t j)
:g_send_prp(g_send_prp),i(i),hsmem(hsmem),j(j)
{}
......@@ -670,7 +691,7 @@ class vector_dist_comm
static inline size_t set_mem_retained_buffers_(openfpm::vector<send_vector> & g_send_prp,
openfpm::vector<size_t> & prc_sz,
size_t i,
openfpm::vector<Memory> & hsmem,
openfpm::vector_fr<Memory> & hsmem,
size_t j)
{
// Set the memory for retain the send buffer
......@@ -689,17 +710,19 @@ class vector_dist_comm
static inline size_t set_mem_retained_buffers_(openfpm::vector<send_vector> & g_send_prp,
openfpm::vector<size_t> & prc_sz,
size_t i,
openfpm::vector<Memory> & hsmem,
openfpm::vector_fr<Memory> & hsmem,
size_t j)
{
set_mem_retained_buffers_inte<send_vector,v_mpl> smrbi(g_send_prp,i,hsmem,j);
boost::mpl::for_each_ref<boost::mpl::range_c<int,0,boost::mpl::size<v_mpl>::type::value>>(smrbi);
g_send_prp.get(i).resize(prc_sz.get(i));
// resize the sending vector (No allocation is produced)
g_send_prp.get(i).resize(prc_sz.get(i));
// if we do not send properties do not reallocate
if (boost::mpl::size<v_mpl>::type::value != 0)
{
// resize the sending vector (No allocation is produced)
g_send_prp.get(i).resize(prc_sz.get(i));
}
return smrbi.j;
}
......@@ -752,17 +775,20 @@ class vector_dist_comm
size_t offset = 0;
// Fill the sending buffers
for (size_t i = 0 ; i < g_send_prp.size() ; i++)
if (sizeof...(prp) != 0)
{
auto ite = g_send_prp.get(i).getGPUIterator();
// Fill the sending buffers
for (size_t i = 0 ; i < g_send_prp.size() ; i++)
{
auto ite = g_send_prp.get(i).getGPUIterator();
process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>
<<<ite.wthr,ite.thr>>>
(g_opart_device.toKernel(), g_send_prp.get(i).toKernel(),
v_prp.toKernel(),offset);
process_ghost_particles_prp<decltype(g_opart_device.toKernel()),decltype(g_send_prp.get(i).toKernel()),decltype(v_prp.toKernel()),prp...>
<<<ite.wthr,ite.thr>>>
(g_opart_device.toKernel(), g_send_prp.get(i).toKernel(),
v_prp.toKernel(),offset);
offset += prc_sz.get(i);
offset += prc_sz.get(i);
}
}
#else
......@@ -966,7 +992,6 @@ class vector_dist_comm
// sort particles
mergesort((int *)lbl_p.template getDeviceBuffer<1>(),(int *)lbl_p.template getDeviceBuffer<0>(), lbl_p.size(), mgpu::template less_t<int>(), v_cl.getmgpuContext());
CudaMemory mem;
mem.allocate(sizeof(int));
mem.fill(0);
......@@ -1079,8 +1104,8 @@ class vector_dist_comm
if (opt & RUN_ON_DEVICE)
{
labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,
Decomposition,std::is_same<Memory,CudaMemory>::value>
::run(dec,g_opart_device,v_cl,v_pos,v_prp,prc,prc_sz,prc_offset,g_m,opt);
Decomposition,scan<unsigned int,unsigned int>,std::is_same<Memory,CudaMemory>::value>
::run(mem,sc,dec,g_opart_device,proc_id_out,starts,v_cl,v_pos,v_prp,prc,prc_sz,prc_offset,g_m,opt);
}
else
{
......@@ -1296,9 +1321,6 @@ public:
// send vector for each processor
typedef openfpm::vector<prp_object,Memory,typename layout_base<prp_object>::type,layout_base,openfpm::grow_policy_identity> send_vector;
// Processor communication size
openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,typename layout_base<aggregate<unsigned int, unsigned int>>::type,layout_base> prc_offset;
// elements to send for each processors
openfpm::vector<size_t> prc_sz;
......@@ -1480,8 +1502,7 @@ public:
openfpm::vector<prop,Memory,typename layout_base<prop>::type,layout_base> & v_prp, size_t & g_m,
size_t opt)
{
// Processor communication size
openfpm::vector<aggregate<unsigned int, unsigned int>,Memory,typename layout_base<aggregate<unsigned int, unsigned int>>::type,layout_base> prc_sz(v_cl.getProcessingUnits());
prc_sz.resize(v_cl.getProcessingUnits());
// map completely reset the ghost part
v_pos.resize(g_m);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment