Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
O
openfpm_pdata
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Locked Files
Issues
1
Issues
1
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Requirements
Requirements
List
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Security & Compliance
Security & Compliance
Dependency List
License Compliance
Operations
Operations
Environments
Analytics
Analytics
CI / CD
Code Review
Insights
Issue
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
openfpm
openfpm_pdata
Commits
61e2d5e0
Commit
61e2d5e0
authored
Sep 24, 2018
by
incardon
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fixing Allocations Free of memory
parent
e9155785
Pipeline
#462
failed with stages
in 4 seconds
Changes
13
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
784 additions
and
70 deletions
+784
-70
CHANGELOG.md
CHANGELOG.md
+10
-0
example/Vector/3_molecular_dynamic/main.cpp
example/Vector/3_molecular_dynamic/main.cpp
+3
-0
example/Vector/3_molecular_dynamic/main_expr.cpp
example/Vector/3_molecular_dynamic/main_expr.cpp
+1
-1
example/Vector/3_molecular_dynamic/main_vl.cpp
example/Vector/3_molecular_dynamic/main_vl.cpp
+1
-1
example/Vector/3_molecular_dynamic_gpu_opt/Makefile
example/Vector/3_molecular_dynamic_gpu_opt/Makefile
+26
-0
example/Vector/3_molecular_dynamic_gpu_opt/config.cfg
example/Vector/3_molecular_dynamic_gpu_opt/config.cfg
+2
-0
example/Vector/3_molecular_dynamic_gpu_opt/main.cu
example/Vector/3_molecular_dynamic_gpu_opt/main.cu
+636
-0
openfpm_data
openfpm_data
+1
-1
openfpm_devices
openfpm_devices
+1
-1
src/Vector/cuda/vector_dist_comm_util_funcs.cuh
src/Vector/cuda/vector_dist_comm_util_funcs.cuh
+35
-23
src/Vector/cuda/vector_dist_cuda_func_test.cu
src/Vector/cuda/vector_dist_cuda_func_test.cu
+3
-1
src/Vector/cuda/vector_dist_gpu_unit_tests.cu
src/Vector/cuda/vector_dist_gpu_unit_tests.cu
+12
-10
src/Vector/vector_dist_comm.hpp
src/Vector/vector_dist_comm.hpp
+53
-32
No files found.
CHANGELOG.md
View file @
61e2d5e0
# Change Log
All notable changes to this project will be documented in this file.
## [2.0.0] December 2018
### Added
-
Adding GPU support (see example 3_molecular_dynamic_gpu)
### Changed
-
The type Vcluster now is templated and the standard Vcluster is Vcluster
<>
## [1.1.0] February 2018
### Added
...
...
example/Vector/3_molecular_dynamic/main.cpp
View file @
61e2d5e0
...
...
@@ -388,6 +388,9 @@ int main(int argc, char* argv[])
++
it
;
}
vd
.
map
();
vd
.
ghost_get
<>
();
//! \cond [vect grid] \endcond
/*!
...
...
example/Vector/3_molecular_dynamic/main_expr.cpp
View file @
61e2d5e0
...
...
@@ -53,7 +53,7 @@ int main(int argc, char* argv[])
openfpm_init
(
&
argc
,
&
argv
);
Vcluster
&
vcl
=
create_vcluster
();
Vcluster
<>
&
vcl
=
create_vcluster
();
size_t
sz
[
3
]
=
{
10
,
10
,
10
};
Box
<
3
,
float
>
box
({
0.0
,
0.0
,
0.0
},{
1.0
,
1.0
,
1.0
});
...
...
example/Vector/3_molecular_dynamic/main_vl.cpp
View file @
61e2d5e0
...
...
@@ -272,7 +272,7 @@ int main(int argc, char* argv[])
openfpm
::
vector
<
openfpm
::
vector
<
double
>>
y
;
openfpm_init
(
&
argc
,
&
argv
);
Vcluster
&
v_cl
=
create_vcluster
();
Vcluster
<>
&
v_cl
=
create_vcluster
();
// we will use it do place particles on a 10x10x10 Grid like
size_t
sz
[
3
]
=
{
10
,
10
,
10
};
...
...
example/Vector/3_molecular_dynamic_gpu_opt/Makefile
0 → 100644
View file @
61e2d5e0
include
../../example.mk
CC_SCOREP
=
scorep
--nocompiler
--cuda
--mpp
=
mpi nvcc
CC
=
${CC_SCOREP}
LDIR
=
OBJ
=
main.o
all
:
md_dyn
%.o
:
%.cu
$(CC)
-O3
-g
-c
-isystem
=
/home/i-bird/MPI/include
--std
=
c++11
-o
$@
$<
$(INCLUDE_PATH_NVCC)
md_dyn
:
$(OBJ)
$(CC)
-o
$@
$^
$(CFLAGS)
$(LIBS_PATH_NVCC)
$(LIBS)
-L
/home/i-bird/MPI/lib
-L
/usr/local/cuda/lib64
-lcudart
-lmpi
-L
/usr/local/cuda/extras/CUPTI/lib64
-lhdf5
run
:
all
mpirun
-np
3 ./md_dyn
&&
mpirun
-np
3 ./md_dyn_expr
&&
mpirun
-np
3 ./md_dyn_vl
;
.PHONY
:
clean all run
clean
:
rm
-f
*
.o
*
~ core md_dyn md_dyn_expr md_dyn_vl
example/Vector/3_molecular_dynamic_gpu_opt/config.cfg
0 → 100644
View file @
61e2d5e0
[pack]
files = main.cu Makefile
example/Vector/3_molecular_dynamic_gpu_opt/main.cu
0 → 100644
View file @
61e2d5e0
This diff is collapsed.
Click to expand it.
openfpm_data
@
e7f888c1
Subproject commit e7
010bdc474736c4211947ad8f08dd29d6e4e59f
Subproject commit e7
f888c11045bfad0f927a86f0fc9c5f3efa8053
openfpm_devices
@
2f254618
Subproject commit
e5d5d31a0af09312118f72c0818a824443ba80fb
Subproject commit
2f2546180dc070a0db59b359250defcc2862f96c
src/Vector/cuda/vector_dist_comm_util_funcs.cuh
View file @
61e2d5e0
...
...
@@ -8,14 +8,24 @@
#ifndef VECTOR_DIST_COMM_UTIL_FUNCS_HPP_
#define VECTOR_DIST_COMM_UTIL_FUNCS_HPP_
template
<
unsigned
int
dim
,
typename
St
,
typename
prop
,
typename
Memory
,
template
<
typename
>
class
layout_base
,
typename
Decomposition
,
bool
is_ok_cuda
>
template
<
unsigned
int
dim
,
typename
St
,
typename
prop
,
typename
Memory
,
template
<
typename
>
class
layout_base
,
typename
Decomposition
,
typename
scan_type
,
bool
is_ok_cuda
>
struct
labelParticlesGhost_impl
{
static
void
run
(
Decomposition
&
dec
,
openfpm
::
vector
<
aggregate
<
unsigned
int
,
unsigned
long
int
>
,
CudaMemory
,
typename
memory_traits_inte
<
aggregate
<
unsigned
int
,
unsigned
long
int
>>::
type
,
memory_traits_inte
>
&
g_opart_device
,
static
void
run
(
CudaMemory
&
mem
,
scan_type
&
sc
,
Decomposition
&
dec
,
openfpm
::
vector
<
aggregate
<
unsigned
int
,
unsigned
long
int
>
,
CudaMemory
,
typename
memory_traits_inte
<
aggregate
<
unsigned
int
,
unsigned
long
int
>>::
type
,
memory_traits_inte
>
&
g_opart_device
,
openfpm
::
vector
<
aggregate
<
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
>>::
type
,
layout_base
>
&
proc_id_out
,
openfpm
::
vector
<
aggregate
<
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
>>::
type
,
layout_base
>
&
starts
,
Vcluster
<
Memory
>
&
v_cl
,
openfpm
::
vector
<
Point
<
dim
,
St
>
,
Memory
,
typename
layout_base
<
Point
<
dim
,
St
>>::
type
,
layout_base
>
&
v_pos
,
openfpm
::
vector
<
prop
,
Memory
,
typename
layout_base
<
prop
>::
type
,
layout_base
>
&
v_prp
,
...
...
@@ -31,14 +41,24 @@ struct labelParticlesGhost_impl
template
<
unsigned
int
dim
,
typename
St
,
typename
prop
,
typename
Memory
,
template
<
typename
>
class
layout_base
,
typename
Decomposition
>
struct
labelParticlesGhost_impl
<
dim
,
St
,
prop
,
Memory
,
layout_base
,
Decomposition
,
true
>
template
<
unsigned
int
dim
,
typename
St
,
typename
prop
,
typename
Memory
,
template
<
typename
>
class
layout_base
,
typename
Decomposition
,
typename
scan_type
>
struct
labelParticlesGhost_impl
<
dim
,
St
,
prop
,
Memory
,
layout_base
,
Decomposition
,
scan_type
,
true
>
{
static
void
run
(
Decomposition
&
dec
,
static
void
run
(
CudaMemory
&
mem
,
scan_type
&
sc
,
Decomposition
&
dec
,
openfpm
::
vector
<
aggregate
<
unsigned
int
,
unsigned
long
int
>
,
CudaMemory
,
typename
memory_traits_inte
<
aggregate
<
unsigned
int
,
unsigned
long
int
>>::
type
,
memory_traits_inte
>
&
g_opart_device
,
openfpm
::
vector
<
aggregate
<
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
>>::
type
,
layout_base
>
&
proc_id_out
,
openfpm
::
vector
<
aggregate
<
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
>>::
type
,
layout_base
>
&
starts
,
Vcluster
<
Memory
>
&
v_cl
,
openfpm
::
vector
<
Point
<
dim
,
St
>
,
Memory
,
typename
layout_base
<
Point
<
dim
,
St
>>::
type
,
layout_base
>
&
v_pos
,
openfpm
::
vector
<
prop
,
Memory
,
typename
layout_base
<
prop
>::
type
,
layout_base
>
&
v_prp
,
...
...
@@ -50,10 +70,6 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru
{
#if defined(CUDA_GPU) && defined(__NVCC__)
openfpm
::
vector
<
aggregate
<
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
>>::
type
,
layout_base
>
proc_id_out
;
proc_id_out
.
resize
(
v_pos
.
size
()
+
1
);
proc_id_out
.
template
get
<
0
>(
proc_id_out
.
size
()
-
1
)
=
0
;
...
...
@@ -62,17 +78,12 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru
auto
ite
=
v_pos
.
getGPUIterator
();
// First we have to see how many entry each particle produce
num_proc_ghost_each_part
<
3
,
St
,
decltype
(
dec
.
toKernel
()),
decltype
(
v_pos
.
toKernel
()),
decltype
(
proc_id_out
.
toKernel
())
>
num_proc_ghost_each_part
<
dim
,
St
,
decltype
(
dec
.
toKernel
()),
decltype
(
v_pos
.
toKernel
()),
decltype
(
proc_id_out
.
toKernel
())
>
<<<
ite
.
wthr
,
ite
.
thr
>>>
(
dec
.
toKernel
(),
v_pos
.
toKernel
(),
proc_id_out
.
toKernel
());
openfpm
::
vector
<
aggregate
<
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
>>::
type
,
layout_base
>
starts
;
// scan
sc
an
<
unsigned
int
,
unsigned
int
>
(
proc_id_out
,
starts
);
sc
.
scan_
(
proc_id_out
,
starts
);
starts
.
resize
(
proc_id_out
.
size
());
starts
.
template
deviceToHost
<
0
>(
starts
.
size
()
-
1
,
starts
.
size
()
-
1
);
size_t
sz
=
starts
.
template
get
<
0
>(
starts
.
size
()
-
1
);
...
...
@@ -84,14 +95,13 @@ struct labelParticlesGhost_impl<dim,St,prop,Memory,layout_base,Decomposition,tru
ite
=
v_pos
.
getGPUIterator
();
// we compute processor id for each particle
proc_label_id_ghost
<
3
,
St
,
decltype
(
dec
.
toKernel
()),
decltype
(
v_pos
.
toKernel
()),
decltype
(
starts
.
toKernel
()),
decltype
(
g_opart_device
.
toKernel
())
>
proc_label_id_ghost
<
dim
,
St
,
decltype
(
dec
.
toKernel
()),
decltype
(
v_pos
.
toKernel
()),
decltype
(
starts
.
toKernel
()),
decltype
(
g_opart_device
.
toKernel
())
>
<<<
ite
.
wthr
,
ite
.
thr
>>>
(
dec
.
toKernel
(),
v_pos
.
toKernel
(),
starts
.
toKernel
(),
g_opart_device
.
toKernel
());
// sort particles
mergesort
((
int
*
)
g_opart_device
.
template
getDeviceBuffer
<
0
>(),(
long
unsigned
int
*
)
g_opart_device
.
template
getDeviceBuffer
<
1
>(),
g_opart_device
.
size
(),
mgpu
::
template
less_t
<
int
>(),
v_cl
.
getmgpuContext
());
CudaMemory
mem
;
mem
.
allocate
(
sizeof
(
int
));
mem
.
fill
(
0
);
prc_offset
.
resize
(
v_cl
.
size
());
...
...
@@ -179,6 +189,7 @@ struct local_ghost_from_dec_impl
openfpm
::
vector
<
Box
<
dim
,
St
>
,
Memory
,
typename
layout_base
<
Box
<
dim
,
St
>>::
type
,
layout_base
>
&
box_f_dev
,
openfpm
::
vector
<
aggregate
<
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
>>::
type
,
layout_base
>
&
box_f_sv
,
Vcluster
<
Memory
>
&
v_cl
,
openfpm
::
vector
<
aggregate
<
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
>>::
type
,
layout_base
>
&
starts
,
openfpm
::
vector
<
Point
<
dim
,
St
>
,
Memory
,
typename
layout_base
<
Point
<
dim
,
St
>>::
type
,
layout_base
>
&
v_pos
,
openfpm
::
vector
<
prop
,
Memory
,
typename
layout_base
<
prop
>::
type
,
layout_base
>
&
v_prp
,
size_t
&
g_m
,
...
...
@@ -197,6 +208,7 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
openfpm
::
vector
<
Box
<
dim
,
St
>
,
Memory
,
typename
layout_base
<
Box
<
dim
,
St
>>::
type
,
layout_base
>
&
box_f_dev
,
openfpm
::
vector
<
aggregate
<
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
>>::
type
,
layout_base
>
&
box_f_sv
,
Vcluster
<
Memory
>
&
v_cl
,
openfpm
::
vector
<
aggregate
<
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
>>::
type
,
layout_base
>
&
starts
,
openfpm
::
vector
<
Point
<
dim
,
St
>
,
Memory
,
typename
layout_base
<
Point
<
dim
,
St
>>::
type
,
layout_base
>
&
v_pos
,
openfpm
::
vector
<
prop
,
Memory
,
typename
layout_base
<
prop
>::
type
,
layout_base
>
&
v_prp
,
size_t
&
g_m
,
...
...
@@ -216,7 +228,7 @@ struct local_ghost_from_dec_impl<dim,St,prop,Memory,layout_base,true>
<<<
ite
.
wthr
,
ite
.
thr
>>>
(
box_f_dev
.
toKernel
(),
v_pos
.
toKernel
(),
o_part_loc
.
toKernel
());
openfpm
::
vector
<
aggregate
<
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
>>::
type
,
layout_base
>
starts
;
//
openfpm::vector<aggregate<unsigned int>,Memory,typename layout_base<aggregate<unsigned int>>::type,layout_base> starts;
starts
.
resize
(
o_part_loc
.
size
());
mgpu
::
scan
((
unsigned
int
*
)
o_part_loc
.
template
getDeviceBuffer
<
0
>(),
o_part_loc
.
size
(),
(
unsigned
int
*
)
starts
.
template
getDeviceBuffer
<
0
>()
,
v_cl
.
getmgpuContext
());
...
...
src/Vector/cuda/vector_dist_cuda_func_test.cu
View file @
61e2d5e0
...
...
@@ -600,8 +600,10 @@ BOOST_AUTO_TEST_CASE( decomposition_ie_ghost_gpu_test_use )
starts
.
resize
(
proc_id_out
.
size
());
scan
<
unsigned
int
,
unsigned
int
>
sc
;
// scan
sc
an
<
unsigned
int
,
unsigned
int
>
(
proc_id_out
,
starts
);
sc
.
scan_
(
proc_id_out
,
starts
);
starts
.
deviceToHost
<
0
>
(
starts
.
size
()
-
1
,
starts
.
size
()
-
1
);
size_t
sz
=
starts
.
template
get
<
0
>(
starts
.
size
()
-
1
);
...
...
src/Vector/cuda/vector_dist_gpu_unit_tests.cu
View file @
61e2d5e0
...
...
@@ -57,8 +57,8 @@ __global__ void calculate_force(vector_dist_ker<3, float, aggregate<float, floa
while
(
it
.
isNext
())
{
auto
q1
=
it
.
get
();
auto
q2
=
it
.
get
_orig
();
auto
q1
=
it
.
get
_sort
();
auto
q2
=
it
.
get
();
if
(
q2
==
p
)
{
++
it
;
continue
;}
...
...
@@ -104,7 +104,7 @@ __global__ void calculate_force_full_sort(vector_dist_ker<3, float, aggregate<f
while
(
it
.
isNext
())
{
auto
q1
=
it
.
get
();
auto
q1
=
it
.
get
_sort
();
if
(
q1
==
p
)
{
++
it
;
continue
;}
...
...
@@ -364,14 +364,15 @@ BOOST_AUTO_TEST_CASE( vector_dist_gpu_test)
BOOST_REQUIRE_CLOSE
(
vd
.
template
getProp
<
1
>(
p
)[
1
],
vd
.
getPos
(
p
)[
0
]
+
vd
.
getPos
(
p
)[
2
],
0.01
);
BOOST_REQUIRE_CLOSE
(
vd
.
template
getProp
<
1
>(
p
)[
2
],
vd
.
getPos
(
p
)[
1
]
+
vd
.
getPos
(
p
)[
2
],
0.01
);
//std::cout << "PROP 0 " << vd.template getProp<0>(p) << " " << vd.getPos(p)[0] + vd.getPos(p)[1] + vd.getPos(p)[2] << std::endl;
++
it4
;
}
// here we do a ghost_get
vd
.
ghost_get
<
0
>
();
// Doube ghost get to check crashes
vd
.
ghost_get
<
0
>
();
// we re-offload what we received
vd
.
hostToDevicePos
();
vd
.
template
hostToDeviceProp
<
0
>();
...
...
@@ -419,7 +420,7 @@ void vdist_calc_gpu_test()
// Boundary conditions
size_t
bc
[
3
]
=
{
PERIODIC
,
PERIODIC
,
PERIODIC
};
vector_dist_gpu
<
3
,
St
,
aggregate
<
float
,
float
[
3
],
floa
t
[
3
]
>>
vd
(
1000
,
domain
,
bc
,
g
);
vector_dist_gpu
<
3
,
St
,
aggregate
<
St
,
St
[
3
],
S
t
[
3
]
>>
vd
(
1000
,
domain
,
bc
,
g
);
auto
it
=
vd
.
getDomainIterator
();
...
...
@@ -540,7 +541,7 @@ void vdist_calc_gpu_test()
// To test we copy on a cpu distributed vector and we do a map
vector_dist
<
3
,
St
,
aggregate
<
float
,
float
[
3
],
floa
t
[
3
]
>>
vd_cpu
(
vd
.
getDecomposition
().
template
duplicate_convert
<
HeapMemory
,
memory_traits_lin
>(),
0
);
vector_dist
<
3
,
St
,
aggregate
<
St
,
St
[
3
],
S
t
[
3
]
>>
vd_cpu
(
vd
.
getDecomposition
().
template
duplicate_convert
<
HeapMemory
,
memory_traits_lin
>(),
0
);
auto
itc
=
vd
.
getDomainIterator
();
...
...
@@ -581,9 +582,10 @@ void vdist_calc_gpu_test()
{
Point
<
3
,
St
>
xp
;
float
prp0
;
float
prp1
[
3
];
float
prp2
[
3
];
St
prp0
;
St
prp1
[
3
];
St
prp2
[
3
];
bool
operator
<
(
const
part
&
tmp
)
const
{
...
...
src/Vector/vector_dist_comm.hpp
View file @
61e2d5e0
...
...
@@ -11,10 +11,10 @@
#if defined(CUDA_GPU) && defined(__NVCC__)
#include "util/cuda/moderngpu/kernel_mergesort.hxx"
#include "Vector/cuda/vector_dist_cuda_funcs.cuh"
#include "util/cuda/scan_cuda.cuh"
#include "util/cuda/moderngpu/kernel_scan.hxx"
#endif
#include "util/cuda/scan_cuda.cuh"
#include "Vector/util/vector_dist_funcs.hpp"
#include "cuda/vector_dist_comm_util_funcs.cuh"
...
...
@@ -88,6 +88,9 @@ class vector_dist_comm
//! It map the processor id with the communication request into map procedure
openfpm
::
vector
<
size_t
>
p_map_req
;
//! scan functionality required for gpu
scan
<
unsigned
int
,
unsigned
int
>
sc
;
//! For each near processor, outgoing particle id
//! \warning opart is assumed to be an ordered list
//! first id particle id
...
...
@@ -144,16 +147,31 @@ class vector_dist_comm
//! The same as recv_sz_get but for map
openfpm
::
vector
<
size_t
>
recv_sz_map
;
//! temporary buffer to processors ids
openfpm
::
vector
<
aggregate
<
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
>>::
type
,
layout_base
>
proc_id_out
;
//! temporary buffer for the scan result
openfpm
::
vector
<
aggregate
<
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
>>::
type
,
layout_base
>
starts
;
//! Processor communication size
openfpm
::
vector
<
aggregate
<
unsigned
int
,
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
,
unsigned
int
>>::
type
,
layout_base
>
prc_offset
;
//! Temporary CudaMemory to do stuff
CudaMemory
mem
;
//! Local ghost marker (across the ghost particles it mark from where we have the)
//! replicated ghost particles that are local
size_t
lg_m
;
//! Sending buffer
openfpm
::
vector
<
Memory
>
hsmem
;
//! Receiving buffer
openfpm
::
vector
<
HeapMemory
>
hrmem
;
openfpm
::
vector_fr
<
Memory
>
hsmem
;
//! process the particle with properties
template
<
typename
prp_object
,
int
...
prp
>
...
...
@@ -234,6 +252,9 @@ class vector_dist_comm
//! Id of the local particle to replicate for ghost_get
openfpm
::
vector
<
aggregate
<
unsigned
int
,
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
,
unsigned
int
>>::
type
,
layout_base
>
o_part_loc
;
//! Processor communication size
openfpm
::
vector
<
aggregate
<
unsigned
int
,
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
,
unsigned
int
>>::
type
,
layout_base
>
prc_sz
;
/*! \brief For every internal ghost box we create a structure that order such internal local ghost box in
* shift vectors
*
...
...
@@ -369,7 +390,7 @@ class vector_dist_comm
if
(
opt
&
RUN_ON_DEVICE
)
{
local_ghost_from_dec_impl
<
dim
,
St
,
prop
,
Memory
,
layout_base
,
std
::
is_same
<
Memory
,
CudaMemory
>::
value
>
::
run
(
o_part_loc
,
shifts
,
box_f_dev
,
box_f_sv
,
v_cl
,
v_pos
,
v_prp
,
g_m
,
opt
);
::
run
(
o_part_loc
,
shifts
,
box_f_dev
,
box_f_sv
,
v_cl
,
starts
,
v_pos
,
v_prp
,
g_m
,
opt
);
}
else
{
...
...
@@ -623,7 +644,7 @@ class vector_dist_comm
*
*
*/
void
resize_retained_buffer
(
openfpm
::
vector
<
Memory
>
&
rt_buf
,
size_t
nbf
)
void
resize_retained_buffer
(
openfpm
::
vector
_fr
<
Memory
>
&
rt_buf
,
size_t
nbf
)
{
// Release all the buffer that are going to be deleted
for
(
size_t
i
=
nbf
;
i
<
rt_buf
.
size
()
;
i
++
)
...
...
@@ -645,12 +666,12 @@ class vector_dist_comm
size_t
i
;
openfpm
::
vector
<
Memory
>
&
hsmem
;
openfpm
::
vector
_fr
<
Memory
>
&
hsmem
;
size_t
j
;
set_mem_retained_buffers_inte
(
openfpm
::
vector
<
send_vector
>
&
g_send_prp
,
size_t
i
,
openfpm
::
vector
<
Memory
>
&
hsmem
,
size_t
j
)
openfpm
::
vector
_fr
<
Memory
>
&
hsmem
,
size_t
j
)
:
g_send_prp
(
g_send_prp
),
i
(
i
),
hsmem
(
hsmem
),
j
(
j
)
{}
...
...
@@ -670,7 +691,7 @@ class vector_dist_comm
static
inline
size_t
set_mem_retained_buffers_
(
openfpm
::
vector
<
send_vector
>
&
g_send_prp
,
openfpm
::
vector
<
size_t
>
&
prc_sz
,
size_t
i
,
openfpm
::
vector
<
Memory
>
&
hsmem
,
openfpm
::
vector
_fr
<
Memory
>
&
hsmem
,
size_t
j
)
{
// Set the memory for retain the send buffer
...
...
@@ -689,17 +710,19 @@ class vector_dist_comm
static
inline
size_t
set_mem_retained_buffers_
(
openfpm
::
vector
<
send_vector
>
&
g_send_prp
,
openfpm
::
vector
<
size_t
>
&
prc_sz
,
size_t
i
,
openfpm
::
vector
<
Memory
>
&
hsmem
,
openfpm
::
vector
_fr
<
Memory
>
&
hsmem
,
size_t
j
)
{
set_mem_retained_buffers_inte
<
send_vector
,
v_mpl
>
smrbi
(
g_send_prp
,
i
,
hsmem
,
j
);
boost
::
mpl
::
for_each_ref
<
boost
::
mpl
::
range_c
<
int
,
0
,
boost
::
mpl
::
size
<
v_mpl
>::
type
::
value
>>
(
smrbi
);
g_send_prp
.
get
(
i
).
resize
(
prc_sz
.
get
(
i
));
// resize the sending vector (No allocation is produced)
g_send_prp
.
get
(
i
).
resize
(
prc_sz
.
get
(
i
));
// if we do not send properties do not reallocate
if
(
boost
::
mpl
::
size
<
v_mpl
>::
type
::
value
!=
0
)
{
// resize the sending vector (No allocation is produced)
g_send_prp
.
get
(
i
).
resize
(
prc_sz
.
get
(
i
));
}
return
smrbi
.
j
;
}
...
...
@@ -752,17 +775,20 @@ class vector_dist_comm
size_t
offset
=
0
;
// Fill the sending buffers
for
(
size_t
i
=
0
;
i
<
g_send_prp
.
size
()
;
i
++
)
if
(
sizeof
...(
prp
)
!=
0
)
{
auto
ite
=
g_send_prp
.
get
(
i
).
getGPUIterator
();
// Fill the sending buffers
for
(
size_t
i
=
0
;
i
<
g_send_prp
.
size
()
;
i
++
)
{
auto
ite
=
g_send_prp
.
get
(
i
).
getGPUIterator
();
process_ghost_particles_prp
<
decltype
(
g_opart_device
.
toKernel
()),
decltype
(
g_send_prp
.
get
(
i
).
toKernel
()),
decltype
(
v_prp
.
toKernel
()),
prp
...
>
<<<
ite
.
wthr
,
ite
.
thr
>>>
(
g_opart_device
.
toKernel
(),
g_send_prp
.
get
(
i
).
toKernel
(),
v_prp
.
toKernel
(),
offset
);
process_ghost_particles_prp
<
decltype
(
g_opart_device
.
toKernel
()),
decltype
(
g_send_prp
.
get
(
i
).
toKernel
()),
decltype
(
v_prp
.
toKernel
()),
prp
...
>
<<<
ite
.
wthr
,
ite
.
thr
>>>
(
g_opart_device
.
toKernel
(),
g_send_prp
.
get
(
i
).
toKernel
(),
v_prp
.
toKernel
(),
offset
);
offset
+=
prc_sz
.
get
(
i
);
offset
+=
prc_sz
.
get
(
i
);
}
}
#else
...
...
@@ -966,7 +992,6 @@ class vector_dist_comm
// sort particles
mergesort
((
int
*
)
lbl_p
.
template
getDeviceBuffer
<
1
>(),(
int
*
)
lbl_p
.
template
getDeviceBuffer
<
0
>(),
lbl_p
.
size
(),
mgpu
::
template
less_t
<
int
>(),
v_cl
.
getmgpuContext
());
CudaMemory
mem
;
mem
.
allocate
(
sizeof
(
int
));
mem
.
fill
(
0
);
...
...
@@ -1079,8 +1104,8 @@ class vector_dist_comm
if
(
opt
&
RUN_ON_DEVICE
)
{
labelParticlesGhost_impl
<
dim
,
St
,
prop
,
Memory
,
layout_base
,
Decomposition
,
std
::
is_same
<
Memory
,
CudaMemory
>::
value
>
::
run
(
dec
,
g_opart_device
,
v_cl
,
v_pos
,
v_prp
,
prc
,
prc_sz
,
prc_offset
,
g_m
,
opt
);
Decomposition
,
s
can
<
unsigned
int
,
unsigned
int
>
,
s
td
::
is_same
<
Memory
,
CudaMemory
>::
value
>
::
run
(
mem
,
sc
,
dec
,
g_opart_device
,
proc_id_out
,
starts
,
v_cl
,
v_pos
,
v_prp
,
prc
,
prc_sz
,
prc_offset
,
g_m
,
opt
);
}
else
{
...
...
@@ -1296,9 +1321,6 @@ public:
// send vector for each processor
typedef
openfpm
::
vector
<
prp_object
,
Memory
,
typename
layout_base
<
prp_object
>::
type
,
layout_base
,
openfpm
::
grow_policy_identity
>
send_vector
;
// Processor communication size
openfpm
::
vector
<
aggregate
<
unsigned
int
,
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
,
unsigned
int
>>::
type
,
layout_base
>
prc_offset
;
// elements to send for each processors
openfpm
::
vector
<
size_t
>
prc_sz
;
...
...
@@ -1480,8 +1502,7 @@ public:
openfpm
::
vector
<
prop
,
Memory
,
typename
layout_base
<
prop
>::
type
,
layout_base
>
&
v_prp
,
size_t
&
g_m
,
size_t
opt
)
{
// Processor communication size
openfpm
::
vector
<
aggregate
<
unsigned
int
,
unsigned
int
>
,
Memory
,
typename
layout_base
<
aggregate
<
unsigned
int
,
unsigned
int
>>::
type
,
layout_base
>
prc_sz
(
v_cl
.
getProcessingUnits
());
prc_sz
.
resize
(
v_cl
.
getProcessingUnits
());
// map completely reset the ghost part
v_pos
.
resize
(
g_m
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment