Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
O
openfpm_pdata
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Locked Files
Issues
1
Issues
1
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Requirements
Requirements
List
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Security & Compliance
Security & Compliance
Dependency List
License Compliance
Operations
Operations
Environments
Analytics
Analytics
CI / CD
Code Review
Insights
Issue
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
openfpm
openfpm_pdata
Commits
509a43a2
Commit
509a43a2
authored
Oct 28, 2018
by
incardon
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Adding missing files
parent
88c20d93
Changes
19
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
19 changed files
with
1531 additions
and
716 deletions
+1531
-716
example/Vector/3_molecular_dynamic_gpu_opt/Makefile
example/Vector/3_molecular_dynamic_gpu_opt/Makefile
+17
-6
example/Vector/3_molecular_dynamic_gpu_opt/config.cfg
example/Vector/3_molecular_dynamic_gpu_opt/config.cfg
+1
-1
example/Vector/3_molecular_dynamic_gpu_opt/main.cu
example/Vector/3_molecular_dynamic_gpu_opt/main.cu
+0
-635
example/Vector/7_SPH_dlb/Makefile
example/Vector/7_SPH_dlb/Makefile
+1
-0
example/Vector/7_SPH_dlb/main.cpp
example/Vector/7_SPH_dlb/main.cpp
+16
-10
example/Vector/7_SPH_dlb_gpu/Makefile
example/Vector/7_SPH_dlb_gpu/Makefile
+32
-0
example/Vector/7_SPH_dlb_gpu/config.cfg
example/Vector/7_SPH_dlb_gpu/config.cfg
+2
-0
example/Vector/7_SPH_dlb_gpu/main.cu
example/Vector/7_SPH_dlb_gpu/main.cu
+998
-0
example/Vector/7_SPH_dlb_opt/main.cpp
example/Vector/7_SPH_dlb_opt/main.cpp
+18
-23
openfpm_data
openfpm_data
+1
-1
src/Decomposition/shift_vect_converter.hpp
src/Decomposition/shift_vect_converter.hpp
+1
-1
src/Vector/cuda/vector_dist_cuda_func_test.cu
src/Vector/cuda/vector_dist_cuda_func_test.cu
+120
-0
src/Vector/cuda/vector_dist_cuda_funcs.cuh
src/Vector/cuda/vector_dist_cuda_funcs.cuh
+131
-2
src/Vector/cuda/vector_dist_gpu_unit_tests.cu
src/Vector/cuda/vector_dist_gpu_unit_tests.cu
+18
-5
src/Vector/tests/vector_dist_cell_list_tests.cpp
src/Vector/tests/vector_dist_cell_list_tests.cpp
+81
-1
src/Vector/tests/vector_dist_unit_test.cpp
src/Vector/tests/vector_dist_unit_test.cpp
+3
-1
src/Vector/vector_dist.hpp
src/Vector/vector_dist.hpp
+52
-2
src/Vector/vector_dist_comm.hpp
src/Vector/vector_dist_comm.hpp
+30
-27
src/Vector/vector_dist_kernel.hpp
src/Vector/vector_dist_kernel.hpp
+9
-1
No files found.
example/Vector/3_molecular_dynamic_gpu_opt/Makefile
View file @
509a43a2
include
../../example.mk
CC_SCOREP
=
scorep
--nocompiler
--cuda
--mpp
=
mpi nvcc
CC
=
${CC_SCOREP}
CC
=
nvcc
#
${CC_SCOREP}
CC_MPI
=
mpic++
LDIR
=
OBJ
=
main.o
OBJ_GPU
=
main_gpu.o
OBJ_CPU
=
main_cpu.o
OBJ_CPU_BEST
=
main_cpu_best.o
all
:
md_dyn
all
:
md_dyn
_gpu md_dyn_cpu md_dyn_cpu_best
%.o
:
%.cu
$(CC)
-O3
-g
-c
-isystem
=
/home/i-bird/MPI/include
--std
=
c++11
-o
$@
$<
$(INCLUDE_PATH_NVCC)
md_dyn
:
$(OBJ)
%.o
:
%.cpp
$(CC_MPI)
-O3
-g
-c
--std
=
c++11
-o
$@
$<
$(INCLUDE_PATH)
md_dyn_gpu
:
$(OBJ_GPU)
$(CC)
-o
$@
$^
$(CFLAGS)
$(LIBS_PATH_NVCC)
$(LIBS)
-L
/home/i-bird/MPI/lib
-L
/usr/local/cuda/lib64
-lcudart
-lmpi
-L
/usr/local/cuda/extras/CUPTI/lib64
-lhdf5
md_dyn_cpu
:
$(OBJ_CPU)
$(CC_MPI)
-o
$@
$^
$(CFLAGS)
$(LIBS_PATH)
$(LIBS)
md_dyn_cpu_best
:
$(OBJ_CPU_BEST)
$(CC_MPI)
-o
$@
$^
$(CFLAGS)
$(LIBS_PATH)
$(LIBS)
run
:
all
mpirun
-np
3 ./md_dyn
&&
mpirun
-np
3 ./md_dyn_expr
&&
mpirun
-np
3 ./md_dyn_vl
;
mpirun
-np
3 ./md_dyn
_gpu
&&
mpirun
-np
3 ./md_dyn_cpu
&&
mpirun
-np
3 ./md_dyn_cpu_best
;
.PHONY
:
clean all run
clean
:
rm
-f
*
.o
*
~ core md_dyn
md_dyn_expr md_dyn_vl
rm
-f
*
.o
*
~ core md_dyn
_gpu md_dyn_cpu md_dyn_cpu_best
example/Vector/3_molecular_dynamic_gpu_opt/config.cfg
View file @
509a43a2
[pack]
files = main.cu Makefile
files = main.cu
main_cpu.cpp
Makefile
example/Vector/3_molecular_dynamic_gpu_opt/main.cu
deleted
100644 → 0
View file @
88c20d93
This diff is collapsed.
Click to expand it.
example/Vector/7_SPH_dlb/Makefile
View file @
509a43a2
...
...
@@ -7,6 +7,7 @@ OPT=
OBJ
=
main.o
sph_dlb
:
sph_dlb_test
:
OPT += -DTEST_RUN
sph_dlb_test
:
sph_dlb
...
...
example/Vector/7_SPH_dlb/main.cpp
View file @
509a43a2
...
...
@@ -454,10 +454,9 @@ inline double Pi(const Point<3,double> & dr, double rr2, Point<3,double> & dv, d
/*! \cond [calc_forces] \endcond */
template
<
typename
CellList
>
inline
double
calc_forces
(
particles
&
vd
,
CellList
&
NN
,
double
&
max_visc
)
template
<
typename
CellList
>
inline
void
calc_forces
(
particles
&
vd
,
CellList
&
NN
,
double
&
max_visc
)
{
auto
part
=
vd
.
getDomainIterator
();
double
visc
=
0
;
// Update the cell-list
vd
.
updateCellList
(
NN
);
...
...
@@ -583,7 +582,7 @@ template<typename CellList> inline double calc_forces(particles & vd, CellList &
Point
<
3
,
double
>
DW
;
DWab
(
dr
,
DW
,
r
,
false
);
double
factor
=
-
massb
*
((
vd
.
getProp
<
Pressure
>
(
a
)
+
vd
.
getProp
<
Pressure
>
(
b
))
/
(
rhoa
*
rhob
)
+
Tensile
(
r
,
rhoa
,
rhob
,
Pa
,
Pb
)
+
Pi
(
dr
,
r2
,
v_rel
,
rhoa
,
rhob
,
massb
,
visc
));
double
factor
=
-
massb
*
((
vd
.
getProp
<
Pressure
>
(
a
)
+
vd
.
getProp
<
Pressure
>
(
b
))
/
(
rhoa
*
rhob
)
+
Tensile
(
r
,
rhoa
,
rhob
,
Pa
,
Pb
)
+
Pi
(
dr
,
r2
,
v_rel
,
rhoa
,
rhob
,
massb
,
max_
visc
));
vd
.
getProp
<
force
>
(
a
)[
0
]
+=
factor
*
DW
.
get
(
0
);
vd
.
getProp
<
force
>
(
a
)[
1
]
+=
factor
*
DW
.
get
(
1
);
...
...
@@ -644,7 +643,7 @@ void max_acceleration_and_velocity(particles & vd, double & max_acc, double & ma
max_acc
=
sqrt
(
max_acc
);
max_vel
=
sqrt
(
max_vel
);
Vcluster
&
v_cl
=
create_vcluster
();
Vcluster
<>
&
v_cl
=
create_vcluster
();
v_cl
.
max
(
max_acc
);
v_cl
.
max
(
max_vel
);
v_cl
.
execute
();
...
...
@@ -759,7 +758,8 @@ void verlet_int(particles & vd, double dt)
vd
.
template
getProp
<
velocity
>(
a
)[
0
]
=
0.0
;
vd
.
template
getProp
<
velocity
>(
a
)[
1
]
=
0.0
;
vd
.
template
getProp
<
velocity
>(
a
)[
2
]
=
0.0
;
vd
.
template
getProp
<
rho
>(
a
)
=
vd
.
template
getProp
<
rho_prev
>(
a
)
+
dt2
*
vd
.
template
getProp
<
drho
>(
a
);
double
rhonew
=
vd
.
template
getProp
<
rho_prev
>(
a
)
+
dt2
*
vd
.
template
getProp
<
drho
>(
a
);
vd
.
template
getProp
<
rho
>(
a
)
=
(
rhonew
<
rho_zero
)
?
rho_zero
:
rhonew
;
vd
.
template
getProp
<
rho_prev
>(
a
)
=
rhop
;
...
...
@@ -836,7 +836,8 @@ void euler_int(particles & vd, double dt)
vd
.
template
getProp
<
velocity
>(
a
)[
0
]
=
0.0
;
vd
.
template
getProp
<
velocity
>(
a
)[
1
]
=
0.0
;
vd
.
template
getProp
<
velocity
>(
a
)[
2
]
=
0.0
;
vd
.
template
getProp
<
rho
>(
a
)
=
vd
.
template
getProp
<
rho
>(
a
)
+
dt
*
vd
.
template
getProp
<
drho
>(
a
);
double
rhonew
=
vd
.
template
getProp
<
rho
>(
a
)
+
dt
*
vd
.
template
getProp
<
drho
>(
a
);
vd
.
template
getProp
<
rho
>(
a
)
=
(
rhonew
<
rho_zero
)
?
rho_zero
:
rhonew
;
vd
.
template
getProp
<
rho_prev
>(
a
)
=
rhop
;
...
...
@@ -919,7 +920,7 @@ inline void sensor_pressure(Vector & vd,
openfpm
::
vector
<
openfpm
::
vector
<
double
>>
&
press_t
,
openfpm
::
vector
<
Point
<
3
,
double
>>
&
probes
)
{
Vcluster
&
v_cl
=
create_vcluster
();
Vcluster
<>
&
v_cl
=
create_vcluster
();
press_t
.
add
();
...
...
@@ -1390,7 +1391,7 @@ int main(int argc, char* argv[])
double
t
=
0.0
;
while
(
t
<=
t_end
)
{
Vcluster
&
v_cl
=
create_vcluster
();
Vcluster
<>
&
v_cl
=
create_vcluster
();
timer
it_time
;
////// Do rebalancing every 200 timesteps
...
...
@@ -1441,6 +1442,11 @@ int main(int argc, char* argv[])
if
(
write
<
t
*
100
)
{
// sensor_pressure calculation require ghost and update cell-list
vd
.
map
(
RUN_ON_DEVICE
);
vd
.
ghost_get
<
type
,
rho
,
Pressure
,
velocity
>
(
RUN_ON_DEVICE
);
vd
.
updateCellList
(
NN
);
// calculate the pressure at the sensor points
sensor_pressure
(
vd
,
NN
,
press_t
,
probes
);
...
...
@@ -1448,12 +1454,12 @@ int main(int argc, char* argv[])
write
++
;
if
(
v_cl
.
getProcessUnitID
()
==
0
)
std
::
cout
<<
"TIME: "
<<
t
<<
" write "
<<
it_time
.
getwct
()
<<
" "
<<
v_cl
.
getProcessUnitID
()
<<
" "
<<
cnt
<<
std
::
endl
;
{
std
::
cout
<<
"TIME: "
<<
t
<<
" write "
<<
it_time
.
getwct
()
<<
" "
<<
v_cl
.
getProcessUnitID
()
<<
" "
<<
cnt
<<
" Max visc: "
<<
max_visc
<<
std
::
endl
;}
}
else
{
if
(
v_cl
.
getProcessUnitID
()
==
0
)
std
::
cout
<<
"TIME: "
<<
t
<<
" "
<<
it_time
.
getwct
()
<<
" "
<<
v_cl
.
getProcessUnitID
()
<<
" "
<<
cnt
<<
std
::
endl
;
{
std
::
cout
<<
"TIME: "
<<
t
<<
" "
<<
it_time
.
getwct
()
<<
" "
<<
v_cl
.
getProcessUnitID
()
<<
" "
<<
cnt
<<
" Max visc: "
<<
max_visc
<<
std
::
endl
;}
}
}
...
...
example/Vector/7_SPH_dlb_gpu/Makefile
0 → 100644
View file @
509a43a2
# Build rules for the GPU version of the SPH dynamic load balancing example.
# Compiler/library variables (INCLUDE_PATH, INCLUDE_PATH_NVCC, LIBS_PATH, LIBS,
# CFLAGS) are provided by the shared example.mk.
include ../../example.mk

CC=mpic++

LDIR =
OPT=

OBJ = main.o

# sph_dlb is declared first so that it is the default goal
sph_dlb:
# the test build only differs by the TEST_RUN define
sph_dlb_test: OPT += -DTEST_RUN
sph_dlb_test: sph_dlb

# $(OPT) is forwarded to nvcc as well, otherwise -DTEST_RUN requested by
# sph_dlb_test never reaches a .cu source
# NOTE(review): the -isystem path below is machine specific
%.o: %.cu
	nvcc -O3 $(OPT) -g -c -isystem=/home/i-bird/MPI/include --std=c++11 -o $@ $< $(INCLUDE_PATH_NVCC)

%.o: %.cpp
	$(CC) -O3 $(OPT) -g -c --std=c++11 -o $@ $< $(INCLUDE_PATH)

sph_dlb: $(OBJ)
	$(CC) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS)

all: sph_dlb

run: sph_dlb_test
	mpirun -np 2 ./sph_dlb

.PHONY: clean all run

clean:
	rm -f *.o *~ core sph_dlb
example/Vector/7_SPH_dlb_gpu/config.cfg
0 → 100644
View file @
509a43a2
[pack]
files = main.cu Makefile
example/Vector/7_SPH_dlb_gpu/main.cu
0 → 100644
View file @
509a43a2
This diff is collapsed.
Click to expand it.
example/Vector/7_SPH_dlb_opt/main.cpp
View file @
509a43a2
...
...
@@ -325,9 +325,9 @@ struct ModelCustom
template
<
typename
Decomposition
,
typename
vector
>
inline
void
addComputation
(
Decomposition
&
dec
,
vector
&
vd
,
size_t
v
,
size_t
p
)
{
if
(
vd
.
template
getProp
<
type
>(
p
)
==
FLUID
)
dec
.
addComputationCost
(
v
,
4
);
{
dec
.
addComputationCost
(
v
,
4
);}
else
dec
.
addComputationCost
(
v
,
3
);
{
dec
.
addComputationCost
(
v
,
3
);}
}
template
<
typename
Decomposition
>
inline
void
applyModel
(
Decomposition
&
dec
,
size_t
v
)
...
...
@@ -335,7 +335,7 @@ struct ModelCustom
dec
.
setSubSubDomainComputationCost
(
v
,
dec
.
getSubSubDomainComputationCost
(
v
)
*
dec
.
getSubSubDomainComputationCost
(
v
));
}
float
distributionTol
()
float
distributionTol
()
{
return
1.01
;
}
...
...
@@ -344,19 +344,19 @@ struct ModelCustom
//! Second model for dynamic load balancing
struct
ModelCustom2
{
template
<
typename
Decomposition
,
typename
vector
>
inline
void
addComputation
(
Decomposition
&
dec
,
vector
&
vd
,
size_t
v
,
size_t
p
)
{
dec
.
addComputationCost
(
v
,
vd
.
template
getProp
<
nn_num
>(
p
)
+
4
);
}
template
<
typename
Decomposition
,
typename
vector
>
inline
void
addComputation
(
Decomposition
&
dec
,
vector
&
vd
,
size_t
v
,
size_t
p
)
{
dec
.
addComputationCost
(
v
,
vd
.
template
getProp
<
nn_num
>(
p
)
+
4
);
}
template
<
typename
Decomposition
>
inline
void
applyModel
(
Decomposition
&
dec
,
size_t
v
)
{
}
template
<
typename
Decomposition
>
inline
void
applyModel
(
Decomposition
&
dec
,
size_t
v
)
{
}
float
distributionTol
()
{
return
1.01
;
}
float
distributionTol
()
{
return
1.01
;
}
};
inline
void
EqState
(
particles
&
vd
)
...
...
@@ -383,11 +383,11 @@ inline double Wab(double r)
r
/=
H
;
if
(
r
<
1.0
)
return
(
1.0
-
3.0
/
2.0
*
r
*
r
+
3.0
/
4.0
*
r
*
r
*
r
)
*
a2
;
{
return
(
1.0
-
3.0
/
2.0
*
r
*
r
+
3.0
/
4.0
*
r
*
r
*
r
)
*
a2
;}
else
if
(
r
<
2.0
)
return
(
1.0
/
4.0
*
(
2.0
-
r
*
r
)
*
(
2.0
-
r
*
r
)
*
(
2.0
-
r
*
r
))
*
a2
;
{
return
(
1.0
/
4.0
*
(
2.0
-
r
*
r
)
*
(
2.0
-
r
*
r
)
*
(
2.0
-
r
*
r
))
*
a2
;}
else
return
0.0
;
{
return
0.0
;}
}
const
double
c1
=
-
3.0
/
M_PI
/
H
/
H
/
H
/
H
;
...
...
@@ -662,7 +662,7 @@ double calc_deltaT(particles & vd, double ViscDtMax)
//-dt new value of time step.
double
dt
=
double
(
CFLnumber
)
*
std
::
min
(
dt_f
,
dt_cv
);
if
(
dt
<
double
(
DtMin
))
dt
=
double
(
DtMin
);
{
dt
=
double
(
DtMin
);}
return
dt
;
}
...
...
@@ -745,11 +745,6 @@ void verlet_int(particles & vd, double dt, double & max_disp)
vd
.
template
getProp
<
rho
>(
a
)
<
RhoMin
||
vd
.
template
getProp
<
rho
>(
a
)
>
RhoMax
)
{
to_remove
.
add
(
a
.
getKey
());
/*! \cond [big_number_set] \endcond */
/*! \cond [big_number_set] \endcond */
}
vd
.
template
getProp
<
velocity_prev
>(
a
)[
0
]
=
velX
;
...
...
openfpm_data
@
d8f45e69
Subproject commit
7ae9fb5851c9065623ee28628b8f8729332340cc
Subproject commit
d8f45e69a4c75e526d0a5c8e73b2cc426f898676
src/Decomposition/shift_vect_converter.hpp
View file @
509a43a2
...
...
@@ -40,7 +40,7 @@ class shift_vect_converter
HyperCube
<
dim
>
hyp
;
for
(
long
int
i
=
dim
-
1
;
i
>=
0
;
i
--
)
for
(
long
int
i
=
dim
;
i
>=
0
;
i
--
)
{
std
::
vector
<
comb
<
dim
>>
cmbs
=
hyp
.
getCombinations_R
(
i
);
...
...
src/Vector/cuda/vector_dist_cuda_func_test.cu
View file @
509a43a2
...
...
@@ -7,6 +7,7 @@
#include "Decomposition/CartDecomposition.hpp"
#include "util/cuda/scan_cuda.cuh"
#include "util/cuda/moderngpu/kernel_scan.hxx"
#include "Vector/vector_dist.hpp"
#define SUB_UNIT_FACTOR 1024
...
...
@@ -1070,5 +1071,124 @@ BOOST_AUTO_TEST_CASE(vector_dist_gpu_map_fill_send_buffer_test)
}
}
/*! \brief Check remove_marked when property prp is used as the removal marker
 *
 * A periodic unit-box vector_dist_gpu with 5000*v_cl.size() particles is created,
 * the local particles are numbered 1,2,3,... on all four properties and property
 * prp is overwritten with 1 for every third particle and 0 for the others.
 * After remove_marked the local size must have lost one third of the particles
 * and none of the other properties may hold a multiple of 3. A second call with
 * prp reset to 0 must leave the local size unchanged.
 *
 * \tparam prp property used as marker
 *
 */
template<unsigned int prp>
void vector_dist_remove_marked_type()
{
	auto & cluster = create_vcluster();

	// nothing is tested on more than 16 processors
	if (cluster.size() > 16)
	{
		return;
	}

	Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});

	// ghost extension
	Ghost<3,float> g(0.1);

	// periodic in every direction
	size_t bc[3] = {PERIODIC,PERIODIC,PERIODIC};

	vector_dist_gpu<3,float,aggregate<float,float,int,int>> vd(5000*cluster.size(),domain,bc,g);

	// one running counter per property, all of them start from 1
	float count_p0 = 1.0;
	float count_p1 = 1.0;
	int count_p2 = 1;
	int count_p3 = 1;

	for (auto fill_it = vd.getDomainIterator() ; fill_it.isNext() ; ++fill_it)
	{
		auto key = fill_it.get();

		vd.template getProp<0>(key) = count_p0;
		vd.template getProp<1>(key) = count_p1;
		vd.template getProp<2>(key) = count_p2;
		vd.template getProp<3>(key) = count_p3;

		// the marker property is overwritten: 1 on every third particle
		vd.template getProp<prp>(key) = (count_p2 % 3 == 0);

		count_p0 += 1.0;
		count_p1 += 1.0;
		count_p2 += 1;
		count_p3 += 1;
	}

	// one particle out of three is expected to disappear
	size_t sz = vd.size_local() - vd.size_local()/3;

	vd.template hostToDeviceProp<0,1,2,3>();

	remove_marked<prp>(vd);

	BOOST_REQUIRE_EQUAL(vd.size_local(),sz);

	vd.template deviceToHostProp<0,1,2,3>();

	// No property other than the marker may contain a multiple of 3
	bool test = true;

	for (auto check_it = vd.getDomainIterator() ; check_it.isNext() ; ++check_it)
	{
		auto key = check_it.get();

		if (prp != 0)
		{
			test &= ((int)vd.template getProp<0>(key) % 3 != 0);
		}

		if (prp != 1)
		{
			test &= ((int)vd.template getProp<1>(key) % 3 != 0);
		}

		if (prp != 2)
		{
			test &= ((int)vd.template getProp<2>(key) % 3 != 0);
		}

		if (prp != 3)
		{
			test &= ((int)vd.template getProp<3>(key) % 3 != 0);
		}

		if (!test)
		{
			// print the values of the first offending particle and stop
			if (prp != 0)
			{
				std::cout << (int)vd.template getProp<0>(key) << std::endl;
			}

			if (prp != 1)
			{
				std::cout << (int)vd.template getProp<1>(key) << std::endl;
			}

			if (prp != 2)
			{
				std::cout << (int)vd.template getProp<2>(key) << std::endl;
			}

			if (prp != 3)
			{
				std::cout << (int)vd.template getProp<3>(key) << std::endl;
			}

			break;
		}
	}

	BOOST_REQUIRE_EQUAL(test,true);

	// Second scenario: nothing is marked, nothing must be removed
	size_t size_old = vd.size_local();

	// remove_marked does not preserve the marker, so it is cleared first
	vd.getPropVector().template fill<prp>(0);

	remove_marked<prp>(vd);

	BOOST_REQUIRE_EQUAL(vd.size_local(),size_old);
}
/* Exercise remove_marked with each of the four properties of the aggregate
 * taking the role of the marker in turn
 */
BOOST_AUTO_TEST_CASE( vector_dist_remove_marked )
{
	vector_dist_remove_marked_type<0>();
	vector_dist_remove_marked_type<1>();
	vector_dist_remove_marked_type<2>();
	vector_dist_remove_marked_type<3>();
}
BOOST_AUTO_TEST_SUITE_END
()
src/Vector/cuda/vector_dist_cuda_funcs.cuh
View file @
509a43a2
...
...
@@ -10,6 +10,7 @@
#include "Vector/util/vector_dist_funcs.hpp"
#include "util/cuda/moderngpu/kernel_reduce.hxx"
#include "util/cuda/moderngpu/kernel_scan.hxx"
#include "Decomposition/common.hpp"
template
<
unsigned
int
dim
,
typename
St
,
typename
decomposition_type
,
typename
vector_type
,
typename
start_type
,
typename
output_type
>
...
...
@@ -102,6 +103,20 @@ __global__ void find_buffer_offsets(vector_type vd, int * cnt, vector_type_offs
}
}
/*! \brief Record every position at which property prp_off changes value between neighbours
 *
 * Thread p compares element p with element p+1; when they differ it reserves
 * a slot with an atomic increment of *cnt and writes p+1 in that slot of offs.
 *
 * \tparam prp_off property compared between consecutive elements
 *
 * \param vd vector whose property is scanned
 * \param cnt counter incremented once for each change found
 * \param offs output vector, the slot order is the one given by atomicAdd
 *
 * \note no bound check is done on offs, it is assumed to have room for
 *       every change found -- TODO confirm at the call sites
 *
 */
template<unsigned int prp_off, typename vector_type, typename vector_type_offs>
__global__ void find_buffer_offsets_no_prc(vector_type vd, int * cnt, vector_type_offs offs)
{
	const int p = threadIdx.x + blockIdx.x * blockDim.x;
	const int last = (int)vd.size() - 1;

	// only the elements that have a right neighbour are compared
	if (p < last)
	{
		const int next = p + 1;

		if (vd.template get<prp_off>(p) != vd.template get<prp_off>(next))
		{
			const int slot = atomicAdd(cnt, 1);
			offs.template get<0>(slot) = next;
		}
	}
}
template
<
typename
vector_m_opart_type
,
typename
vector_pos_type_out
,
typename
vector_prp_type_out
,
typename
vector_pos_type_in
,
typename
vector_prp_type_in
>
__global__
void
process_map_particles
(
vector_m_opart_type
m_opart
,
vector_pos_type_out
m_pos
,
vector_prp_type_out
m_prp
,
...
...
@@ -247,7 +262,15 @@ __global__ void reorder_lbl(vector_lbl_type m_opart, starts_type starts)
m_opart
.
template
get
<
0
>(
starts
.
template
get
<
0
>(
pr
)
+
m_opart
.
template
get
<
2
>(
i
))
=
i
;
}
template
<
unsigned
int
prp
,
typename
vector_type
>
/*! \brief Sum operation, a one-parameter template wrapper around mgpu::plus_t
 *
 * \tparam red_type type of the values combined
 *
 */
template<typename red_type>
struct _add_ : public mgpu::plus_t<red_type>
{
};
/*! \brief Maximum operation, a one-parameter template wrapper around mgpu::maximum_t
 *
 * \tparam red_type type of the values combined
 *
 */
template<typename red_type>
struct _max_ : public mgpu::maximum_t<red_type>
{
};
template
<
unsigned
int
prp
,
template
<
typename
>
class
op
,
typename
vector_type
>
auto
reduce
(
vector_type
&
vd
)
->
typename
std
::
remove_reference
<
decltype
(
vd
.
template
getProp
<
prp
>(
0
))
>::
type
{
typedef
typename
std
::
remove_reference
<
decltype
(
vd
.
template
getProp
<
prp
>(
0
))
>::
type
reduce_type
;
...
...
@@ -257,11 +280,117 @@ auto reduce(vector_type & vd) -> typename std::remove_reference<decltype(vd.temp
mgpu
::
reduce
((
reduce_type
*
)
vd
.
getPropVector
().
template
getDeviceBuffer
<
prp
>(),
vd
.
size_local
(),
(
reduce_type
*
)
mem
.
getDevicePointer
()
,
mgpu
::
plus_t
<
reduce_type
>
(),
vd
.
getVC
().
getmgpuContext
());
op
<
reduce_type
>
(),
vd
.
getVC
().
getmgpuContext
());
mem
.
deviceToHost
();
return
*
(
reduce_type
*
)(
mem
.
getPointer
());
}
/*! \brief Fill property 0 of vd with the element index (vd[i] = i)
 *
 * One thread per element, threads beyond vd.size() do nothing.
 *
 * \param vd vector to fill
 *
 */
template<typename vector_type>
__global__ void create_index(vector_type vd)
{
	const int tid = threadIdx.x + blockIdx.x * blockDim.x;

	if (tid < vd.size())
	{
		vd.template get<0>(tid) = tid;
	}
}
/*! \brief Gather positions and properties from the source vectors into the destination vectors
 *
 * Destination element i receives the source element idx[i]. One thread per
 * destination element, threads beyond vd_prp_dst.size() do nothing.
 *
 * \tparam dim number of position components copied
 *
 * \param vd_pos_dst destination positions
 * \param vd_prp_dst destination properties
 * \param vd_pos_src source positions
 * \param vd_prp_src source properties
 * \param idx for each destination element the index of the source element
 *
 */
template<unsigned int dim, typename vector_pos_type, typename vector_prp_type, typename scan_type>
__global__ void copy_new_to_old(vector_pos_type vd_pos_dst, vector_prp_type vd_prp_dst, vector_pos_type vd_pos_src, vector_prp_type vd_prp_src, scan_type idx)
{
	const int dst = threadIdx.x + blockIdx.x * blockDim.x;

	if (dst < vd_prp_dst.size())
	{
		// position, one component at a time
		for (unsigned int c = 0 ; c < dim ; ++c)
		{
			vd_pos_dst.template get<0>(dst)[c] = vd_pos_src.template get<0>(idx.template get<0>(dst))[c];
		}

		// all the properties in one shot
		vd_prp_dst.set(dst,vd_prp_src,idx.template get<0>(dst));
	}
}
/*! \brief Remove the particles marked on the properties prp (particles marked has has property set to 1, the others to 0)
*
* \warning the function is destructive on prp, it mean that after destruction the prp of the particles can contain garbage
*
* \tparam prp property that indicate the particles to remove
*
* \param vd distributed vector
*
*/
template
<
unsigned
int
prp
,
typename
vector_type
>
void
remove_marked
(
vector_type
&
vd
)
{
// This function make sense only if prp is an int or unsigned int
if
(
std
::
is_same
<
typename
boost
::
mpl
::
at
<
typename
vector_type
::
value_type
::
type
,
boost
::
mpl
::
int_
<
prp
>>::
type
,
int
>::
value
==
false
&&
std
::
is_same
<
typename
boost
::
mpl
::
at
<
typename
vector_type
::
value_type
::
type
,
boost
::
mpl
::
int_
<
prp
>>::
type
,
unsigned
int
>::
value
==
false
&&
std
::
is_same
<
typename
boost
::
mpl
::
at
<
typename
vector_type
::
value_type
::
type
,
boost
::
mpl
::
int_
<
prp
>>::
type
,
float
>::
value
==
false
&&
std
::
is_same
<
typename
boost
::
mpl
::
at
<
typename
vector_type
::
value_type
::
type
,
boost
::
mpl
::
int_
<
prp
>>::
type
,
double
>::
value
==
false
&&
std
::
is_same
<
typename
boost
::
mpl
::
at
<
typename
vector_type
::
value_type
::
type
,
boost
::
mpl
::
int_
<
prp
>>::
type
,
size_t
>::
value
==
false
)
{
std
::
cout
<<
__FILE__
<<
":"
<<
__LINE__
<<
" error, the function remove_marked work only if is an integer or unsigned int"
<<
std
::
endl
;
return
;
}
typedef
typename
boost
::
mpl
::
at
<
typename
vector_type
::
value_type
::
type
,
boost
::
mpl
::
int_
<
prp
>>::
type
remove_type
;
// first we do a scan of the property
openfpm
::
vector_gpu
<
aggregate
<
unsigned
int
>>
idx
;
idx
.
resize
(
vd
.
size_local
());
auto
ite
=
idx
.
getGPUIterator
();
create_index
<<<
ite
.
wthr
,
ite
.
thr
>>>
(
idx
.
toKernel
());
// sort particles, so the particles to remove stay at the end
mergesort
((
remove_type
*
)
vd
.
getPropVector
().
template
getDeviceBuffer
<
prp
>(),(
unsigned
int
*
)
idx
.
template
getDeviceBuffer
<
0
>(),
idx
.
size
(),
mgpu
::
template
less_t
<
remove_type
>(),
vd
.
getVC
().
getmgpuContext
());
openfpm
::
vector_gpu
<
aggregate
<
int
>>
mark
;
mark
.
resize
(
1
);
CudaMemory
mem
;
mem
.
allocate
(
sizeof
(
int
));
mem
.
fill
(
0
);
// mark point, particle that stay and to remove
find_buffer_offsets_no_prc
<
prp
,
decltype
(
vd
.
getPropVector
().
toKernel
()),
decltype
(
mark
.
toKernel
())
><<<
ite
.
wthr
,
ite
.
thr
>>>
(
vd
.
getPropVector
().
toKernel
(),(
int
*
)
mem
.
getDevicePointer
(),
mark
.
toKernel
());
mem
.
deviceToHost
();
// we have no particles to remove
if
(
*
(
int
*
)
mem
.
getPointer
()
==
0
)
{
return
;}
// Get the mark point
mark
.
template
deviceToHost
<
0
>();
// than create an equivalent buffer prop and pos
typename
std
::
remove_reference
<
decltype
(
vd
.
getPosVector
())
>::
type
vd_pos_new
;
typename
std
::
remove_reference
<
decltype
(
vd
.
getPropVector
())
>::
type
vd_prp_new
;
// resize them
vd_pos_new
.
resize
(
mark
.
template
get
<
0
>(
0
));
vd_prp_new
.
resize
(
mark
.
template
get
<
0
>(
0
));
auto
&
vd_pos_old
=
vd
.
getPosVector
();
auto
&
vd_prp_old
=
vd
.
getPropVector
();
// now we copy from the old vector to the new one
ite
=
vd_pos_old
.
getGPUIterator
();
copy_new_to_old
<
vector_type
::
dims
><<<
ite
.
wthr
,
ite
.
thr
>>>
(
vd_pos_new
.
toKernel
(),
vd_prp_new
.
toKernel
(),
vd_pos_old
.
toKernel
(),
vd_prp_old
.
toKernel
(),
idx
.
toKernel
());
// and we swap
vd
.
set_g_m
(
vd_pos_new
.
size
());
vd
.
getPosVector
().
swap
(
vd_pos_new
);
vd
.
getPropVector
().
swap
(
vd_prp_new
);
}
#endif
/* VECTOR_DIST_CUDA_FUNCS_CUH_ */
src/Vector/cuda/vector_dist_gpu_unit_tests.cu
View file @
509a43a2
...
...
@@ -92,7 +92,8 @@ template<typename CellList_type>
__global__
void
calculate_force_full_sort
(
vector_dist_ker
<
3
,
float
,
aggregate
<
float
,
float
[
3
],
float
[
3
]
>>
vd
,
CellList_type
cl
,
int
rank
)
{
auto
p
=
GET_PARTICLE_SORT
(
cl
);
unsigned
int
p
;
GET_PARTICLE_SORT
(
p
,
cl
);
unsigned
int
ns_id
=
cl
.
getSortToNonSort
().
template
get
<
0
>(
p
);
...
...
@@ -731,16 +732,28 @@ BOOST_AUTO_TEST_CASE(vector_dist_reduce)
vd
.
template
hostToDeviceProp
<
0
,
1
,
2
,
3
>();
float
redf
=
reduce
<
0
>
(
vd
);
double
redd
=
reduce
<
1
>
(
vd
);
int
redi
=
reduce
<
2
>
(
vd
);
size_t
reds
=
reduce
<
3
>
(
vd
);
float
redf
=
reduce
<
0
,
_add_
>
(
vd
);
double
redd
=
reduce
<
1