Commit c3bedcd4 authored by incardon

Moving forward gpu + eliminating warning messages

parent aeb35be2
@@ -42,7 +42,7 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func)
auto gcf2 = c3.getGPUIterator(k3,k4);
BOOST_REQUIRE_EQUAL(gcf2.thr.x,16ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.x,13ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.z,8ul);
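The updated expectation follows from the new crop logic introduced further down in this commit; a worked sketch, assuming the x-extent of the [k3, k4] range is 13 cells:

// Illustrative arithmetic (hypothetical range extent of 13 in x):
// the decomposition first assigns thr.x = 16, then
//   wthr.x = 13/16 + (13%16 != 0 ? 1 : 0) = 1
// and since wthr.x == 1 the "crop if wthr == 1" step shrinks the
// block to the exact extent: thr.x = 13.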
@@ -3,14 +3,25 @@
#include "Point_test.hpp"
#include <stdio.h>
/*__global__ void grid_fill_vector(grid_gpu_ker<3,Point_test<float>> g1, ite_gpu ite_gpu)
__global__ void grid_gradient_vector(grid_gpu_ker<3,Point_test<float>> g1, grid_gpu_ker<3,Point_test<float>> g2, ite_gpu<3> ite_gpu)
{
GRID_ID_3(ite_gpu);
g2.template get<4>(key)[0] = (g1.template get<0>(key.move(0,1)) - g1.template get<0>(key.move(0,-1))) / 2.0;
g2.template get<4>(key)[1] = (g1.template get<0>(key.move(1,1)) - g1.template get<0>(key.move(1,-1))) / 2.0;
g2.template get<4>(key)[2] = (g1.template get<0>(key.move(2,1)) - g1.template get<0>(key.move(2,-1))) / 2.0;
}
__global__ void grid_fill_vector(grid_gpu_ker<3,Point_test<float>> g1, ite_gpu<3> ite_gpu)
{
GRID_ID_3(ite_gpu);
g1.template get<4>(key)[0] = 1.0;
}*/
g1.template get<4>(key)[1] = 2.0;
g1.template get<4>(key)[2] = 3.0;
}
__global__ void compute_stencil_grid(grid_gpu_ker<3,Point_test<float>> g1, grid_gpu_ker<3,Point_test<float>> g2, ite_gpu ite_gpu)
__global__ void compute_stencil_grid(grid_gpu_ker<3,Point_test<float>> g1, grid_gpu_ker<3,Point_test<float>> g2, ite_gpu<3> ite_gpu)
{
GRID_ID_3(ite_gpu);
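For readers following the kernels above: GRID_ID_3 derives the 3D grid key of the calling thread. Its full definition is not part of this diff; a plausible sketch, consistent with the bounds-check tail visible in the ite_gpu hunk further down, is:

// Hedged sketch of GRID_ID_3(ite_gpu); the real macro lives in the grid
// headers and only its final bounds check appears in this commit.
grid_key_dx<3> key;
key.set_d(0, blockIdx.x * blockDim.x + threadIdx.x + ite_gpu.start.get(0));
key.set_d(1, blockIdx.y * blockDim.y + threadIdx.y + ite_gpu.start.get(1));
key.set_d(2, blockIdx.z * blockDim.z + threadIdx.z + ite_gpu.start.get(2));
if (key.get(0) > ite_gpu.stop.get(0) || key.get(1) > ite_gpu.stop.get(1) || key.get(2) > ite_gpu.stop.get(2))
{return;}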
@@ -100,10 +111,17 @@ void gpu_grid_3D_compute_grid_stencil(grid_gpu<3,Point_test<float>> & g1, grid_g
compute_stencil_grid<<< gpu_it.wthr, gpu_it.thr >>>(g1k,g2k,gpu_it);
}
/*void gpu_grid_fill_vector(grid_gpu<3,Point_test<float>> & g1, grid_key_dx<3> & start, grid_key_dx<3> & stop)
void gpu_grid_fill_vector(grid_gpu<3,Point_test<float>> & g1, grid_key_dx<3> & start, grid_key_dx<3> & stop)
{
auto gpu_it = g1.getGPUIterator(start,stop);
grid_fill_vector<<< gpu_it.wthr, gpu_it.thr >>>(g1.toGPU(),gpu_it);
}*/
}
void gpu_grid_gradient_vector(grid_gpu<3,Point_test<float>> & g1, grid_gpu<3,Point_test<float>> & g2, grid_key_dx<3> & start, grid_key_dx<3> & stop)
{
auto gpu_it = g1.getGPUIterator(start,stop);
grid_gradient_vector<<< gpu_it.wthr, gpu_it.thr >>>(g1.toGPU(),g2.toGPU(),gpu_it);
}
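The wrapper above launches a plain central-difference stencil. The expected values in the new unit test below can be derived by hand, assuming (as the test values suggest) that gpu_grid_3D_compute fills property 0 with the linearized index of each point:

// Worked example (assumes property 0 holds id(x,y,z) = x + 64*y + 64*64*z
// on the 64^3 test grid):
//   d/dx: ((x+1) - (x-1)) / 2.0            = 1.0
//   d/dy: (64*(y+1) - 64*(y-1)) / 2.0      = 64.0
//   d/dz: (4096*(z+1) - 4096*(z-1)) / 2.0  = 4096.0
// which matches the (1.0, 64.0, 4096.0) checks in the new test case.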
@@ -16,6 +16,9 @@ void gpu_grid_3D_one(grid_gpu<3,Point_test<float>> & g);
void gpu_grid_3D_compute_grid_stencil(grid_gpu<3,Point_test<float>> & g1, grid_gpu<3,Point_test<float>> & g2,
grid_key_dx<3> & start, grid_key_dx<3> & stop);
//void gpu_grid_fill_vector(grid_gpu<3,Point_test<float>> & g1, grid_key_dx<3> & start, grid_key_dx<3> & stop);
void gpu_grid_fill_vector(grid_gpu<3,Point_test<float>> & g1, grid_key_dx<3> & start, grid_key_dx<3> & stop);
void gpu_grid_gradient_vector(grid_gpu<3,Point_test<float>> & g1, grid_gpu<3,Point_test<float>> & g2, grid_key_dx<3> & start, grid_key_dx<3> & stop);
#endif /* OPENFPM_DATA_SRC_GRID_CUDA_GPU_COMPUTE_CUH_ */
@@ -100,6 +100,8 @@ struct mem_get<p,layout,data_type,g1_type,key_type,1>
{
__host__ __device__ static inline auto get(data_type & data_, const g1_type & g1, const key_type & v1) -> decltype(boost::fusion::at_c<p>(data_).mem_r.operator[](g1.LinId(v1)))
{
// Evaluate the linearized index once and return the element
auto lin_id = g1.LinId(v1);
return boost::fusion::at_c<p>(data_).mem_r.operator[](lin_id);
}
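The point of adding __host__ __device__ here is that the layout accessor becomes callable from device code; a minimal sketch of the call chain, with the intermediate step inferred rather than shown in this diff:

// Inside a kernel (illustrative):
//   g1.template get<0>(key)
// resolves through mem_get<0, layout, ...>::get(data_, g1, key)
// and finally mem_r.operator[](g1.LinId(key)), all compiled for device.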
@@ -41,13 +41,14 @@
if (key.get(0) > ite_gpu.stop.get(0) || key.get(1) > ite_gpu.stop.get(1) || key.get(2) > ite_gpu.stop.get(2))\
{return;}
template<unsigned int dim>
struct ite_gpu
{
dim3 thr;
dim3 wthr;
grid_key_dx<3> start;
grid_key_dx<3> stop;
grid_key_dx<dim> start;
grid_key_dx<dim> stop;
};
#else
#define __host__
@@ -367,12 +368,23 @@ public:
}
#ifdef CUDA_GPU
struct ite_gpu getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = 1024)
/*! \brief Get an iterator for the GPU
*
* \param key1 start point
* \param key2 stop point
* \param n_thr maximum number of threads per block
*
*/
struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = 1024)
{
size_t n = n_thr;
size_t tot_work = 1;
for (size_t i = 0 ; i < dim ; i++)
{tot_work *= key2.get(i) - key1.get(i) + 1;}
size_t n = (tot_work <= n_thr)?openfpm::math::round_big_2(tot_work):n_thr;
// Work to do
ite_gpu ig;
ite_gpu<dim> ig;
ig.thr.x = 1;
ig.thr.y = 1;
@@ -392,11 +404,37 @@ public:
n = n >> 1;
dir++;
dir %= dim;
}
if (dim >= 1)
{ig.wthr.x = (key2.get(0) - key1.get(0) + 1) / ig.thr.x + (((key2.get(0) - key1.get(0) + 1)%ig.thr.x != 0)?1:0);}
if (dim >= 2)
{ig.wthr.y = (key2.get(1) - key1.get(1) + 1) / ig.thr.y + (((key2.get(1) - key1.get(1) + 1)%ig.thr.y != 0)?1:0);}
else
{ig.wthr.y = 1;}
if (dim >= 3)
{
// Roll the other dimensions on z
ig.wthr.z = 1;
for (size_t i = 2 ; i < dim ; i++)
{ig.wthr.z *= (key2.get(i) - key1.get(i) + 1) / ig.thr.z + (((key2.get(i) - key1.get(i) + 1)%ig.thr.z != 0)?1:0);}
}
else
{ig.wthr.z = 1;}
// crop if wthr == 1
if (dim >= 1 && ig.wthr.x == 1)
{ig.thr.x = (key2.get(0) - key1.get(0) + 1);}
if (dim >= 2 && ig.wthr.y == 1)
{ig.thr.y = key2.get(1) - key1.get(1) + 1;}
ig.wthr.x = (key2.get(0) - key1.get(0) + 1) / ig.thr.x + (((key2.get(0) - key1.get(0) + 1)%ig.thr.x != 0)?1:0);
ig.wthr.y = (key2.get(1) - key1.get(1) + 1) / ig.thr.y + (((key2.get(1) - key1.get(1) + 1)%ig.thr.y != 0)?1:0);
ig.wthr.z = (key2.get(2) - key1.get(2) + 1) / ig.thr.z + (((key2.get(2) - key1.get(2) + 1)%ig.thr.z != 0)?1:0);
if (dim == 3 && ig.wthr.z == 1)
{ig.thr.z = key2.get(2) - key1.get(2) + 1;}
ig.start = key1;
ig.stop = key2;
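To make the decomposition concrete, a worked example under the assumption that openfpm::math::round_big_2 rounds up to the next power of two:

// Illustrative run: key1 = (1,1,1), key2 = (62,62,62), n_thr = 1024.
// tot_work = 62^3 > 1024, so n = 1024 = 2^10 threads per block.
// The (partially visible) halving loop distributes the ten powers of two
// round-robin over the dimensions:
//   thr = (16, 8, 8)                                  (16*8*8 = 1024)
// Blocks are then rounded up per axis:
//   wthr = (ceil(62/16), ceil(62/8), ceil(62/8)) = (4, 8, 8)
// No wthr component is 1, so the crop step leaves thr untouched.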
@@ -450,7 +450,7 @@
}
//! Linearize a set of index
template<typename a> inline mem_id Lin(a v) const
template<typename a> __device__ __host__ inline mem_id Lin(a v) const
{
return v;
}
@@ -947,7 +947,78 @@ BOOST_AUTO_TEST_CASE (gpu_computation_grid_stencil)
// We also try to fill a vectorial quantity
// gpu_grid_fill_vector(c3,zero,keyl);
gpu_grid_fill_vector(c3,zero,keyl);
}
#endif
}
BOOST_AUTO_TEST_CASE (gpu_computation_grid_stencil_vector)
{
#ifdef CUDA_GPU
{
size_t sz[3] = {64,64,64};
grid_gpu<3, Point_test<float> > c3(sz);
grid_gpu<3, Point_test<float> > c2(sz);
grid_key_dx<3> key1({1,1,1});
grid_key_dx<3> zero({0,0,0});
grid_key_dx<3> key2({62,62,62});
grid_key_dx<3> keyl({63,63,63});
c3.setMemory();
c2.setMemory();
gpu_grid_fill_vector(c3,zero,keyl);
// Check that property 4 is the vector (1.0,2.0,3.0)
c3.deviceToHost<4>();
{
auto it = c3.getIterator(key1,key2);
bool good = true;
while(it.isNext())
{
auto key = it.get();
good &= c3.get<4>(key)[0] == 1.0;
good &= c3.get<4>(key)[1] == 2.0;
good &= c3.get<4>(key)[2] == 3.0;
++it;
}
BOOST_REQUIRE_EQUAL(good,true);
}
// Fill property 0 of c3 with the values to differentiate
gpu_grid_3D_compute(c3);
gpu_grid_gradient_vector(c3,c2,key1,key2);
// Check the gradient stored in property 4 of c2
c2.deviceToHost<4>();
{
auto it = c2.getIterator(key1,key2);
bool good = true;
while(it.isNext())
{
auto key = it.get();
good &= c2.get<4>(key)[0] == 1.0;
good &= c2.get<4>(key)[1] == 64.0;
good &= c2.get<4>(key)[2] == 4096.0;
++it;
}
BOOST_REQUIRE_EQUAL(good,true);
}
}
@@ -193,7 +193,7 @@ public:
*/
template<typename T_type>
struct copy_switch_memory_c
struct copy_switch_memory_c_no_cpy
{
//! encapsulated source object
const typename memory_traits_inte<T_type>::type & src;
@@ -207,7 +207,7 @@ struct copy_switch_memory_c
* \param dst destination encapsulated object
*
*/
inline copy_switch_memory_c(const typename memory_traits_inte<T_type>::type & src,
inline copy_switch_memory_c_no_cpy(const typename memory_traits_inte<T_type>::type & src,
typename memory_traits_inte<T_type>::type & dst)
:src(src),dst(dst)
{
@@ -222,7 +222,47 @@ struct copy_switch_memory_c
// Increment the reference of mem
boost::fusion::at_c<T::value>(dst).mem->incRef();
boost::fusion::at_c<T::value>(dst).mem_r.bind_ref(boost::fusion::at_c<T::value>(src).mem_r);
boost::fusion::at_c<T::value>(dst).switchToDevicePtr();
boost::fusion::at_c<T::value>(dst).switchToDevicePtrNoCopy();
}
};
/*! \brief this class is a functor for the "for_each" algorithm
*
* This class is a functor for the "for_each" algorithm. For each
* element of the boost::vector the operator() is called.
* It is used to switch the selected properties to their device
* pointers, copying the host data to the device
*
* \tparam T_type object type
* \tparam prp properties to switch
*
*/
template<typename T_type, unsigned int ... prp>
struct copy_host_to_device
{
//! encapsulated destination object
typename memory_traits_inte<T_type>::type & dst;
//! Convert the packed properties into an MPL vector
typedef typename to_boost_vmpl<prp...>::type v_prp;
/*! \brief constructor
*
* \param dst destination encapsulated object
*
*/
inline copy_host_to_device(typename memory_traits_inte<T_type>::type & dst)
:dst(dst)
{
};
//! It call the copy function for each property
template<typename T>
inline void operator()(T& t) const
{
boost::fusion::at_c<boost::mpl::at<v_prp,boost::mpl::int_<T::value>>::type::value>(dst).switchToDevicePtr();
}
};
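A minimal usage sketch of the new functor, mirroring how toGPU() applies it further down (the property indices are hypothetical):

// Switch properties 0 and 2 of an aggregate to their device pointers,
// copying the host data to the device in the process (illustrative):
copy_host_to_device<T, 0, 2> cpod(g.data_);
boost::mpl::for_each_ref< boost::mpl::range_c<int, 0, 2> >(cpod);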
@@ -254,7 +294,7 @@ struct grid_gpu_ker
grid_gpu_ker(const grid_gpu_ker & cpy)
:g1(cpy.g1)
{
copy_switch_memory_c<T> bp_mc(cpy.data_,this->data_);
copy_switch_memory_c_no_cpy<T> bp_mc(cpy.data_,this->data_);
boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(bp_mc);
}
@@ -436,13 +476,16 @@ public:
* The object created can be considered like a reference of the original
*
*/
grid_gpu_ker<dim,T> toGPU()
template<unsigned int ... prp> grid_gpu_ker<dim,T> toGPU()
{
grid_gpu_ker<dim,T> g(this->g1);
copy_switch_memory_c<T> cp_mc(this->data_,g.data_);
copy_switch_memory_c_no_cpy<T> cp_mc(this->data_,g.data_);
boost::mpl::for_each_ref< boost::mpl::range_c<int,0,T::max_prop> >(cp_mc);
copy_host_to_device<T,prp...> cpod(g.data_);
boost::mpl::for_each_ref< boost::mpl::range_c<int,0,sizeof...(prp)> >(cpod);
return g;
}
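On the caller side the template pack selects which properties are uploaded; the kernels in this commit still call the bare toGPU(), for which sizeof...(prp) == 0 and nothing is copied. An illustrative call with hypothetical property choices:

// Upload properties 0 and 4, then hand the kernel-side view to a kernel:
auto g_ker = g1.template toGPU<0, 4>();
grid_fill_vector<<< gpu_it.wthr, gpu_it.thr >>>(g_ker, gpu_it);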
@@ -10,7 +10,7 @@ else
endif
noinst_PROGRAMS = mem_map
mem_map_SOURCES = ../../openfpm_devices/src/Memleak_check.cpp main.cpp Grid/gpu_test/cuda_gpu_func.cpp $(CUDA_SOURCES) ../../openfpm_devices/src/memory/HeapMemory.cpp ../../openfpm_devices/src/memory/PtrMemory.cpp Grid/gpu_test/cuda_grid_unit_tests.cu
mem_map_SOURCES = ../../openfpm_devices/src/Memleak_check.cpp main.cpp NN/CellList/CellList_gpu_test.cu Grid/gpu_test/cuda_gpu_func.cpp $(CUDA_SOURCES) ../../openfpm_devices/src/memory/HeapMemory.cpp ../../openfpm_devices/src/memory/PtrMemory.cpp Grid/gpu_test/cuda_grid_unit_tests.cu
mem_map_CXXFLAGS = $(AM_CXXFLAGS) $(LIBHILBERT_INCLUDE) $(CUDA_CFLAGS) $(INCLUDES_PATH) $(BOOST_CPPFLAGS) -I/usr/local/include -I/usr/local/libhilbert/include
mem_map_CFLAGS = $(CUDA_CFLAGS)
mem_map_LDADD = $(LINKLIBS)
@@ -420,7 +420,7 @@ private:
size_t bc[dim];
for (size_t i = 0 ; i < dim ; i++)
bc[i] = NON_PERIODIC;
{bc[i] = NON_PERIODIC;}
Box<dim,long int> bx = cd_sm.convertDomainSpaceIntoCellUnits(dom_box,bc);
/*
* CellList_gpu.hpp
*
* Created on: Jun 11, 2018
* Author: i-bird
*/
#ifndef OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_
#define OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_
#include "config.h"
#ifdef CUDA_GPU
#include <cuda_runtime_api.h>
#include "CellDecomposer.hpp"
#include "Vector/map_vector.hpp"
#include "cuda/Cuda_cell_list_util_func.hpp"
constexpr int count = 0;
constexpr int start = 1;
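These two constants presumably name the two cnt_type fields of the cl_n aggregate below, in the usual counting-sort layout of a GPU cell list:

// Illustrative reading (not confirmed by this diff):
//   cl_n.template get<count>(c) -> number of particles falling in cell c
//   cl_n.template get<start>(c) -> start offset of cell c after a prefix sum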
template<unsigned int dim, typename T, typename Memory, typename cnt_type = int, typename ids_type = short int, typename transform = no_transform<dim,T>>
class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
{
//! \brief Cell information
openfpm::vector<aggregate<cnt_type,cnt_type>,Memory,typename memory_traits_inte<aggregate<cnt_type,cnt_type>>::type,memory_traits_inte> cl_n;
//! \brief particle information
openfpm::vector<aggregate<ids_type[dim+1]>,Memory,typename memory_traits_inte<aggregate<ids_type[dim+1]>>::type,memory_traits_inte> part_ids;
//! Spacing
CudaMemory spacing;
//! \brief number of sub-divisions in each direction
CudaMemory div;
//! Initialize the structures of the data structure
void InitializeStructures(const size_t (& div)[dim], size_t tot_n_cell)
{
spacing.allocate(sizeof(T)*dim);
this->div.allocate(dim*sizeof(ids_type));
ids_type (& div_p)[dim] = *static_cast<ids_type (*)[dim]>(this->div.getPointer());
T (& spacing_p)[dim] = *static_cast<T (*)[dim]>(this->spacing.getPointer());
for (size_t i = 0 ; i < dim ; i++)
{
div_p[i] = div[i];
spacing_p[i] = this->getCellBox().getP2().get(i);
}
// Force to copy into device
this->spacing.getDevicePointer();
this->div.getDevicePointer();
cl_n.resize(tot_n_cell);
}
public:
CellList_gpu(const Box<dim,T> & box, const size_t (&div)[dim], const size_t pad = 1)
{
Initialize(box,div,pad);
}
/*! Initialize the cell list
*
* \param box Domain where this cell list is living
* \param div grid size on each dimension
* \param pad padding cell
*
*/
void Initialize(const Box<dim,T> & box, const size_t (&div)[dim], const size_t pad = 1)
{
SpaceBox<dim,T> sbox(box);
// Initialize point transformation
Initialize(sbox,div,pad);
}
/*! Initialize the cell list
*
* \param box Domain where this cell list is living
* \param div grid size on each dimension
* \param pad padding cell
*
*/
void Initialize(const SpaceBox<dim,T> & box, const size_t (&div)[dim], const size_t pad = 1)
{
Matrix<dim,T> mat;
CellDecomposer_sm<dim,T,transform>::setDimensions(box,div, mat, pad);
// create the array that stores the number of particles in each cell and set it to 0
InitializeStructures(this->gr_cell.getSize(),this->gr_cell.size());
}
/*! \brief construct from a list of particles
*
* \param pl Particles list
*
*/
template<typename vector> void construct(vector & pl)
{
// First we set the count memory to zero
CUDA_SAFE(cudaMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(cnt_type)));
part_ids.resize(pl.size());
// Then we construct the ids
auto ite_gpu = pl.getGPUIterator();
subindex<dim,T,cnt_type,ids_type><<<ite_gpu.wthr,ite_gpu.thr>>>(*static_cast<ids_type (*)[dim]>(div.getDevicePointer()),
*static_cast<T (*)[dim]>(spacing.getDevicePointer()),
pl.capacity(),
pl.size(),
static_cast<T *>(pl.template getDeviceBuffer<0>()),
static_cast<cnt_type *>(cl_n.template getDeviceBuffer<0>()),
static_cast<ids_type *>(part_ids.template getDeviceBuffer<0>()));
}
};
#endif
#endif /* OPENFPM_DATA_SRC_NN_CELLLIST_CELLLIST_GPU_HPP_ */
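The counting phase relies on the subindex kernel from cuda/Cuda_cell_list_util_func.hpp, which this diff only calls. Below is a hedged sketch of what such a kernel plausibly does, given the arguments passed in construct(); the name subindex_sketch, the body, and the capacity-strided SoA convention are assumptions, not the library's actual code:

template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
__global__ void subindex_sketch(const ids_type div[dim], const T spacing[dim],
                                size_t capacity, size_t n,
                                const T * pos, cnt_type * counts, ids_type * p_ids)
{
	size_t p = blockIdx.x * blockDim.x + threadIdx.x;
	if (p >= n) return;

	cnt_type cell = 0;
	for (unsigned int i = 0 ; i < dim ; i++)
	{
		// component i of particle p, assuming a capacity-strided SoA layout
		ids_type c = (ids_type)(pos[i*capacity + p] / spacing[i]);
		p_ids[i*capacity + p] = c;   // remember the per-axis cell coordinate
		cell = cell * div[i] + c;    // linearize the cell index on the fly
	}
	atomicAdd(&counts[cell], 1);     // counting phase of the cell list
}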
/*
* CellList_gpu_test.cpp
*
* Created on: Jun 13, 2018
* Author: i-bird
*/
#define BOOST_GPU_ENABLED __host__ __device__
#include "config.h"
#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
#include "util/cuda_util.hpp"
#include "CellList_gpu.hpp"
BOOST_AUTO_TEST_SUITE( CellList_gpu_test )
template<unsigned int dim, typename T, typename cnt_type, typename ids_type>
void test_sub_index()
{
openfpm::vector<aggregate<cnt_type,cnt_type>,CudaMemory,typename memory_traits_inte<aggregate<cnt_type,cnt_type>>::type,memory_traits_inte> cl_n;
openfpm::vector<aggregate<ids_type[dim+1]>,CudaMemory,typename memory_traits_inte<aggregate<ids_type[dim+1]>>::type,memory_traits_inte> part_ids;
// fill with some particles
openfpm::vector<Point<dim,T>,CudaMemory,typename memory_traits_inte<Point<dim,T>>::type,memory_traits_inte> pl;
// create 3 particles
Point<dim,T> p1({0.2,0.2,0.2});
Point<dim,T> p2({0.9,0.2,0.2});
Point<dim,T> p3({0.2,0.9,0.2});
Point<dim,T> p4({0.2,0.2,0.9});
Point<dim,T> p5({0.9,0.9,0.2});
Point<dim,T> p6({0.9,0.2,0.9});
Point<dim,T> p7({0.2,0.9,0.9});
Point<dim,T> p8({0.9,0.9,0.9});
Point<dim,T> p9({0.0,0.0,0.0});
Point<dim,T> p10({0.205,0.205,0.205});
pl.add(p1);
pl.add(p2);
pl.add(p3);
pl.add(p4);
pl.add(p5);
pl.add(p6);
pl.add(p7);
pl.add(p8);
pl.add(p9);
pl.add(p10);
CudaMemory spacing;
CudaMemory div;
spacing.allocate(sizeof(T)*dim);
div.allocate(dim*sizeof(ids_type));
ids_type (& div_p)[dim] = *static_cast<ids_type (*)[dim]>(div.getPointer());
T (& spacing_p)[dim] = *static_cast<T (*)[dim]>(spacing.getPointer());
for (size_t i = 0 ; i < dim ; i++)
{
div_p[i] = 17;
spacing_p[i] = 0.1;
}
cl_n.resize(17*17*17);
CUDA_SAFE(cudaMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(cnt_type)));
part_ids.resize(pl.size());
size_t sz[3] = {17,17,17};
grid_sm<3,void> gr(sz);
// Force to copy into device
spacing.getDevicePointer();
div.getDevicePointer();
auto ite = pl.getGPUIterator();
subindex<dim,T,cnt_type,ids_type><<<ite.wthr,ite.thr>>>(*static_cast<ids_type (*)[dim]>(div.getDevicePointer()),
*static_cast<T (*)[dim]>(spacing.getDevicePointer()),
pl.capacity(),
pl.size(),
static_cast<T *>(pl.template getDeviceBuffer<0>()),