Commit df7c8617 authored by incardon's avatar incardon
Browse files

Fixing performance

parent f84f0a37
Pipeline #4008 passed with stages
in 23 minutes and 36 seconds
......@@ -98,9 +98,9 @@ __global__ void copy_ndim_grid_device(grid_type src, grid_type dst)
template<bool inte_or_lin,unsigned int dim, typename T>
struct grid_toKernelImpl
{
template<typename grid_type> static grid_gpu_ker<dim,T,memory_traits_lin> toKernel(grid_type & gc)
template<typename grid_type> static grid_gpu_ker<dim,T,memory_traits_lin,typename grid_type::linearizer_type> toKernel(grid_type & gc)
{
grid_gpu_ker<dim,T,memory_traits_lin> g(gc.getGrid());
grid_gpu_ker<dim,T,memory_traits_lin,typename grid_type::linearizer_type> g(gc.getGrid());
g.get_data_().disable_manage_memory();
g.get_data_().mem = gc.get_internal_data_().mem;
......@@ -116,9 +116,9 @@ struct grid_toKernelImpl
template<unsigned int dim, typename T>
struct grid_toKernelImpl<true,dim,T>
{
template<typename grid_type> static grid_gpu_ker<dim,T,memory_traits_inte> toKernel(grid_type & gc)
template<typename grid_type> static grid_gpu_ker<dim,T,memory_traits_inte, typename grid_type::linearizer_type> toKernel(grid_type & gc)
{
grid_gpu_ker<dim,T,memory_traits_inte> g(gc.getGrid());
grid_gpu_ker<dim,T,memory_traits_inte, typename grid_type::linearizer_type> g(gc.getGrid());
copy_switch_memory_c_no_cpy<typename std::remove_reference<decltype(gc.get_internal_data_())>::type,
typename std::remove_reference<decltype(g.get_data_())>::type> cp_mc(gc.get_internal_data_(),g.get_data_());
......
......@@ -4,7 +4,8 @@
#include <stdio.h>
#include "cuda_grid_unit_tests_func.cuh"
__global__ void grid_gradient_vector(grid_gpu_ker<3,Point_aggr_test,memory_traits_inte> g1, grid_gpu_ker<3,Point_aggr_test,memory_traits_inte> g2, ite_gpu<3> ite_gpu)
template<typename grid_type1, typename grid_type2>
__global__ void grid_gradient_vector(grid_type1 g1, grid_type2 g2, ite_gpu<3> ite_gpu)
{
GRID_ID_3(ite_gpu);
......@@ -13,7 +14,8 @@ __global__ void grid_gradient_vector(grid_gpu_ker<3,Point_aggr_test,memory_trait
g2.template get<4>(key)[2] = (g1.template get<0>(key.move(2,1)) - g1.template get<0>(key.move(2,-1))) / 2.0;
}
__global__ void grid_fill_vector(grid_gpu_ker<3,Point_aggr_test,memory_traits_inte> g1, ite_gpu<3> ite_gpu)
template<typename grid_type>
__global__ void grid_fill_vector(grid_type g1, ite_gpu<3> ite_gpu)
{
GRID_ID_3(ite_gpu);
......@@ -22,7 +24,8 @@ __global__ void grid_fill_vector(grid_gpu_ker<3,Point_aggr_test,memory_traits_in
g1.template get<4>(key)[2] = 3.0;
}
__global__ void grid_fill_vector2(grid_gpu_ker<3,Point_aggr_test,memory_traits_inte> g1, ite_gpu<3> ite_gpu)
template<typename grid_type>
__global__ void grid_fill_vector2(grid_type g1, ite_gpu<3> ite_gpu)
{
GRID_ID_3(ite_gpu);
......@@ -31,8 +34,8 @@ __global__ void grid_fill_vector2(grid_gpu_ker<3,Point_aggr_test,memory_traits_i
g1.template get<4>(key)[2] = 1003.0;
}
__global__ void compute_stencil_grid(grid_gpu_ker<3,Point_aggr_test,memory_traits_inte> g1, grid_gpu_ker<3,Point_aggr_test,memory_traits_inte> g2, ite_gpu<3> ite_gpu)
template<typename grid_type>
__global__ void compute_stencil_grid(grid_type g1, grid_type g2, ite_gpu<3> ite_gpu)
{
GRID_ID_3(ite_gpu);
......
......@@ -142,14 +142,14 @@ __device__ void fill_grid_error_array(size_t lin_id)
* \tparam n_buf number of template buffers
*
*/
template<unsigned int dim, typename T, template <typename> class layout_base>
template<unsigned int dim, typename T, template <typename> class layout_base, typename linearizer>
class grid_gpu_ker
{
//! Type T
typedef typename apply_transform<layout_base,T>::type T_;
//! grid information
grid_sm<dim,void> g1;
linearizer g1;
//! type of layout of the structure
typedef typename layout_base<T_>::type layout;
......@@ -201,7 +201,7 @@ public:
__device__ __host__ grid_gpu_ker()
{}
__device__ __host__ grid_gpu_ker(const grid_sm<dim,void> & g1)
__device__ __host__ grid_gpu_ker(const linearizer & g1)
:g1(g1)
{
}
......@@ -209,7 +209,12 @@ public:
__device__ __host__ grid_gpu_ker(const grid_gpu_ker & cpy)
:g1(cpy.g1)
{
// std::cout << "Constructing " << &cpy << std::endl;
grid_gpu_ker_constructor_impl<is_layout_inte<layout_base<T_>>::value,T_>::construct(cpy,*this);
}
__device__ __host__ void constructor_impl(const grid_gpu_ker & cpy)
{
g1 = cpy.g1;
grid_gpu_ker_constructor_impl<is_layout_inte<layout_base<T_>>::value,T_>::construct(cpy,*this);
}
......@@ -337,7 +342,7 @@ public:
}
__device__ inline void set(const grid_key_dx<dim> & key1,const grid_gpu_ker<dim,T_,layout_base> & g, const grid_key_dx<dim> & key2)
__device__ inline void set(const grid_key_dx<dim> & key1,const grid_gpu_ker<dim,T_,layout_base, linearizer> & g, const grid_key_dx<dim> & key2)
{
#ifdef SE_CLASS1
if (check_bound(key1) == false)
......@@ -351,7 +356,7 @@ public:
this->get_o(key1) = g.get_o(key2);
}
template<unsigned int ... prp> __device__ inline void set(const grid_key_dx<dim> & key1,const grid_gpu_ker<dim,T_,layout_base> & g, const grid_key_dx<dim> & key2)
template<unsigned int ... prp> __device__ inline void set(const grid_key_dx<dim> & key1,const grid_gpu_ker<dim,T_,layout_base, linearizer> & g, const grid_key_dx<dim> & key2)
{
#ifdef SE_CLASS1
if (check_bound(key1) == false)
......@@ -413,7 +418,7 @@ public:
* \param object to copy
*
*/
grid_gpu_ker<dim,T,layout_base> & operator=(const grid_gpu_ker<dim,T,layout_base> & g)
grid_gpu_ker<dim,T,layout_base,linearizer> & operator=(const grid_gpu_ker<dim,T,layout_base,linearizer> & g)
{
g1 = g.g1;
......
......@@ -570,6 +570,12 @@ private:
}
else
grid_new.setMemory();
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T>>::value,dim,T>::toKernel(*this);
#endif
}
public:
......@@ -650,6 +656,12 @@ public:
{
swap(g.duplicate());
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T>>::value,dim,T>::toKernel(*this);
#endif
return *this;
}
......@@ -664,6 +676,12 @@ public:
{
swap(g);
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T>>::value,dim,T>::toKernel(*this);
#endif
return *this;
}
......@@ -773,6 +791,13 @@ public:
void setMemory()
{
mem_setm<S,layout_base<T>,decltype(this->data_),decltype(this->g1)>::setMemory(data_,g1,is_mem_init);
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T>>::value,dim,T>::toKernel(*this);
#endif
}
/*! \brief Set the object that provide memory from outside
......@@ -791,17 +816,17 @@ public:
//! Is external
isExternal = true;
//! Create and set the memory allocator
// data_.setMemory(m);
//! Allocate the memory and create the reppresentation
// if (g1.size() != 0) data_.allocate(g1.size());
bool skip_ini = skip_init<has_noPointers<T>::value,T>::skip_();
mem_setmemory<decltype(data_),S,layout_base<T>>::template setMemory<p>(data_,m,g1.size(),skip_ini);
is_mem_init = true;
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T>>::value,dim,T>::toKernel(*this);
#endif
}
/*! \brief Set the object that provide memory from outside
......@@ -825,6 +850,12 @@ public:
mem_setmemory<decltype(data_),S,layout_base<T>>::template setMemoryArray(*this,m,g1.size(),skip_ini);
is_mem_init = true;
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = grid_toKernelImpl<is_layout_inte<layout_base<T>>::value,dim,T>::toKernel(*this);
#endif
}
/*! \brief Return a plain pointer to the internal data
......@@ -1315,18 +1346,7 @@ public:
resize_impl_memset(grid_new);
// We know that, if it is 1D we can safely copy the memory
// if (dim == 1)
// {
// //! 1-D copy (This case is simple we use raw memory copy because is the fastest option)
// grid_new.data_.mem->copy(*data_.mem);
// }
// else
// {
// It should be better to separate between fast and slow cases
//! N-D copy
//! N-D copy
if (opt & DATA_ON_HOST)
{resize_impl_host(sz,grid_new);}
......@@ -1388,30 +1408,27 @@ public:
*
* This is a different from the standard swap and require long explanation.
*
* This object by default when it construct after we call setMemory() it create an internal memory object
* and use it allocate memory internally.
* This object (grid) by default when it constructs and after we call setMemory() it create an internal memory object
* and use it allocate memory internally. (Mode 1)
*
* If instead we use setMemory(external_mem) this object does not create an internal memory object but use
* the passed object to allocate memory. Because the external memory can already have a pool of memory preallocated
* we can re-use the memory.
*
* Despite this setMemory can be used to do memory retaining/re-use and/or garbage collection.
* It can be seen from a different prospective of making the data structures act like a representation of external
* memory. De facto we are giving meaning to the external memory so we are shaping or re-shaping pre-existing
* external memory.
* the passed object to allocate memory. (Mode 2)
*
* In the following I will call these two mode Mode1 and Mode2
* External memory can be used to do memory retaining/re-use and/or garbage collection and making the data structures
* act like a representation of external memory
*
* Using the structure in this way has consequences, because now in Mode2 the memory (and so its life-span) is disentangled
* Using the structure in this way has consequences, because in Mode2 the memory (and so its life-span) is disentangled
* by its structure.
*
*
* The main difference comes when we swap object in which one of both are in Mode2
* The problem comes when we swap object in which one the structure is in Mode2
*
* Let's suppose object A is in Mode1 and object B is is Mode2. The normal swap, fully swap the objects
*
* A.swap(B) A become B (in mode 2) and B become A (in mode 1)
*
* swap nomode require that A and B have the same size.
*
* A.swap_nomode(B) In this case the mode is not swapped A become B (in mode 1) and B become A (in mode 2).
* So the mode is not swapped and remain the original
*
......@@ -1669,7 +1686,32 @@ public:
return grid_key_dx_iterator<dim>(gvoid);
}
#ifdef CUDA_GPU
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
mutable grid_gpu_ker<dim,T,layout_base,linearizer_type> base_gpu;
/*! \brief Convert the grid into a data-structure compatible for computing into GPU
*
* The object created can be considered like a reference of the original
*
*/
grid_gpu_ker<dim,T,layout_base,linearizer_type> & toKernel()
{
return base_gpu;
}
/*! \brief Convert the grid into a data-structure compatible for computing into GPU
*
* The object created can be considered like a reference of the original
*
*/
const grid_gpu_ker<dim,T,layout_base,linearizer_type> & toKernel() const
{
return base_gpu;
}
#else
/*! \brief Convert the grid into a data-structure compatible for computing into GPU
*
......
......@@ -315,7 +315,7 @@ public:
* The object created can be considered like a reference of the original
*
*/
grid_gpu_ker<dim,T_,memory_traits_lin> toKernel()
grid_gpu_ker<dim,T_,memory_traits_lin,linearizer_type> toKernel()
{
return grid_toKernelImpl<is_layout_inte<memory_traits_lin<T_>>::value,dim,T_>::toKernel(*this);
}
......@@ -325,7 +325,7 @@ public:
* The object created can be considered like a reference of the original
*
*/
const grid_gpu_ker<dim,T_,memory_traits_lin> toKernel() const
const grid_gpu_ker<dim,T_,memory_traits_lin,linearizer_type> toKernel() const
{
return grid_toKernelImpl<is_layout_inte<memory_traits_lin<T_>>::value,dim,T_>::toKernel(*this);
}
......@@ -874,7 +874,7 @@ public:
* The object created can be considered like a reference of the original
*
*/
grid_gpu_ker<dim,T_,memory_traits_inte> toKernel()
grid_gpu_ker<dim,T_,memory_traits_inte,linearizer_type> toKernel()
{
return grid_toKernelImpl<is_layout_inte<memory_traits_inte<T_>>::value,dim,T_>::toKernel(*this);
}
......@@ -884,7 +884,7 @@ public:
* The object created can be considered like a reference of the original
*
*/
const grid_gpu_ker<dim,T_,memory_traits_inte> toKernel() const
const grid_gpu_ker<dim,T_,memory_traits_inte,linearizer_type> toKernel() const
{
return grid_toKernelImpl<is_layout_inte<memory_traits_inte<T>>::value,dim,T_>::toKernel(*this);
}
......
......@@ -127,7 +127,7 @@ namespace openfpm
unsigned int v_size;
//! 1-D static grid
grid_gpu_ker<1,T_,layout_base> base;
grid_gpu_ker<1,T_,layout_base,grid_sm<1,void>> base;
/*! \brief Check that the key is inside the grid
*
......@@ -180,7 +180,6 @@ namespace openfpm
return base.size();
}
/*! \brief Get an element of the vector
*
* Get an element of the vector
......@@ -344,10 +343,20 @@ namespace openfpm
:v_size(0)
{}
vector_gpu_ker(int v_size, const grid_gpu_ker<1,T_,layout_base> & cpy)
vector_gpu_ker(int v_size, const grid_gpu_ker<1,T_,layout_base,grid_sm<1,void>> & cpy)
:v_size(v_size),base(cpy)
{}
/*! \brief implementation of the constructor
*
* \param v_size number of elements
*
*/
inline void constructor_impl(int v_size, const grid_gpu_ker<1,T_,layout_base,grid_sm<1,void>> & cpy)
{
this->v_size = v_size;
base.constructor_impl(cpy);
}
/*! \brief Set the object id to obj
*
......@@ -491,7 +500,7 @@ namespace openfpm
* \return the base
*
*/
__device__ grid_gpu_ker<1,T_,layout_base> & getBase()
__device__ grid_gpu_ker<1,T_,layout_base, grid_sm<1,void>> & getBase()
{
return base;
}
......
......@@ -369,6 +369,12 @@ namespace openfpm
size_t sz[1] = {sp};
base.resize(sz);
}
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
}
/*! \brief Clear the vector
......@@ -390,6 +396,12 @@ namespace openfpm
{
size_t sz[1] = {size()};
base.resize(sz);
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
}
/*! \brief Resize the vector
......@@ -426,6 +438,12 @@ namespace openfpm
// update the vector size
v_size = slot;
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
}
......@@ -452,6 +470,12 @@ namespace openfpm
// update the vector size
v_size = slot;
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
}
//! Access key for the vector
......@@ -477,6 +501,12 @@ namespace openfpm
//! increase the vector size
v_size++;
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
}
/*! \brief It insert a new emtpy object on the vector, eventually it reallocate the grid
......@@ -499,6 +529,12 @@ namespace openfpm
//! increase the vector size
v_size++;
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
}
/*! \brief It insert a new object on the vector, eventually it reallocate the grid
......@@ -526,6 +562,12 @@ namespace openfpm
//! increase the vector size
v_size++;
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
}
/*! \brief It insert a new object on the vector, eventually it reallocate the vector
......@@ -554,6 +596,12 @@ namespace openfpm
//! increase the vector size
v_size++;
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
}
/*! \brief It add the element of another vector to this vector
......@@ -1434,6 +1482,12 @@ namespace openfpm
:v_size(0)
{
swap(v);
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
}
/*! \brief Constructor from another constant vector
......@@ -1445,6 +1499,12 @@ namespace openfpm
:v_size(0)
{
swap(v.duplicate());
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
}
//! Constructor, vector of size 0
......@@ -1452,6 +1512,12 @@ namespace openfpm
:v_size(0),base(0)
{
base.setMemory();
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
}
//! Constructor, vector of size sz
......@@ -1459,6 +1525,12 @@ namespace openfpm
:v_size(sz),base(sz)
{
base.setMemory();
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
}
/*! \brief Set the object id to obj
......@@ -1548,6 +1620,12 @@ namespace openfpm
v_size = mv.v_size;
base.swap(mv.base);
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
return *this;
}
......@@ -1575,6 +1653,12 @@ namespace openfpm
copy_two_vectors_activate_impl<Memory::isDeviceHostSame() == false>::copy2(*this,mv);
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
return *this;
}
......@@ -1592,6 +1676,12 @@ namespace openfpm
v_size = mv.v_size;
base.swap(mv.base);
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
return *this;
}
......@@ -1619,6 +1709,12 @@ namespace openfpm
copy_two_vectors_activate_impl<Memory::isDeviceHostSame() == false && Mem::isDeviceHostSame() == false>::copy2(*this,mv);
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
return *this;
}
......@@ -1637,6 +1733,12 @@ namespace openfpm
v_size = mv.v_size;
base.swap(mv.base);
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
return *this;
}
......@@ -1668,6 +1770,13 @@ namespace openfpm
copy_two_vectors_activate_impl<Memory::isDeviceHostSame() == false && Mem::isDeviceHostSame() == false>::copy2(*this,mv);
#if defined(CUDIFY_USE_SEQUENTIAL) || defined(CUDIFY_USE_OPENMP)
base_gpu = this->toKernel();
#endif
return *this;
}
......@@ -1719,6 +1828,13 @@ namespace openfpm
v_size = v.v_size;
base.swap_nomode(v.base);