Commit d78de391 authored by incardon

Merge branch 'master' into develop

parents 23e6bc49 1e67b751
Pipeline #3052 passed with stages in 23 minutes and 43 seconds
@@ -5,7 +5,7 @@ if (POLICY CMP0074)
cmake_policy(SET CMP0074 OLD)
endif ()
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/)
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake_modules/ /opt/rocm/hip/cmake)
set(BOOST_INCLUDE ${Boost_INCLUDE_DIR} CACHE PATH "Include directory for BOOST")
set(LIBHILBERT_ROOT CACHE PATH "LibHilbert root path")
@@ -15,11 +15,15 @@ set(ENABLE_GPU CACHE BOOL "Disable the GPU code independently that a cuda compil
set(TEST_PERFORMANCE CACHE BOOL "Enable test performance")
set(ALPAKA_ROOT CACHE PATH "Alpaka root path")
set(CUDA_ON_CPU CACHE BOOL "Make Cuda work on heap")
set(HIP_ENABLE CACHE BOOL "Enable HIP compiler")
set(AMD_ARCH_COMPILE "gfx900" CACHE STRING "AMD gpu architecture used to compile kernels")
if (ENABLE_GPU)
set(CUDA_ON_CPU OFF)
enable_language(CUDA)
find_package(CUDA)
set(CUDA_ON_CPU OFF)
if (NOT HIP_ENABLE)
enable_language(CUDA)
find_package(CUDA)
endif()
endif()
set (CMAKE_CXX_STANDARD 14)
@@ -31,7 +35,7 @@ message("Searching Vc in ${Vc_DIR}")
find_package(Boost 1.72.0 REQUIRED COMPONENTS unit_test_framework iostreams program_options system filesystem OPTIONAL_COMPONENTS fiber context)
find_package(LibHilbert REQUIRED)
find_package(Vc REQUIRED)
find_package(HIP)
###### CONFIG.h FILE ######
@@ -47,6 +51,11 @@ if(TEST_PERFORMANCE)
set(DEFINE_PERFORMANCE_TEST "#define PERFORMANCE_TEST")
endif()
if(HIP_FOUND)
set(DEFINE_HIP_GPU "#define HIP_GPU")
set(DEFINE_CUDIFY_USE_HIP "#define CUDIFY_USE_HIP")
endif()
if (Boost_FOUND)
set(DEFINE_HAVE_BOOST "#define HAVE_BOOST")
set(DEFINE_HAVE_BOOST_IOSTREAMS "#define HAVE_BOOST_IOSTREAMS")
@@ -73,6 +82,10 @@ if(CUDA_ON_CPU)
set(DEFINE_CUDA_GPU "#define CUDA_GPU")
endif()
if(HIP_FOUND)
set(DEFINE_CUDA_GPU "#define CUDA_GPU")
endif()
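
When HIP is detected, config.h gains the HIP-specific macros and, per the hunk above, also CUDA_GPU, so code paths that are already guarded for CUDA keep compiling under HIP. A minimal sketch of how a source file might consume these macros (the guard logic is illustrative, not taken from the repository):

    // Illustrative consumer of the macros that config.h defines above
    // (HIP_GPU, CUDIFY_USE_HIP, CUDA_GPU); not code from the repository.
    #include <cstdio>

    static const char * gpu_backend()
    {
    #if defined(HIP_GPU) && defined(CUDIFY_USE_HIP)
        return "HIP via the cudify layer";
    #elif defined(CUDA_GPU)
        return "CUDA";
    #else
        return "none";
    #endif
    }

    int main() { std::printf("backend: %s\n", gpu_backend()); }
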
if(LIBHILBERT_FOUND)
set(DEFINE_HAVE_LIBHILBERT "#define HAVE_LIBHILBERT 1")
else()
......
@@ -7,7 +7,6 @@ hostname=$(hostname)
type_compile=$3
branch=$4
echo "Build on: $hostname with $type_compile branch: $branch"
if [ x"$hostname" == x"cifarm-centos-node.mpi-cbg.de" ]; then
......
@@ -122,6 +122,7 @@ with_petsc
with_eigen
with_vcdevel
enable_gpu
enable_hip
enable_asan
'
@@ -258,6 +259,9 @@ do
fi
conf_options="$conf_options -DENABLE_GPU=ON"
;;
hip)
conf_options="$conf_options -DHIP_ENABLE=ON"
;;
asan)
conf_options="$conf_options -DENABLE_ASAN=ON"
;;
......
@@ -21,7 +21,7 @@ if (TEST_PERFORMANCE)
Vector/performance/vector_performance_test.cu)
endif ()
if (CUDA_FOUND OR CUDA_ON_CPU)
if (CUDA_FOUND OR CUDA_ON_CPU OR HIP_FOUND)
set(CUDA_SOURCES ${CUDA_SOURCES}
Vector/map_vector_sparse_unit_tests.cu
Vector/vector_gpu_unit_tests.cu
@@ -45,7 +45,8 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
endif()
if (CUDA_ON_CPU)
add_definitions(-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000)
add_definitions(-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 )
set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX)
set_source_files_properties(isolation.cu PROPERTIES LANGUAGE CXX)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
@@ -53,18 +54,47 @@ if (CUDA_ON_CPU)
endif()
endif()
add_executable(mem_map ${CUDA_SOURCES}
main.cpp
data_type/aggregate_unit_tests.cpp
util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp
memory_ly/memory_conf_unit_tests.cpp
Space/tests/SpaceBox_unit_tests.cpp
Space/Shape/Sphere_unit_test.cpp
if ( HIP_ENABLE AND HIP_FOUND )
list(APPEND HIP_HIPCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND HIP_HIPCC_FLAGS -O0)
endif()
list(APPEND HIP_HIPCC_FLAGS -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0)
set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE CXX)
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
hip_add_executable(mem_map ${CUDA_SOURCES}
main.cpp
data_type/aggregate_unit_tests.cpp
util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp
memory_ly/memory_conf_unit_tests.cpp
Space/tests/SpaceBox_unit_tests.cpp
Space/Shape/Sphere_unit_test.cpp
SparseGrid/SparseGrid_unit_tests.cpp
SparseGrid/SparseGrid_chunk_copy_unit_tests.cpp
Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp
Grid/Geometry/tests/grid_smb_tests.cpp)
else()
add_executable(mem_map ${CUDA_SOURCES}
main.cpp
data_type/aggregate_unit_tests.cpp
util/multi_array_openfpm/multi_array_ref_openfpm_unit_test.cpp
memory_ly/memory_conf_unit_tests.cpp
Space/tests/SpaceBox_unit_tests.cpp
Space/Shape/Sphere_unit_test.cpp
SparseGrid/SparseGrid_unit_tests.cpp
SparseGrid/SparseGrid_chunk_copy_unit_tests.cpp
Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp
Grid/copy_grid_unit_test.cpp NN/Mem_type/Mem_type_unit_tests.cpp
Grid/Geometry/tests/grid_smb_tests.cpp)
endif()
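
The -D__NVCC__/-D__CUDACC__*/-DCUDART_VERSION flags handed to hipcc above impersonate the macros nvcc normally predefines, so sources written against the CUDA compiler identity build unchanged under HIP. A small host-compilable sketch of the kind of guards this keeps alive (illustrative only):

    // Illustrative only: prints which compiler-identity macros are visible.
    // Under hipcc with the flags above, the nvcc-style macros appear even
    // though the real compiler is the HIP/clang toolchain.
    #include <cstdio>

    int main()
    {
    #if defined(__NVCC__) && defined(CUDART_VERSION)
        std::printf("__NVCC__ defined, CUDART_VERSION=%d\n", (int)CUDART_VERSION);
    #endif
    #ifdef __HIP__
        std::printf("__HIP__ defined\n");
    #endif
    #if !defined(__NVCC__) && !defined(__HIP__)
        std::printf("plain host compiler\n");
    #endif
        return 0;
    }
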
set_property(TARGET mem_map PROPERTY CUDA_ARCHITECTURES 60 75)
if (CUDA_FOUND)
@@ -405,12 +435,13 @@ install(FILES util/multi_array_openfpm/array_openfpm.hpp
COMPONENT OpenFPM)
install(FILES util/cuda/scan_cuda.cuh
util/cuda/ofp_context.hxx
install(FILES util/cuda/ofp_context.hxx
util/cuda/kernels.cuh
util/cuda/scan_ofp.cuh
util/cuda/sort_ofp.cuh
util/cuda/reduce_ofp.cuh
util/cuda/segreduce_ofp.cuh
util/cuda/merge_ofp.cuh
DESTINATION openfpm_data/include/util/cuda
COMPONENT OpenFPM)
......
@@ -32,6 +32,18 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func)
auto gcf = c3.getGPUIterator(k1,k2);
#ifdef __HIP__
BOOST_REQUIRE_EQUAL(gcf.thr.x,8ul);
BOOST_REQUIRE_EQUAL(gcf.thr.y,8ul);
BOOST_REQUIRE_EQUAL(gcf.thr.z,4ul);
BOOST_REQUIRE_EQUAL(gcf.wthr.x,8ul);
BOOST_REQUIRE_EQUAL(gcf.wthr.y,8ul);
BOOST_REQUIRE_EQUAL(gcf.wthr.z,16ul);
#else
BOOST_REQUIRE_EQUAL(gcf.thr.x,16ul);
BOOST_REQUIRE_EQUAL(gcf.thr.y,8ul);
BOOST_REQUIRE_EQUAL(gcf.thr.z,8ul);
@@ -40,12 +52,26 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func)
BOOST_REQUIRE_EQUAL(gcf.wthr.y,8ul);
BOOST_REQUIRE_EQUAL(gcf.wthr.z,8ul);
#endif
grid_key_dx<3> k3({50,50,50});
grid_key_dx<3> k4({62,62,62});
grid_key_dx<3> k5({60,61,62});
auto gcf2 = c3.getGPUIterator(k3,k4);
#ifdef __HIP__
BOOST_REQUIRE_EQUAL(gcf2.thr.x,8ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.z,4ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.x,2ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.z,4ul);
#else
BOOST_REQUIRE_EQUAL(gcf2.thr.x,13ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.y,8ul);
BOOST_REQUIRE_EQUAL(gcf2.thr.z,8ul);
@@ -54,6 +80,8 @@ BOOST_AUTO_TEST_CASE (gpu_computation_func)
BOOST_REQUIRE_EQUAL(gcf2.wthr.y,2ul);
BOOST_REQUIRE_EQUAL(gcf2.wthr.z,2ul);
#endif
gcf2 = c3.getGPUIterator(k3,k4,511);
BOOST_REQUIRE_EQUAL(gcf2.thr.x,8ul);
......
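
The HIP expectations above correspond to 256-thread blocks (8·8·4) instead of the 1024-thread blocks (16·8·8) used on the CUDA path, while blocks × threads must still cover the requested range; this matches the smaller default work-group size introduced further down. A quick host-side check of that arithmetic, assuming the k1..k2 range spans 64³ points (the keys themselves are not visible in the hunk):

    // Sanity check of the expectations above. Assumes k1..k2 covers 64^3 points,
    // which is what the visible thr/wthr products imply; the keys are truncated
    // out of the hunk, so that extent is an assumption.
    #include <cassert>

    int main()
    {
        // HIP branch: thr = (8,8,4) -> 256 threads/block, wthr = (8,8,16) -> 1024 blocks
        assert(8*8*4 == 256);
        assert(8*8*4 * 8*8*16 == 64*64*64);

        // k3..k4 = 50..62 -> 13 points per axis; HIP: thr = (8,8,4), wthr = (2,2,4):
        // the decomposition must cover at least 13 points in every direction
        assert(8*2 >= 13 && 8*2 >= 13 && 4*4 >= 13);
        return 0;
    }
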
@@ -421,7 +421,7 @@ public:
* \param stop end point
*
*/
struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = 1024) const
struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim> & key1, grid_key_dx<dim> & key2, size_t n_thr = default_kernel_wg_threads_) const
{
return getGPUIterator_impl<dim>(g1,key1,key2,n_thr);
}
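
default_kernel_wg_threads_ replaces the hard-coded 1024 in every getGPUIterator overload touched in this commit. Its definition is not part of this diff; a plausible shape, consistent with the updated test expectations (256-thread blocks under HIP, 1024 otherwise), might look like the following, which is an inference rather than the actual definition:

    // Hypothetical sketch only: default_kernel_wg_threads_ is defined elsewhere
    // in the tree. The values below are inferred from the updated unit tests,
    // not taken from this diff.
    #if defined(__HIP__)
    static const size_t default_kernel_wg_threads_ = 256;
    #else
    static const size_t default_kernel_wg_threads_ = 1024;
    #endif
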
......
@@ -177,27 +177,6 @@ struct grid_p<1,ids_type>
#endif
template<unsigned int dim>
bool has_work_gpu(ite_gpu<dim> & ite)
{
size_t tot_work = 1;
if (dim == 1)
{tot_work *= ite.wthr.x * ite.thr.x;}
else if(dim == 2)
{
tot_work *= ite.wthr.x * ite.thr.x;
tot_work *= ite.wthr.y * ite.thr.y;
}
else
{
tot_work *= ite.wthr.x * ite.thr.x;
tot_work *= ite.wthr.y * ite.thr.y;
tot_work *= ite.wthr.z * ite.thr.z;
}
return tot_work != 0;
}
template<unsigned int dim>
void move_work_to_blocks(ite_gpu<dim> & ite)
@@ -459,6 +438,9 @@ private:
{
#if defined(CUDA_GPU) && defined(__NVCC__)
// Compile-time check that it makes sense to call a GPU kernel for the copy.
grid_key_dx<dim> start;
grid_key_dx<dim> stop;
@@ -730,7 +712,7 @@ public:
* \param stop end point
*
*/
struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim,long int> & key1, grid_key_dx<dim,long int> & key2, size_t n_thr = 1024) const
struct ite_gpu<dim> getGPUIterator(grid_key_dx<dim,long int> & key1, grid_key_dx<dim,long int> & key2, size_t n_thr = default_kernel_wg_threads_) const
{
return getGPUIterator_impl<dim>(g1,key1,key2,n_thr);
}
......
@@ -121,11 +121,33 @@ struct ite_gpu
#endif
};
template<unsigned int dim>
bool has_work_gpu(ite_gpu<dim> & ite)
{
size_t tot_work = 1;
if (dim == 1)
{tot_work *= ite.wthr.x * ite.thr.x;}
else if(dim == 2)
{
tot_work *= ite.wthr.x * ite.thr.x;
tot_work *= ite.wthr.y * ite.thr.y;
}
else
{
tot_work *= ite.wthr.x * ite.thr.x;
tot_work *= ite.wthr.y * ite.thr.y;
tot_work *= ite.wthr.z * ite.thr.z;
}
return tot_work != 0;
}
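
has_work_gpu moves into this header (it is removed from the grid-side header in the hunk further up) so that call sites can test an iterator before launching. A minimal usage sketch under that assumption, with a hypothetical launch wrapper:

    // Minimal usage sketch (the kernel/launch macro is hypothetical and elided):
    // skip the kernel launch entirely when the iterator describes zero work.
    template<typename grid_type, typename key_type>
    void launch_if_work(grid_type & g, const key_type & start, const key_type & stop)
    {
        auto ite = g.getGPUIterator(start, stop);   // now defaults to default_kernel_wg_threads_
        if (has_work_gpu(ite) == false)
        {return;}                                    // empty range: avoid a useless launch
        // CUDA_LAUNCH(my_kernel, ite, ...);         // project launch macro, not shown here
    }
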
//! Declaration grid_sm
template<unsigned int N, typename T> class grid_sm;
template<unsigned int dim, typename T2, typename T>
ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, size_t n_thr = 1024);
ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, size_t n_thr = default_kernel_wg_threads_);
//! Declaration print_warning_on_adjustment
template <unsigned int dim, typename linearizer> class print_warning_on_adjustment;
@@ -733,7 +755,7 @@ public:
return grid_key_dx_iterator_sub<N>(*this,start,stop);
}
#ifdef CUDA_GPU
#if defined(CUDA_GPU)
/*! \brief Get an iterator for the GPU
*
@@ -742,7 +764,7 @@ public:
*
*/
template<typename T2>
struct ite_gpu<N> getGPUIterator(const grid_key_dx<N,T2> & key1, const grid_key_dx<N,T2> & key2, size_t n_thr = 1024) const
struct ite_gpu<N> getGPUIterator(const grid_key_dx<N,T2> & key1, const grid_key_dx<N,T2> & key2, size_t n_thr = default_kernel_wg_threads_) const
{
return getGPUIterator_impl<N>(*this,key1,key2,n_thr);
}
@@ -753,7 +775,7 @@ public:
* \param stop end point
*
*/
struct ite_gpu<N> getGPUIterator(size_t n_thr = 1024) const
struct ite_gpu<N> getGPUIterator(size_t n_thr = default_kernel_wg_threads_) const
{
grid_key_dx<N> k1;
grid_key_dx<N> k2;
......
@@ -185,12 +185,28 @@ public:
rem_copy_opt opt = rem_copy_opt::NONE_OPT)
{}
#if defined(__HIP__)
/*! \brief It copy a grid
*
* \param g grid to copy
*
*/
__device__ grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g)
{
printf("Error grid_base operator= is not defined in device code\n");
return *this;
}
#endif
/*! \brief It copy a grid
*
* \param g grid to copy
*
*/
grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g)
__host__ grid_base<dim,T,S> & operator=(const grid_base<dim,T,S> & g)
{
(static_cast<grid_base_impl<dim,T,S, memory_traits_lin> *>(this))->swap(g.duplicate());
......
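
The __device__ overload added above deliberately does not copy anything: it only reports an error, so templated code that mentions grid_base assignment can still be instantiated for device compilation, while the real assignment stays host-only (it goes through duplicate()/swap, which allocate). The same pattern reduced to a toy type, with hypothetical names:

    // Toy illustration of the host/device overload split above (names hypothetical).
    // HIP/clang allows overloading on execution space, so the device stub and the
    // real host assignment can coexist with the same signature.
    #include <cstdio>

    #if defined(__HIP__)
    #define TOY_HOST __host__
    #else
    #define TOY_HOST
    #endif

    struct toy
    {
        int v = 0;

    #if defined(__HIP__)
        __device__ toy & operator=(const toy &)
        {
            printf("toy::operator= is not usable in device code\n");
            return *this;
        }
    #endif

        TOY_HOST toy & operator=(const toy & t)   // the real, host-side assignment
        {
            v = t.v;
            return *this;
        }
    };

    int main() { toy a, b; b.v = 3; a = b; std::printf("%d\n", a.v); return 0; }
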
@@ -16,7 +16,6 @@
#include "CellList.hpp"
#include "util/boost/boost_array_openfpm.hpp"
#include "Point_test.hpp"
#include "util/cuda/scan_cuda.cuh"
#include "util/cuda_util.hpp"
BOOST_AUTO_TEST_SUITE( CellList_gpu_test )
@@ -1363,6 +1362,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di
mgpu::ofp_context_t context(mgpu::gpu_context_opt::no_print_props);
cl2.construct(pl,pl_out,pl_prp,pl_prp_out,context,g_m);
auto & s_t_ns = cl2.getSortToNonSort();
pl.template hostToDevice<0>();
@@ -1371,6 +1371,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di
// Domain particles
auto & gdsi = cl2.getDomainSortIds();
gdsi.template deviceToHost<0>();
s_t_ns.template deviceToHost<0>();
@@ -1497,6 +1498,7 @@ void Test_cell_gpu_force(SpaceBox<dim,T> & box, size_t npart, const size_t (& di
}
BOOST_REQUIRE_EQUAL(check,true);
}
}
......
@@ -395,7 +395,7 @@ template<unsigned int dim, typename T, typename CellS> void Test_NN_iterator_rad
for (size_t i = 0 ; i < dim ; i++)
{
vrp.template get<0>(j)[i] = ((float)rand() / RAND_MAX)*(box.getHigh(i) - box.getLow(i)) + box.getLow(i);
vrp.template get<0>(j)[i] = ((float)rand() / (float)RAND_MAX)*(box.getHigh(i) - box.getLow(i)) + box.getLow(i);
}
}
......
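
The added cast on RAND_MAX makes the floating-point division explicit. The original line was already correct because rand() is cast first, but the pattern is fragile: with no cast on either operand the division is integer division and almost always yields 0. A small illustration of the pitfall the explicit form avoids:

    // Why the explicit casts matter in this pattern: without any cast the
    // division is integer division and the "uniform" value collapses to 0.
    #include <cstdio>
    #include <cstdlib>

    int main()
    {
        std::srand(42);
        int r = std::rand();
        double wrong = r / RAND_MAX;                   // integer division: 0 (or 1 if r == RAND_MAX)
        double right = (double)r / (double)RAND_MAX;   // uniform value in [0,1]
        std::printf("wrong=%f right=%f\n", wrong, right);
        return 0;
    }
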
@@ -356,15 +356,12 @@ class CellList_gpu : public CellDecomposer_sm<dim,T,transform>
{
#ifdef __NVCC__
CUDA_SAFE()
// Then we construct the ids
auto ite_gpu = pl.getGPUIteratorTo(stop-start-1);
cl_n.resize(this->gr_cell.size()+1);
cl_n.template fill<0>(0);
// CUDA_SAFE(cudaMemset(cl_n.template getDeviceBuffer<0>(),0,cl_n.size()*sizeof(cnt_type)));
part_ids.resize(stop - start);
......
@@ -335,7 +335,7 @@ public:
* \return this
*
*/
inline Point_test<T> operator= (const Point_test<T> & p)
__device__ __host__ inline Point_test<T> & operator= (const Point_test<T> & p)
{
boost::fusion::at_c<0>(data) = boost::fusion::at_c<0>(p.data);
boost::fusion::at_c<1>(data) = boost::fusion::at_c<1>(p.data);
......
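
Two things change in the Point_test assignment: it gains __device__ __host__ so it can be called from kernels, and it now returns a reference rather than a copy, the conventional copy-assignment signature, which avoids constructing a temporary on every assignment. A stripped-down analogue of the new signature (toy type, not the real Point_test):

    // Stripped-down analogue of the new Point_test signature (toy type).
    // The qualifier macros are blanked when compiling with a plain host compiler
    // so the snippet builds anywhere.
    #if !defined(__CUDACC__) && !defined(__HIP__)
    #define __device__
    #define __host__
    #endif

    struct pt
    {
        float x = 0.f;
        float s = 0.f;

        __device__ __host__ pt & operator=(const pt & p)   // reference return, callable on device
        {
            x = p.x;
            s = p.s;
            return *this;
        }
    };

    int main()
    {
        pt a, b;
        b.x = 1.f;
        a = b;
        return (a.x == 1.f) ? 0 : 1;   // 0 on success
    }
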
@@ -211,7 +211,7 @@ class SpaceBox : public Box<dim,T>
Point<dim,T> p;
for (size_t i = 0 ; i < dim ; i++)
p.get(i) = ((T)rand())/RAND_MAX * (this->getHigh(i) - this->getLow(i)) + this->getLow(i);
p.get(i) = ((T)rand())/(T)RAND_MAX * (this->getHigh(i) - this->getLow(i)) + this->getLow(i);
return p;
}
......
@@ -8,9 +8,11 @@
#ifndef SPARSEGRID_CHUNK_COPY_HPP_
#define SPARSEGRID_CHUNK_COPY_HPP_
#if !defined(__NVCC__) || defined(CUDA_ON_CPU)
#if !defined(__NVCC__) || defined(CUDA_ON_CPU) || defined(__HIP__)
// Nvcc does not like VC ... for some reason
#include <Vc/Vc>
#endif
#include "util/mathutil.hpp"
......
@@ -110,7 +110,7 @@ struct conv_impl
}
};
#if !defined(__NVCC__) || defined(CUDA_ON_CPU)
#if !defined(__NVCC__) || defined(CUDA_ON_CPU) || defined(__HIP__)
template<unsigned int dir,int p, unsigned int prop_src1,typename chunk_type, typename vect_type, typename ids_type>
@@ -199,14 +199,16 @@ void load_crs_v(vect_type & cs1, chunk_type & chunk, ids_type & ids)
}
}
template<typename prop_type>
struct cross_stencil_v
{
Vc::double_v xm;
Vc::double_v xp;
Vc::double_v ym;
Vc::double_v yp;
Vc::double_v zm;
Vc::double_v zp;
Vc::Vector<prop_type> xm;
Vc::Vector<prop_type> xp;
Vc::Vector<prop_type> ym;
Vc::Vector<prop_type> yp;
Vc::Vector<prop_type> zm;
Vc::Vector<prop_type> zp;
};
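
cross_stencil_v is now templated on the property type, so the same six stencil buffers work for float as well as double lanes, and the mask loads further down select an integer whose width matches Vc::Vector<prop_type>::Size through data_il. data_il itself is not shown in this diff; judging from the casts in the old Size==2 and Size==4 branches, its shape is presumably along these lines (an assumption, included only to make the unified branch readable):

    // Assumed shape of data_il<N> (not part of this diff): an integer type wide
    // enough to hold N one-byte mask entries, matching the short/int casts that
    // the old per-size branches used.
    template<unsigned int N> struct data_il;
    template<> struct data_il<2> { typedef short int type; type i; };   // 2 lanes -> 2 mask bytes
    template<> struct data_il<4> { typedef int type;       type i; };   // 4 lanes -> 4 mask bytes
    template<> struct data_il<8> { typedef long int type;  type i; };   // 8 lanes -> 8 mask bytes
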
template<>
@@ -421,7 +423,7 @@ struct conv_impl<3>
for (int k = 0 ; k < sx::value ; k += Vc::Vector<prop_type>::Size)
{
// we do it only if the point exists
if (*(int *)&mask.mask[s2] == 0) {s2 += Vc::Vector<prop_type>::Size; continue;}
if (*(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2] == 0) {s2 += Vc::Vector<prop_type>::Size; continue;}
data_il<Vc::Vector<prop_type>::Size> mxm;
data_il<Vc::Vector<prop_type>::Size> mxp;
@@ -430,7 +432,7 @@ struct conv_impl<3>
data_il<Vc::Vector<prop_type>::Size> mzm;
data_il<Vc::Vector<prop_type>::Size> mzp;
cross_stencil_v cs;
cross_stencil_v<prop_type> cs;
Vc::Vector<prop_type> cmd(&chunk.template get<prop_src>()[s2]);
@@ -451,37 +453,21 @@ struct conv_impl<3>
long int sumzp = (v == sz::value-1)?offset_jump[5] - (sz::value - 1)*sx::value*sy::value:sx::value*sy::value;
sumzp += s2;
if (Vc::Vector<prop_type>::Size == 2)
{
mxm.i = *(short int *)&mask.mask[s2];
mxm.i = mxm.i << 8;
mxm.i |= (short int)mask.mask[sumxm];
mxp.i = *(short int *)&mask.mask[s2];
mxp.i = mxp.i >> 8;
mxp.i |= ((short int)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8;
mym.i = *(short int *)&mask.mask[sumym];
myp.i = *(short int *)&mask.mask[sumyp];
mzm.i = *(short int *)&mask.mask[sumzm];
mzp.i = *(short int *)&mask.mask[sumzp];
}
else if (Vc::Vector<prop_type>::Size == 4)
if (Vc::Vector<prop_type>::Size == 2 || Vc::Vector<prop_type>::Size == 4 || Vc::Vector<prop_type>::Size == 8)
{
mxm.i = *(int *)&mask.mask[s2];
mxm.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2];
mxm.i = mxm.i << 8;
mxm.i |= (int)mask.mask[sumxm];
mxm.i |= (typename data_il<Vc::Vector<prop_type>::Size>::type)mask.mask[sumxm];
mxp.i = *(int *)&mask.mask[s2];
mxp.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2];
mxp.i = mxp.i >> 8;
mxp.i |= ((int)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8;
mxp.i |= ((typename data_il<Vc::Vector<prop_type>::Size>::type)mask.mask[sumxp]) << (Vc::Vector<prop_type>::Size - 1)*8;
mym.i = *(int *)&mask.mask[sumym];
myp.i = *(int *)&mask.mask[sumyp];
mym.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumym];
myp.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumyp];
mzm.i = *(int *)&mask.mask[sumzm];
mzp.i = *(int *)&mask.mask[sumzp];
mzm.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumzm];
mzp.i = *(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[sumzp];
}
else
{
@@ -737,7 +723,7 @@ struct conv_impl<3>
for (int k = 0 ; k < sx::value ; k += Vc::Vector<prop_type>::Size)
{
// we do it only if the point exists
if (*(int *)&mask.mask[s2] == 0) {s2 += Vc::Vector<prop_type>::Size; continue;}
if (*(typename data_il<Vc::Vector<prop_type>::Size>::type *)&mask.mask[s2] == 0) {s2 += Vc::Vector<prop_type>::Size; continue;}
data_il<4> mxm;
data_il<4> mxp;
@@ -746,8 +732,8 @@ struct conv_impl<3>
data_il<4> mzm;
data_il<4> mzp;
cross_stencil_v cs1;
cross_stencil_v cs2;
cross_stencil_v<prop_type> cs1;
cross_stencil_v<prop_type> cs2;
Vc::Vector<prop_type> cmd1(&chunk.template get<prop_src1>()[s2]);
Vc::Vector<prop_type> cmd2(&chunk.template get<prop_src2>()[s2]);
@@ -769,37 +755,21 @@ struct conv_impl<3>
long int sumzp = (v == sz::value-1)?offset_jump[5] - (sz::value - 1)*sx::value*sy::value:sx::value*sy::value;
sumzp += s2;
if (Vc::Vector<prop_type>::Size == 2)