Commit 34d81e99 authored by incardon
Browse files

Adding CUDA_ON_CPU

parent 9439b214
Pipeline #2513 failed with stages
in 6 seconds
......@@ -6,7 +6,7 @@ set(BOOST_INCLUDE ${Boost_INCLUDE_DIR} CACHE PATH "Include directory for BOOST")
set (CMAKE_CXX_STANDARD 14)
set (CMAKE_CUDA_STANDARD 14)
find_package(Boost 1.68.0 REQUIRED COMPONENTS unit_test_framework iostreams program_options fiber)
find_package(Boost 1.68.0 REQUIRED COMPONENTS unit_test_framework iostreams program_options OPTIONAL_COMPONENTS fiber context)
set(CUDA_ON_CPU CACHE BOOL "Make Cuda work on heap")
......@@ -34,6 +34,10 @@ if (BOOST_FOUND)
set(DEFINE_HAVE_BOOST_UNIT_TEST_FRAMEWORK "#define HAVE_BOOST_UNIT_TEST_FRAMEWORK")
endif()
if (ALPAKA_ROOT)
set(DEFINE_HAVE_ALPAKA "#define HAVE_ALPAKA")
endif()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/src/config/config_cmake.h.in ${CMAKE_CURRENT_SOURCE_DIR}/src/config/config.h)
add_subdirectory (src)
......
#!/bin/sh
#!/bin/bash
# configure script
#
#
# Because we moved to cmake, this script emulates the configure script from autotools
conf_options=
ld_lib_pathopt=
# Avoid depending upon Character Ranges.
as_cr_letters='abcdefghijklmnopqrstuvwxyz'
......@@ -99,12 +100,15 @@ enable_debug
with_metis
with_hdf5
with_libhilbert
enable_cuda_on_cpu
enable_scan_coverty
enable_test_performance
enable_test_coverage
with_parmetis
enable_se_class1
enable_se_class2
enable_se_class3
with_alpaka
with_action_on_error
with_boost
with_boost_libdir
......@@ -116,7 +120,9 @@ with_lapack
with_suitesparse
with_petsc
with_eigen
with_vcdevel
enable_gpu
enable_asan
'
rm -rf build
......@@ -124,6 +130,8 @@ if [ ! -d "build" ]; then
mkdir build
fi
echo "/usr/local" > install_dir
##### Go over all options
for ac_option
do
......@@ -219,12 +227,24 @@ do
se_class1)
conf_options="$conf_options -DSE_CLASS1=ON"
;;
se_class2)
conf_options="$conf_options -DSE_CLASS2=ON"
;;
se_class3)
conf_options="$conf_options -DSE_CLASS3=ON"
;;
test_coverage)
conf_options="$conf_options -DTEST_COVERAGE=ON"
;;
scan_coverty)
conf_options="$conf_options -DSCAN_COVERTY=ON"
;;
cuda_on_cpu)
conf_options="$conf_options -DCUDA_ON_CPU=ON"
;;
test_performance)
conf_options="$conf_options -DTEST_PERFORMANCE=ON"
;;
gpu)
if [ x"$CXX" == x"" ]; then
conf_options="$conf_options"
......@@ -233,6 +253,9 @@ do
fi
conf_options="$conf_options -DENABLE_GPU=ON"
;;
asan)
conf_options="$conf_options -DENABLE_ASAN=ON"
;;
*) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig"
ac_unrecognized_sep=', '
;;
......@@ -458,16 +481,26 @@ do
;;
suitesparse)
conf_options="$conf_options -DSUITESPARSE_ROOT=$ac_optarg"
ld_lib_pathopt=$ac_optarg/lib
;;
eigen)
conf_options="$conf_options -DEIGEN3_ROOT=$ac_optarg"
;;
boost)
conf_options="$conf_options -DBOOST_ROOT=$ac_optarg"
conf_options="$conf_options -DBOOST_ROOT=$ac_optarg -DBoost_NO_BOOST_CMAKE=ON"
;;
action_on_error)
conf_options="$conf_options -DACTION_ON_ERROR=$ac_optarg"
;;
mpivendor)
conf_options="$conf_options -DMPI_VENDOR=$ac_optarg"
;;
vcdevel)
conf_options="$conf_options -DVc_ROOT=$ac_optarg"
;;
alpaka)
conf_options="$conf_options -DALPAKA_ROOT=$ac_optarg"
;;
*) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig"
ac_unrecognized_sep=', ';;
esac
......@@ -541,7 +574,7 @@ cd build
## remove everything
echo "Calling cmake ../. $conf_options"
rm ../error_code
cmake ../. $conf_options
DYLD_LIBRARY_PATH=$ld_lib_pathopt cmake ../. $conf_options
if [ $? != 0 ]; then
# OK, something went wrong; the install script analyzes the return code to potentially fix the problem automatically
# Read the error code and exit with that
......
......@@ -12,10 +12,10 @@ if (CUDA_ON_CPU)
endif()
endif()
add_executable(mem main.cpp memory/HeapMemory.cpp util/cudify_vars.cpp util/cudify_unit_test.cpp ${CUDA_SOURCES})
add_executable(mem main.cpp memory/HeapMemory.cpp util/cudify/cudify_vars.cpp util/cudify/cudify_unit_test.cpp ${CUDA_SOURCES})
add_library(ofpmmemory STATIC memory/HeapMemory.cpp util/cudify_vars.cpp memory/PtrMemory.cpp ${CUDA_SOURCES})
add_library(ofpmmemory_dl SHARED memory/HeapMemory.cpp util/cudify_vars.cpp memory/PtrMemory.cpp ${CUDA_SOURCES})
add_library(ofpmmemory STATIC memory/HeapMemory.cpp util/cudify/cudify_vars.cpp memory/PtrMemory.cpp ${CUDA_SOURCES})
add_library(ofpmmemory_dl SHARED memory/HeapMemory.cpp util/cudify/cudify_vars.cpp memory/PtrMemory.cpp ${CUDA_SOURCES})
if ( CMAKE_COMPILER_IS_GNUCC )
target_compile_options(mem PRIVATE "-Wno-deprecated-declarations")
......@@ -37,7 +37,9 @@ target_include_directories (mem PUBLIC ${CUDA_INCLUDE_DIRS})
target_include_directories (mem PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_include_directories (mem PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/config)
target_include_directories (mem PUBLIC ${Boost_INCLUDE_DIRS})
target_include_directories (mem PUBLIC ${ALPAKA_ROOT}/include)
if (ALPAKA_ROOT)
target_include_directories (mem PUBLIC ${ALPAKA_ROOT}/include)
endif()
if (CUDA_FOUND)
target_include_directories (mem PUBLIC ${CUDA_INCLUDE_DIRS})
endif()
......@@ -79,26 +81,3 @@ install(FILES ptr_info.hpp
DESTINATION openfpm_devices/include)
# Request that particles be built with -std=c++11
# As this is a public compile feature anything that links to particles
# will also build with -std=c++11
target_compile_features(mem PUBLIC cxx_std_11)
#if(BUILD_TESTING)
# add_executable(particle_test test.cu)
# set_target_properties(particle_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# target_link_libraries(particle_test PRIVATE particles)
# add_test(NAME particles_10k COMMAND particle_test 10000 )
# add_test(NAME particles_256k COMMAND particle_test 256000 )
# if(APPLE)
# We need to add the default path to the driver (libcuda.dylib) as an rpath,
# so that the static cuda runtime can find it at runtime.
# set_property(TARGET particle_test PROPERTY BUILD_RPATH ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
# endif()
#endif()
......@@ -107,12 +107,28 @@ ${DEFINE_HAVE_UNISTD_H}
/* Test TinyObjLoader */
${DEFINE_HAVE_TINYOBJLOADER}
/* ACTION to take in case of error */
${DEFINE_ACTION_ON_ERROR}
/* Define to the sub-directory where libtool stores uninstalled libraries. */
#define LT_OBJDIR ".libs/"
/* NVCC compiling */
${DEFINE_NVCC} /**/
/* Define if we have Alpaka */
${DEFINE_HAVE_ALPAKA}
/* Additional alpaka definitions */
${ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE_DEF}
${ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLE_DEF}
${ALPAKA_ACC_CPU_B_SEQ_T_FIBERS_ENABLE_DEF}
${ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLE_DEF}
${ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE_DEF}
${ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLE_DEF}
${ALPAKA_ACC_CPU_BT_OMP4_ENABLE_DEF}
/* Name of package */
#define PACKAGE "openfpm_pdata"
......@@ -140,6 +156,9 @@ ${DEFINE_PERFORMANCE_TEST}
/* Security enhancement class 1 */
${DEFINE_SE_CLASS1}
/* Security enhancement class 2 */
${DEFINE_SE_CLASS2}
/* Security enhancement class 3 */
${DEFINE_SE_CLASS3}
......@@ -157,3 +176,5 @@ ${DEFINE_TEST_COVERAGE_MODE}
/* Version number of package */
#define VERSION "1.0.0"
#define OPENFPM_PDATA
......@@ -42,7 +42,8 @@ bool CudaMemory::allocate(size_t sz)
#if defined(CUDA_GPU) && !defined(CUDA_ON_CPU)
CUDA_SAFE_CALL(cudaMalloc(&dm,sz));
#else
dm = new unsigned char[sz];
if (sz != 0)
{dm = new unsigned char[sz];}
#endif
}
else
......
......@@ -27,11 +27,7 @@
#ifndef CUDA_MEMORY_CUH_
#define CUDA_MEMORY_CUH_
#if __CUDACC_VER_MAJOR__ < 9
#define EXCEPT_MC
#else
#define EXCEPT_MC noexcept
#endif
#include "config.h"
#include "memory.hpp"
......
......@@ -8,6 +8,7 @@
#ifndef SE_CLASS1_CUDA_HPP_
#define SE_CLASS1_CUDA_HPP_
#include "util/se_util.hpp"
#include <type_traits>
#include <string>
......
......@@ -10,10 +10,6 @@
#include "cuda_kernel_error_checker.hpp"
#ifdef CUDA_ON_CPU
#include "cudify.hpp"
#endif
#if defined(CUDA_GPU) && !defined(CUDA_ON_CPU)
#if defined(SE_CLASS1) || defined(CUDA_CHECK_LAUNCH)
......@@ -114,7 +110,7 @@
#else
#include "util/cudify.hpp"
#include "util/cudify/cudify.hpp"
#endif
......
......@@ -29,27 +29,29 @@
#ifndef __host__
#define __host__
#define __device__
#define __shared__
#define __global__
#endif
#ifdef CUDA_ON_CPU
#define CUDA_SAFE(cuda_call) \
cuda_call;
#define CUDA_SAFE(cuda_call) \
cuda_call;
#define __shared__ static
#else
#define CUDA_SAFE(cuda_call) \
cuda_call; \
{\
cudaError_t e = cudaPeekAtLastError();\
if (e != cudaSuccess)\
#define CUDA_SAFE(cuda_call) \
cuda_call; \
{\
std::string error = cudaGetErrorString(e);\
std::cout << "Cuda Error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
}\
}
cudaError_t e = cudaPeekAtLastError();\
if (e != cudaSuccess)\
{\
std::string error = cudaGetErrorString(e);\
std::cout << "Cuda Error in: " << __FILE__ << ":" << __LINE__ << " " << error << std::endl;\
}\
}
#define __shared__
#endif
......
#ifndef CUDIFY_HPP_
#define CUDIFY_HPP_
#ifdef CUDIFY_USE_ALPAKA
#else
#include "cudify_sequencial.hpp"
#endif
#endif
\ No newline at end of file
#ifndef CUDIFY_HPP_
#define CUDIFY_HPP_
#ifndef CUDIFY_ALPAKA_HPP_
#define CUDIFY_ALPAKA_HPP_
/*! \brief This file wraps CUDA functions and some CUB and MGPU functions for the CPU
 *
 * This file uses ALPAKA as the underlying accelerator implementation.
 *
 * At the moment, performance makes it useless with almost any available accelerator.
 *
 */
#include "cudify_hardware.hpp"
#include "cuda_util.hpp"
#include "boost/bind.hpp"
#include <type_traits>
extern alpa_base_structs __alpa_base__;
......@@ -35,6 +44,31 @@ static void cudaDeviceSynchronize()
alpaka::wait(*__alpa_base__.queue);
}
/*! \brief CPU stand-in for CUDA's cudaMemcpyFromSymbol.
 *
 * In the CUDA_ON_CPU / alpaka build the "device symbol" is an ordinary host
 * buffer, so reading sz bytes out of it is a plain memcpy.
 *
 * \param dev_mem destination buffer
 * \param global_cuda_error_array source "symbol" storage
 * \param sz number of bytes to copy
 */
static void cudaMemcpyFromSymbol(void * dev_mem,const unsigned char * global_cuda_error_array,size_t sz)
{
	unsigned char * dst = static_cast<unsigned char *>(dev_mem);
	const unsigned char * src = global_cuda_error_array;
	memcpy(dst,src,sz);
}
/**
 * CUDA memory copy types
 *
 * Host-side re-declaration of the CUDA runtime's cudaMemcpyKind enum so that
 * code written against the CUDA API still compiles in the CUDA_ON_CPU /
 * alpaka build, where the real CUDA headers are absent. The numeric values
 * mirror the CUDA runtime's definitions.
 */
enum cudaMemcpyKind
{
cudaMemcpyHostToHost = 0, /**< Host -> Host */
cudaMemcpyHostToDevice = 1, /**< Host -> Device */
cudaMemcpyDeviceToHost = 2, /**< Device -> Host */
cudaMemcpyDeviceToDevice = 3, /**< Device -> Device */
cudaMemcpyDefault = 4 /**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */
};
extern int vct_atomic_add;
extern int vct_atomic_rem;
/*! \brief CPU stand-in for CUDA's cudaMemcpyToSymbol.
 *
 * The "device symbol" is a plain host buffer here, so copying into it at the
 * requested byte offset is an ordinary memcpy. The trailing parameter exists
 * only to mirror the CUDA signature and is ignored.
 *
 * \param global_cuda_error_array destination "symbol" storage
 * \param mem source buffer
 * \param sz number of bytes to copy
 * \param offset byte offset into the symbol
 * \param unused ignored (CUDA signature compatibility)
 */
static void cudaMemcpyToSymbol(unsigned char * global_cuda_error_array,const void * mem,size_t sz,int offset,int unused)
{
	(void)unused;
	unsigned char * dst = global_cuda_error_array + offset;
	memcpy(dst,mem,sz);
}
namespace cub
{
template<typename T, unsigned int dim>
......@@ -60,67 +94,27 @@ namespace cub
__syncthreads();
T cur = tmp[0];
tmp[0] = 0;
for (int i = 1 ; i < dim ; i++)
if (threadIdx.x == 0)
{
tmp[i] = tmp[i-1] + cur;
T prec = tmp[0];
tmp[0] = 0;
for (int i = 1 ; i < dim ; i++)
{
auto next = tmp[i-1] + prec;
prec = tmp[i];
tmp[i] = next;
}
}
}
};
}
namespace mgpu
{
template<typename input_it,
typename segments_it, typename output_it, typename op_t, typename type_t, typename context_t>
void segreduce(input_it input, int count, segments_it segments,
int num_segments, output_it output, op_t op, type_t init,
context_t& context)
{
for (int i = 0 ; i < num_segments - 1; i++)
{
output[i] = 0;
for (int j = segments[i] ; j < segments[i+1] ; j++)
{
op(output[i],input[j]);
}
}
}
// Key-value merge.
template<typename a_keys_it, typename a_vals_it,
typename b_keys_it, typename b_vals_it,
typename c_keys_it, typename c_vals_it,
typename comp_t, typename context_t>
void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
b_keys_it b_keys, b_vals_it b_vals, int b_count,
c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
{
int a_it = 0;
int b_it = 0;
int c_it = 0;
__syncthreads();
while (a_it < a_count && b_it < b_count)
{
if (comp(a_keys[a_it],b_keys[b_it]))
{
c_keys[c_it] = a_keys[a_it];
c_vals[c_it] = a_vals[a_it];
c_it++;
a_it++;
}
else
{
c_keys[c_it] = b_keys[a_it] + a_count;
c_vals[c_it] = b_vals[a_it];
c_it++;
b_it++;
}
out = tmp[threadIdx.x];
return;
}
}
};
}
template<typename T, typename T2>
static T atomicAdd(T * address, T2 val)
{
......@@ -138,6 +132,10 @@ namespace mgpu
bool operator()(type_t a, type_t b) const {
return a < b;
}
template<typename type2_t, typename type3_t>
bool operator()(type2_t a, type3_t b) const {
return a < b;
}
};
/* template<typename type_t>
struct less_equal_t : public std::binary_function<type_t, type_t, bool> {
......@@ -150,6 +148,10 @@ namespace mgpu
MGPU_HOST_DEVICE bool operator()(type_t a, type_t b) const {
return a > b;
}
template<typename type2_t, typename type3_t>
MGPU_HOST_DEVICE bool operator()(type2_t a, type3_t b) const {
return a > b;
}
};
/* template<typename type_t>
struct greater_equal_t : public std::binary_function<type_t, type_t, bool> {
......@@ -209,6 +211,90 @@ namespace mgpu
};
}
namespace mgpu
{
	/*! \brief Sequential CPU emulation of mgpu::segreduce.
	 *
	 * Reduces each segment of `input` with `op`, writing one value per
	 * segment into `output`. Segment i spans [segments[i], segments[i+1])
	 * and the last segment spans [segments[num_segments-1], count).
	 *
	 * Fixes over the previous version:
	 *  - `init` is no longer ignored: an empty segment now produces `init`
	 *    instead of reading an element belonging to the next segment (or,
	 *    for an empty trailing segment, reading input[count] out of bounds).
	 *  - num_segments <= 0 no longer touches segments[0]/output[0].
	 *
	 * \param input        iterator over the values to reduce
	 * \param count        total number of input values
	 * \param segments     iterator over segment start offsets
	 * \param num_segments number of segments (== number of outputs)
	 * \param output       iterator receiving one reduction per segment
	 * \param op           binary functor: combined = op(acc, value)
	 * \param init         value emitted for empty segments
	 * \param context      unused on CPU (kept for API compatibility)
	 */
	template<typename input_it,
			 typename segments_it, typename output_it, typename op_t, typename type_t, typename context_t>
	void segreduce(input_it input, int count, segments_it segments,
				   int num_segments, output_it output, op_t op, type_t init,
				   context_t& context)
	{
		if (num_segments <= 0)
		{return;}

		for (int i = 0 ; i < num_segments ; i++)
		{
			int start = segments[i];
			// the last segment ends at count, all others at the next offset
			int stop = (i == num_segments - 1) ? count : (int)segments[i+1];

			if (start >= stop)
			{
				// empty segment: emit the neutral element
				output[i] = init;
				continue;
			}

			output[i] = input[start];
			for (int j = start + 1 ; j < stop ; j++)
			{output[i] = op(output[i],input[j]);}
		}
	}

	/*! \brief Sequential CPU emulation of mgpu::merge (key-value merge).
	 *
	 * Merges the two sorted key ranges a and b (with their attached values)
	 * into c, using `comp` as the strict ordering. On equal keys the element
	 * from a is taken first, so the merge is stable with a before b.
	 *
	 * \param a_keys,a_vals,a_count first sorted input range
	 * \param b_keys,b_vals,b_count second sorted input range
	 * \param c_keys,c_vals         output range (must hold a_count+b_count)
	 * \param comp                  strict weak ordering on keys
	 * \param context               unused on CPU (kept for API compatibility)
	 */
	template<typename a_keys_it, typename a_vals_it,
			 typename b_keys_it, typename b_vals_it,
			 typename c_keys_it, typename c_vals_it,
			 typename comp_t, typename context_t>
	void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count,
			   b_keys_it b_keys, b_vals_it b_vals, int b_count,
			   c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context)
	{
		int a_it = 0;
		int b_it = 0;
		int c_it = 0;

		while (a_it < a_count || b_it < b_count)
		{
			if (a_it < a_count)
			{
				if (b_it < b_count)
				{
					// both ranges non-empty: take the strictly smaller b,
					// otherwise a (ties go to a -> stable merge)
					if (comp(b_keys[b_it],a_keys[a_it]))
					{
						c_keys[c_it] = b_keys[b_it];
						c_vals[c_it] = b_vals[b_it];
						c_it++;
						b_it++;
					}
					else
					{
						c_keys[c_it] = a_keys[a_it];
						c_vals[c_it] = a_vals[a_it];
						c_it++;
						a_it++;
					}
				}
				else
				{
					// b exhausted: drain a
					c_keys[c_it] = a_keys[a_it];
					c_vals[c_it] = a_vals[a_it];
					c_it++;
					a_it++;
				}
			}
			else
			{
				// a exhausted: drain b
				c_keys[c_it] = b_keys[b_it];
				c_vals[c_it] = b_vals[b_it];
				c_it++;
				b_it++;
			}
		}
	}
}
static void init_alpaka()
{
if (__alpa_base__.initialized == true) {return;}
......@@ -221,6 +307,100 @@ static void init_alpaka()
__alpa_base__.initialized = true;
}
#ifdef PRINT_CUDA_LAUNCHES
#define CUDA_LAUNCH(cuda_call,ite, ...)\
{\
Vec_alpa const elementsPerThread(Vec_alpa::all(static_cast<Idx_alpa>(1)));\
Vec_alpa const grid_d((Idx_alpa)ite.wthr.x,(Idx_alpa)ite.wthr.y,(Idx_alpa)ite.wthr.z);\
Vec_alpa const thread_d((Idx_alpa)ite.thr.x,(Idx_alpa)ite.thr.y,(Idx_alpa)ite.thr.z);\
WorkDiv_alpa const workDiv = WorkDiv_alpa(grid_d,thread_d,elementsPerThread);\
\
gridDim.x = ite.wthr.x;\
gridDim.y = ite.wthr.y;\
gridDim.z = ite.wthr.z;\
\
blockDim.x = ite.thr.x;\
blockDim.y = ite.thr.y;\
blockDim.z = ite.thr.z;\
\
CHECK_SE_CLASS1_PRE\
\
std::cout << "Launching: " << #cuda_call << std::endl;\
\
alpaka::exec<Acc_alpa>(\
*__alpa_base__.queue,\
workDiv,\
[&] ALPAKA_FN_ACC(Acc_alpa const& acc) -> void {\
\
auto globalThreadIdx = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);\
auto globalBlockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);\
auto globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);\
\
blockIdx.x = globalBlockIdx[0];\
blockIdx.y = globalBlockIdx[1];\
blockIdx.z = globalBlockIdx[2];\
\
threadIdx.x = globalThreadIdx[0];\
threadIdx.y = globalThreadIdx[1];\
threadIdx.z = globalThreadIdx[2];\
\
__alpa_base__.accKer = &acc;\
\
cuda_call(__VA_ARGS__);\
});\
CHECK_SE_CLASS1_POST(#cuda_call,__VA_ARGS__)\
}