Skip to content
Snippets Groups Projects
Commit 3c70d3ad authored by Abhinav Singh's avatar Abhinav Singh
Browse files

Merge remote-tracking branch 'origin/master'

parents 760f8a91 0c4f0f5b
No related branches found
No related tags found
No related merge requests found
# Kokkos minimally requires 3.10 right now,
# but your project can set it higher
cmake_minimum_required(VERSION 3.10)
# Projects can safely mix languages - must have C++ support
# Kokkos flags will only apply to C++ files
project(Example CXX)
# You need this for using Kokkos_ROOT variable
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.12.0")
message(STATUS "Setting policy CMP0074 to use <Package>_ROOT variables")
cmake_policy(SET CMP0074 NEW)
endif()
find_package(OpenMP)
find_package(Threads)
find_package(MPI)
find_package(Boost 1.75.0 COMPONENTS program_options iostreams filesystem fiber context REQUIRED)
# Look for an installed Kokkos
find_package(openfpm 4.1.0 REQUIRED)
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-g -O3")
add_definitions( -D__NVCC__ -DCUDART_VERSION=11000 )
add_compile_options( -ftree-vectorize -fstrict-aliasing -ffast-math -march=skylake -mavx2 -mtune=skylake -mprefer-vector-width=256 )
# -fopt-info-vec-missed
add_executable(nbody main.cpp)
# This is the only thing required to set up compiler/linker flags
target_link_libraries(nbody openfpm::binary_config)
target_link_libraries(nbody OpenMP::OpenMP_CXX)
#include <Vector/map_vector.hpp>
#include <chrono>
//#include <memory/HeapMemory.cpp>
#define NELEMENTS 16777216
//#if defined __GNUC__ // Got the idea but it seems incorrect for clang __GNUC__ is defined, in clang it end here
//# define IVDEP _Pragma("GCC ivdep")
//#elif defined(_MSC_VER)
//# define IVDEP __pragma(loop(ivdep))
//#elif defined __clang__
//# define IVDEP _Pragma("clang loop vectorize(enable) interleave(enable) distribute(enable)")
#define IVDEP _Pragma("clang loop vectorize(assume_safety) interleave(assume_safety)")
//#else
//# error "Please define IVDEP for your compiler!"
//#endif
#define FLAT_AGGREGATE 0
template<typename vector_type, typename vector_type2>
__global__ void initialize_buff(vector_type vd_out, vector_type2 vd_in)
{
auto i = blockIdx.x * blockDim.x + threadIdx.x;
vd_in.template get<0>(i)[0] = i;
vd_in.template get<0>(i)[1] = i+100.0;
vd_out.template get<0>(i) = i+200.0;
vd_out.template get<1>(i)[0] = i;
vd_out.template get<1>(i)[1] = i+100.0;
vd_out.template get<2>(i)[0][0] = i;
vd_out.template get<2>(i)[0][1] = i+100.0;
vd_out.template get<2>(i)[1][0] = i+200.0;
vd_out.template get<2>(i)[1][1] = i+300.0;
}
template<typename vin_type, typename vout_type>
void initialize_buf(vin_type in, vout_type out)
{
auto ite = out.getGPUIterator(256);
CUDA_LAUNCH(initialize_buff,ite,out.toKernel(),in.toKernel());
}
namespace {
struct Stopwatch {
using clock = std::chrono::high_resolution_clock;
auto elapsedAndReset() -> double {
const auto now = clock::now();
const auto seconds = std::chrono::duration<double>{now - last}.count();
last = now;
return seconds;
}
private:
clock::time_point last = clock::now();
};
constexpr auto PROBLEM_SIZE = 16 * 1024;
constexpr auto STEPS = 5;
constexpr auto TIMESTEP = 0.0001f;
constexpr auto EPS2 = 0.01f;
template <typename Data, template <typename> typename LayoutBase>
using OpenFPMVector = openfpm::vector<Data, HeapMemory, LayoutBase>;
#if FLAT_AGGREGATE
using Data = aggregate<float, float, float, float, float, float, float>;
constexpr auto POS_X = 0;
constexpr auto POS_Y = 1;
constexpr auto POS_Z = 2;
constexpr auto VEL_X = 3;
constexpr auto VEL_Y = 4;
constexpr auto VEL_Z = 5;
constexpr auto MASS = 6;
#else
using Data = aggregate<float[3], float[3], float, float[3][3]>;
constexpr auto POS = 0;
constexpr auto VEL = 1;
constexpr auto MASS = 2;
constexpr auto TENS = 3;
#endif
//using Vector = OpenFPMVector<Data, memory_traits_lin>;
using Vector = OpenFPMVector<Data, memory_traits_inte>;
inline void pPInteraction(Vector& particles, int i, int j) {
#if FLAT_AGGREGATE
const float distanceX = particles.get<POS_X>(i) - particles.get<POS_X>(j);
const float distanceY = particles.get<POS_Y>(i) - particles.get<POS_Y>(j);
const float distanceZ = particles.get<POS_Z>(i) - particles.get<POS_Z>(j);
const float distanceSqrX = distanceX * distanceX;
const float distanceSqrY = distanceY * distanceY;
const float distanceSqrZ = distanceZ * distanceZ;
const float distSqr = EPS2 + distanceSqrX + distanceSqrY + distanceSqrZ;
const float distSixth = distSqr * distSqr * distSqr;
const float invDistCube = 1.0f / std::sqrt(distSixth);
const float sts = particles.get<MASS>(j) * invDistCube * TIMESTEP;
particles.get<VEL_X>(i) += distanceSqrX * sts;
particles.get<VEL_Y>(i) += distanceSqrY * sts;
particles.get<VEL_Z>(i) += distanceSqrZ * sts;
#else
const auto& pi = particles.get(i);
const auto& pj = particles.get(j);
const auto& posi = pi.get<POS>();
const auto& posj = pj.get<POS>();
const float distanceX = posi[0] - posj[0];
const float distanceY = posi[1] - posj[1];
const float distanceZ = posi[2] - posj[2];
const float distanceSqrX = distanceX * distanceX;
const float distanceSqrY = distanceY * distanceY;
const float distanceSqrZ = distanceZ * distanceZ;
const float distSqr = EPS2 + distanceSqrX + distanceSqrY + distanceSqrZ;
const float distSixth = distSqr * distSqr * distSqr;
const float invDistCube = 1.0f / std::sqrt(distSixth);
const float sts = pj.get<2>() * invDistCube * TIMESTEP;
auto veli = pi.get<1>();
veli[0] += distanceSqrX * sts;
veli[1] += distanceSqrY * sts;
veli[2] += distanceSqrZ * sts;
/////////////// Checking the proxies/view for lazy access
// std::cout << demangle(typeid(decltype(pj)).name()) << std::endl;
// std::cout << demangle(typeid(decltype(posj)).name()) << std::endl;
#endif
}
void update(Vector& particles) {
for (int i = 0; i < PROBLEM_SIZE; i++)
IVDEP
for (int j = 0; j < PROBLEM_SIZE; j++)
pPInteraction(particles, i, j);
}
void move(Vector& particles) {
IVDEP
for (std::size_t i = 0; i < PROBLEM_SIZE; i++) {
#if FLAT_AGGREGATE
particles.get(i).get<POS_X>() += particles.get(i).get<VEL_X>() * TIMESTEP;
particles.get(i).get<POS_Y>() += particles.get(i).get<VEL_Y>() * TIMESTEP;
particles.get(i).get<POS_Z>() += particles.get(i).get<VEL_Z>() * TIMESTEP;
#else
auto&& pi = particles.get(i);
const auto& veli = pi.get<1>();
auto&& posi = pi.get<0>();
posi[0] += veli[0] * TIMESTEP;
posi[1] += veli[1] * TIMESTEP;
posi[2] += veli[2] * TIMESTEP;
#endif
}
}
}
/*template<typename T>
class type_restrict
{
T ref;
public:
type_restrict(T & ref)
:ref(ref)
{}
type_restrict(type_restrict<T> & ref)
:ref(ref.ref)
{}
type_restrict<T> & operator=(type_restrict<T> & ref_)
{
T * __restrict__ out = &ref;
T * __restrict__ in = &ref_.ref;
*out = *in;
return *this;
}
template<typename Te, typename U = typename std::enable_if<std::is_fundamental<Te>::value>::type>
type_restrict<T> & operator=(const Te & obj)
{
ref = obj;
return *this;
}
};*/
int main() {
init_wrappers();
///////////// DEBUG ///////////////////
openfpm::vector_gpu<aggregate<float,float[2],float[2][2]>> out;
openfpm::vector_gpu<aggregate<float[2]>> in;
out.resize(PROBLEM_SIZE);
in.resize(PROBLEM_SIZE);
initialize_buf(in,out);
///////////////////////////////////////
Vector particles(PROBLEM_SIZE);
{
const auto& p0 = particles.get(0);
#if FLAT_AGGREGATE
std::cout << "addresses:\n"
<< &p0.get<POS_X>() << '\n'
<< &p0.get<POS_Y>() << '\n'
<< &p0.get<POS_Z>() << '\n'
<< &p0.get<VEL_X>() << '\n'
<< &p0.get<VEL_Y>() << '\n'
<< &p0.get<VEL_Z>() << '\n'
<< &p0.get<MASS>() << '\n';
#else
std::cout << "addresses:\n"
<< &p0.get<POS>()[0] << '\n'
<< &p0.get<POS>()[1] << '\n'
<< &p0.get<POS>()[2] << '\n'
<< &p0.get<VEL>()[0] << '\n'
<< &p0.get<VEL>()[1] << '\n'
<< &p0.get<VEL>()[2] << '\n'
<< &p0.get<MASS>() << '\n';
#endif
}
std::default_random_engine engine;
std::normal_distribution<float> dist(float(0), float(1));
for (auto i = 0; i < PROBLEM_SIZE; i++) {
#if FLAT_AGGREGATE
particles.get(i).get<POS_X>() = dist(engine);
particles.get(i).get<POS_Y>() = dist(engine);
particles.get(i).get<POS_Z>() = dist(engine);
particles.get(i).get<VEL_X>() = dist(engine) / float(10);
particles.get(i).get<VEL_Y>() = dist(engine) / float(10);
particles.get(i).get<VEL_Z>() = dist(engine) / float(10);
particles.get(i).get<MASS>() = dist(engine) / float(100);
#else
auto&& pi = particles.get(i);
pi.get<POS>()[0] = dist(engine);
pi.get<POS>()[1] = dist(engine);
pi.get<POS>()[2] = dist(engine);
pi.get<VEL>()[0] = dist(engine) / float(10);
pi.get<VEL>()[1] = dist(engine) / float(10);
pi.get<VEL>()[2] = dist(engine) / float(10);
pi.get<MASS>() = dist(engine) / float(100);
#endif
}
//////////////////// Test vector ///////////////
constexpr auto TIMESTEP = 0.0001f;
IVDEP
for (int i = 0; i < PROBLEM_SIZE; i++) {
const auto& pi = particles.get(i);
const auto& posi = pi.get<POS>();
IVDEP
for (int j = 0 ; j < PROBLEM_SIZE ; j++) {
const auto& pj = particles.get(j);
const auto& posj = pj.get<POS>();
const float distanceX = posi[0] - posj[0];
const float distanceY = posi[1] - posj[1];
const float distanceZ = posi[2] - posj[2];
const float distanceSqrX = distanceX * distanceX;
const float distanceSqrY = distanceY * distanceY;
const float distanceSqrZ = distanceZ * distanceZ;
const float distSqr = EPS2 + distanceSqrX + distanceSqrY + distanceSqrZ;
const float distSixth = distSqr * distSqr * distSqr;
const float invDistCube = 1.0f / sqrtf(distSixth);
const float sts = pj.get<2>() * invDistCube * TIMESTEP;
particles.get<VEL>(j)[0] += distanceSqrX * invDistCube * sts;
particles.get<VEL>(j)[1] += distanceSqrY * invDistCube * sts;
particles.get<VEL>(j)[2] += distanceSqrZ * invDistCube * sts;
}
}
///////////////////////////////////////////////
Stopwatch watch;
double sumUpdate = 0;
double sumMove = 0;
for (std::size_t s = 0; s < STEPS; ++s) {
update(particles);
sumUpdate += watch.elapsedAndReset();
move(particles);
sumMove += watch.elapsedAndReset();
}
std::cout << "openfpm\t" << sumUpdate / STEPS << '\t' << sumMove / STEPS << '\n';
{
#if FLAT_AGGREGATE
const auto& p0 = particles.get(0);
std::cout << "particle 0 pos: " << p0.get<0>() << " " << p0.get<1>() << " " << p0.get<2>() << '\n';
#else
const auto& pos0 = particles.get<POS>(0);
std::cout << "particle 0 pos: " << pos0[0] << " " << pos0[1] << " " << pos0[2] << '\n';
#endif
}
}
# Kokkos minimally requires 3.10 right now,
# but your project can set it higher
cmake_minimum_required(VERSION 3.10)
# Projects can safely mix languages - must have C++ support
# Kokkos flags will only apply to C++ files
project(Example CXX)
# You need this for using Kokkos_ROOT variable
if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.12.0")
message(STATUS "Setting policy CMP0074 to use <Package>_ROOT variables")
cmake_policy(SET CMP0074 NEW)
endif()
# Look for an installed Kokkos
find_package(Kokkos REQUIRED)
add_executable(teams team.cpp)
# This is the only thing required to set up compiler/linker flags
target_link_libraries(teams Kokkos::kokkos)
include example.mk
### This is a trick to avoid "Command not found if you no not have NVCC compiler". In practice the normal C++ compiler is used
### internally the example disable with the preprocessor its code if not compiled with nvcc
CUDA_CC=
CUDA_CC_LINK=
ifdef HIP
CUDA_CC=hipcc
CUDA_OPTIONS=-D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0
LIBS_SELECT=$(LIBS)
CC=hipcc
CUDA_CC_LINK=hipcc
else
ifdef CUDA_ON_CPU
CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
INCLUDE_PATH_NVCC=
CUDA_CC_LINK=mpic++
CUDA_OPTIONS=-D__NVCC__ -DCUDART_VERSION=11000
LIBS_SELECT=$(LIBS)
else
ifeq (, $(shell which nvcc))
CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
INCLUDE_PATH_NVCC=
CUDA_CC_LINK=mpic++
CUDA_OPTIONS=
LIBS_SELECT=$(LIBS)
else
CUDA_CC=nvcc -ccbin=mpic++
CUDA_CC_LINK=nvcc -ccbin=mpic++
CUDA_OPTIONS=-use_fast_math -arch=sm_61 -lineinfo
LIBS_SELECT=$(LIBS_NVCC)
endif
endif
endif
ifeq ($(PROFILE),ON)
CUDA_CC=scorep --nocompiler --cuda --mpp=mpi nvcc -ccbin=mpic++
CUDA_CC_LINK=scorep --nocompiler --cuda --mpp=mpi nvcc -ccbin=mpic++
else
CUDA_CC:=$(CUDA_CC)
CUDA_CC_LINK:=$(CUDA_CC_LINK)
endif
LDIR =
OPT=
OBJ = main.o
cuda:
sync_test: OPT += -DTEST_RUN
sync_dlb_test: sync_dlb
%.o: %.cu
$(CUDA_CC) -O3 $(OPT) $(CUDA_OPTIONS) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
%.o: %.cpp
$(CC) -O0 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
cuda: $(OBJ)
$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
sync_dlb2: $(OBJ)
$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
all: cuda
run: sync_dlb_test
mpirun --oversubscribe -np 2 ./sync_dlb
.PHONY: clean all run
clean:
rm -f *.o *~ core sync_dlb
#ifdef __NVCC__
#define PRINT_STACKTRACE
//#define STOP_ON_ERROR
#define OPENMPI
//#define SE_CLASS1
//#define USE_LOW_REGISTER_ITERATOR
#include "Vector/vector_dist.hpp"
#include <math.h>
#include "Draw/DrawParticles.hpp"
#include "util/stat/common_statistics.hpp"
__global__ void test1_syncthreads()
{
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
__syncthreads();
}
struct ite_g
{
dim3 wthr;
dim3 thr;
size_t nblocks()
{
return wthr.x * wthr.y * wthr.z;
}
size_t nthrs()
{
return thr.x * thr.y * thr.z;
}
};
int main(int argc, char* argv[])
{
// initialize the library
openfpm_init(&argc,&argv);
openfpm::vector<double> tele_ker;
ite_g g;
g.wthr = dim3(512*512,1,1);
g.thr = dim3(8,1,1);
for (int i = 0; i < 10; i++)
{
timer t_ker;
t_ker.start();
CUDA_LAUNCH(test1_syncthreads,g);
t_ker.stop();
std::cout << "TKERNEL: " << t_ker.getwct() << std::endl;
//////////////////////////////////////////////////////////////////////////////////////////////////
tele_ker.add(t_ker.getwct());
///////////////////////////////////////////////////////////////////////////////////////////////////////
}
double tele_ker_mean;
double tele_ker_dev;
standard_deviation(tele_ker,tele_ker_mean,tele_ker_dev);
std::cout << g.wthr.x*g.wthr.y*g.wthr.z << " " << g.thr.x << std::endl;
std::cout << "SYNCTHREAD LATENCY: " << tele_ker_mean / (g.wthr.x*g.wthr.y*g.wthr.z*24*g.thr.x) * 1e9 << " ns " << " error: " << tele_ker_dev << std::endl;
openfpm_finalize();
}
#else
int main(int argc, char* argv[])
{
return 0;
}
#endif
#include <Kokkos_Core.hpp>
#include <cstdio>
int main(int argc, char* argv[]) {
Kokkos::initialize(argc, argv);
printf("LayoutRight\n");
{
//////////////////////// TEAMS ////////////////////////
using Kokkos::TeamPolicy;
using Kokkos::parallel_for;
typedef TeamPolicy<Kokkos::OpenMP>::member_type member_type;
// Create an instance of the policy
int team_sz = 1;
int sz = 512;
TeamPolicy<Kokkos::OpenMP> policy (sz*sz, team_sz);
// Launch a kernel
Kokkos::fence();
Kokkos::Timer timer;
parallel_for (policy, KOKKOS_LAMBDA (member_type team_member) {
// Calculate a global thread id
int k = team_member.league_rank () * team_member.team_size () +
team_member.team_rank ();
// Calculate the sum of the global thread ids of this team
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
team_member.team_barrier();
// Atomically add the value to a global value
});
Kokkos::fence();
double time = timer.seconds();
std::cout << "TIME: " << time / (sz*sz*team_sz*24) * 1e9 << " ns" << std::endl;
///////////////////////////////////////////////////////
}
printf("LayoutLeft\n");
Kokkos::finalize();
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment