Merge remote-tracking branch 'origin/master'

3c70d3ad · Abhinav Singh · 760f8a91 · 0c4f0f5b · 3c70d3ad · 3c70d3ad
Commit 3c70d3ad authored 1 year ago by Abhinav Singh
--- a/example/Performance/Nbody_benchmark/CMakeLists.txt
+++ b/example/Performance/Nbody_benchmark/CMakeLists.txt
+# Kokkos minimally requires 3.10 right now,
+# but your project can set it higher
+cmake_minimum_required(VERSION 3.10)
+
+# Projects can safely mix languages - must have C++ support
+# Kokkos flags will only apply to C++ files
+project(Example CXX)
+
+# You need this for using Kokkos_ROOT variable
+if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.12.0")
+  message(STATUS "Setting policy CMP0074 to use <Package>_ROOT variables")
+  cmake_policy(SET CMP0074 NEW)
+endif()
+
+find_package(OpenMP)
+find_package(Threads)
+find_package(MPI)
+find_package(Boost 1.75.0 COMPONENTS program_options iostreams filesystem fiber context REQUIRED)
+
+# Look for an installed Kokkos
+find_package(openfpm 4.1.0 REQUIRED)
+
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-g -O3")
+
+add_definitions( -D__NVCC__ -DCUDART_VERSION=11000 )
+add_compile_options(  -ftree-vectorize  -fstrict-aliasing -ffast-math  -march=skylake -mavx2 -mtune=skylake -mprefer-vector-width=256  )
+
+# -fopt-info-vec-missed
+
+add_executable(nbody main.cpp)
+
+# This is the only thing required to set up compiler/linker flags
+target_link_libraries(nbody  openfpm::binary_config)
+target_link_libraries(nbody OpenMP::OpenMP_CXX)
--- a/example/Performance/Nbody_benchmark/main.cpp
+++ b/example/Performance/Nbody_benchmark/main.cpp
+#include <Vector/map_vector.hpp>
+#include <chrono>
+//#include <memory/HeapMemory.cpp>
+
+#define NELEMENTS 16777216
+
+//#if defined __GNUC__                       // Got the idea but it seems incorrect for clang __GNUC__ is defined, in clang it end here
+//#	define IVDEP _Pragma("GCC ivdep")
+//#elif defined(_MSC_VER)
+//#	define IVDEP __pragma(loop(ivdep))
+//#elif defined __clang__
+//#	define IVDEP _Pragma("clang loop vectorize(enable) interleave(enable) distribute(enable)")
+#define IVDEP _Pragma("clang loop vectorize(assume_safety) interleave(assume_safety)")
+//#else
+//#	error "Please define IVDEP for your compiler!"
+//#endif
+
+#define FLAT_AGGREGATE 0
+
+template<typename vector_type, typename vector_type2>
+__global__ void initialize_buff(vector_type vd_out, vector_type2 vd_in)
+{
+        auto i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    vd_in.template get<0>(i)[0] = i;
+    vd_in.template get<0>(i)[1] = i+100.0;
+
+    vd_out.template get<0>(i) = i+200.0;
+
+    vd_out.template get<1>(i)[0] = i;
+    vd_out.template get<1>(i)[1] = i+100.0;
+
+    vd_out.template get<2>(i)[0][0] = i;
+    vd_out.template get<2>(i)[0][1] = i+100.0;
+    vd_out.template get<2>(i)[1][0] = i+200.0;
+    vd_out.template get<2>(i)[1][1] = i+300.0;
+}
+
+template<typename vin_type, typename vout_type>
+void initialize_buf(vin_type in, vout_type out)
+{
+    auto ite = out.getGPUIterator(256);
+    CUDA_LAUNCH(initialize_buff,ite,out.toKernel(),in.toKernel());
+}
+
+
+namespace {
+	struct Stopwatch {
+		using clock = std::chrono::high_resolution_clock;
+
+		auto elapsedAndReset() -> double {
+			const auto now = clock::now();
+			const auto seconds = std::chrono::duration<double>{now - last}.count();
+			last = now;
+			return seconds;
+		}
+
+	private:
+		clock::time_point last = clock::now();
+	};
+
+	constexpr auto PROBLEM_SIZE = 16 * 1024;
+	constexpr auto STEPS = 5;
+	constexpr auto TIMESTEP = 0.0001f;
+	constexpr auto EPS2 = 0.01f;
+
+	template <typename Data, template <typename> typename LayoutBase>
+	using OpenFPMVector = openfpm::vector<Data, HeapMemory, LayoutBase>;
+
+#if FLAT_AGGREGATE
+	using Data = aggregate<float, float, float, float, float, float, float>;
+
+        constexpr auto POS_X = 0;
+        constexpr auto POS_Y = 1;
+        constexpr auto POS_Z = 2;
+        constexpr auto VEL_X = 3;
+        constexpr auto VEL_Y = 4;
+        constexpr auto VEL_Z = 5;
+        constexpr auto MASS = 6;
+
+#else
+	using Data = aggregate<float[3], float[3], float, float[3][3]>;
+
+        constexpr auto POS = 0;
+        constexpr auto VEL = 1;
+        constexpr auto MASS = 2;
+        constexpr auto TENS = 3;
+
+#endif
+
+	//using Vector = OpenFPMVector<Data, memory_traits_lin>;
+	using Vector = OpenFPMVector<Data, memory_traits_inte>;
+
+
+	inline void pPInteraction(Vector& particles, int i, int j) {
+#if FLAT_AGGREGATE
+		const float distanceX = particles.get<POS_X>(i) - particles.get<POS_X>(j);
+		const float distanceY = particles.get<POS_Y>(i) - particles.get<POS_Y>(j);
+		const float distanceZ = particles.get<POS_Z>(i) - particles.get<POS_Z>(j);
+		const float distanceSqrX = distanceX * distanceX;
+		const float distanceSqrY = distanceY * distanceY;
+		const float distanceSqrZ = distanceZ * distanceZ;
+		const float distSqr = EPS2 + distanceSqrX + distanceSqrY + distanceSqrZ;
+		const float distSixth = distSqr * distSqr * distSqr;
+		const float invDistCube = 1.0f / std::sqrt(distSixth);
+		const float sts = particles.get<MASS>(j) * invDistCube * TIMESTEP;
+		particles.get<VEL_X>(i) += distanceSqrX * sts;
+		particles.get<VEL_Y>(i) += distanceSqrY * sts;
+		particles.get<VEL_Z>(i) += distanceSqrZ * sts;
+#else
+		const auto& pi = particles.get(i);
+		const auto& pj = particles.get(j);
+		const auto& posi = pi.get<POS>();
+		const auto& posj = pj.get<POS>();
+		const float distanceX = posi[0] - posj[0];
+		const float distanceY = posi[1] - posj[1];
+		const float distanceZ = posi[2] - posj[2];
+		const float distanceSqrX = distanceX * distanceX;
+		const float distanceSqrY = distanceY * distanceY;
+		const float distanceSqrZ = distanceZ * distanceZ;
+		const float distSqr = EPS2 + distanceSqrX + distanceSqrY + distanceSqrZ;
+		const float distSixth = distSqr * distSqr * distSqr;
+		const float invDistCube = 1.0f / std::sqrt(distSixth);
+		const float sts = pj.get<2>() * invDistCube * TIMESTEP;
+		auto veli = pi.get<1>();
+		veli[0] += distanceSqrX * sts;
+		veli[1] += distanceSqrY * sts;
+		veli[2] += distanceSqrZ * sts;
+
+/////////////// Checking the proxies/view for lazy access
+//             	std::cout << demangle(typeid(decltype(pj)).name()) << std::endl;
+//		std::cout << demangle(typeid(decltype(posj)).name()) << std::endl;
+
+#endif
+	}
+
+	void update(Vector& particles) {
+		for (int i = 0; i < PROBLEM_SIZE; i++)
+			IVDEP
+			for (int j = 0; j < PROBLEM_SIZE; j++)
+				pPInteraction(particles, i, j);
+	}
+
+	void move(Vector& particles) {
+		IVDEP
+		for (std::size_t i = 0; i < PROBLEM_SIZE; i++) {
+#if FLAT_AGGREGATE
+			particles.get(i).get<POS_X>() += particles.get(i).get<VEL_X>() * TIMESTEP;
+			particles.get(i).get<POS_Y>() += particles.get(i).get<VEL_Y>() * TIMESTEP;
+			particles.get(i).get<POS_Z>() += particles.get(i).get<VEL_Z>() * TIMESTEP;
+#else
+			auto&& pi = particles.get(i);
+			const auto& veli = pi.get<1>();
+			auto&& posi = pi.get<0>();
+			posi[0] += veli[0] * TIMESTEP;
+			posi[1] += veli[1] * TIMESTEP;
+			posi[2] += veli[2] * TIMESTEP;
+#endif
+		}
+	}
+}
+
+/*template<typename T>
+class type_restrict
+{
+	T ref;
+
+public:
+
+	type_restrict(T & ref)
+	:ref(ref)
+	{}
+
+	type_restrict(type_restrict<T> & ref)
+	:ref(ref.ref)
+	{}
+
+	type_restrict<T> & operator=(type_restrict<T> & ref_) 
+	{
+		T * __restrict__ out = &ref;
+		T * __restrict__ in = &ref_.ref;
+
+		*out = *in;
+		return *this;
+	}
+
+	template<typename Te, typename U = typename std::enable_if<std::is_fundamental<Te>::value>::type>
+        type_restrict<T> & operator=(const Te & obj)
+        {
+                ref = obj;
+                return *this;
+        }
+
+};*/
+
+int main() {
+
+    init_wrappers();
+
+///////////// DEBUG ///////////////////
+
+    openfpm::vector_gpu<aggregate<float,float[2],float[2][2]>> out;
+    openfpm::vector_gpu<aggregate<float[2]>> in;
+
+
+    out.resize(PROBLEM_SIZE);
+    in.resize(PROBLEM_SIZE);
+
+    initialize_buf(in,out);
+
+
+///////////////////////////////////////
+
+	Vector particles(PROBLEM_SIZE);
+
+	{
+		const auto& p0 = particles.get(0);
+#if FLAT_AGGREGATE
+		std::cout << "addresses:\n"
+				  << &p0.get<POS_X>() << '\n'
+				  << &p0.get<POS_Y>() << '\n'
+				  << &p0.get<POS_Z>() << '\n'
+				  << &p0.get<VEL_X>() << '\n'
+				  << &p0.get<VEL_Y>() << '\n'
+				  << &p0.get<VEL_Z>() << '\n'
+				  << &p0.get<MASS>() << '\n';
+#else
+		std::cout << "addresses:\n"
+				  << &p0.get<POS>()[0] << '\n'
+				  << &p0.get<POS>()[1] << '\n'
+				  << &p0.get<POS>()[2] << '\n'
+				  << &p0.get<VEL>()[0] << '\n'
+				  << &p0.get<VEL>()[1] << '\n'
+				  << &p0.get<VEL>()[2] << '\n'
+				  << &p0.get<MASS>() << '\n';
+#endif
+	}
+
+	std::default_random_engine engine;
+	std::normal_distribution<float> dist(float(0), float(1));
+	for (auto i = 0; i < PROBLEM_SIZE; i++) {
+#if FLAT_AGGREGATE
+		particles.get(i).get<POS_X>() = dist(engine);
+		particles.get(i).get<POS_Y>() = dist(engine);
+		particles.get(i).get<POS_Z>() = dist(engine);
+		particles.get(i).get<VEL_X>() = dist(engine) / float(10);
+		particles.get(i).get<VEL_Y>() = dist(engine) / float(10);
+		particles.get(i).get<VEL_Z>() = dist(engine) / float(10);
+		particles.get(i).get<MASS>() = dist(engine) / float(100);
+#else
+		auto&& pi = particles.get(i);
+		pi.get<POS>()[0] = dist(engine);
+		pi.get<POS>()[1] = dist(engine);
+		pi.get<POS>()[2] = dist(engine);
+		pi.get<VEL>()[0] = dist(engine) / float(10);
+		pi.get<VEL>()[1] = dist(engine) / float(10);
+		pi.get<VEL>()[2] = dist(engine) / float(10);
+		pi.get<MASS>() = dist(engine) / float(100);
+#endif
+	}
+
+//////////////////// Test vector ///////////////
+
+	constexpr auto TIMESTEP = 0.0001f;
+
+	IVDEP
+	for (int i = 0; i < PROBLEM_SIZE; i++) {
+		const auto& pi = particles.get(i);
+		const auto& posi = pi.get<POS>();
+        	IVDEP
+        	for (int j = 0 ; j < PROBLEM_SIZE ; j++) {
+                	const auto& pj = particles.get(j);
+                	const auto& posj = pj.get<POS>();
+
+                	const float distanceX = posi[0] -  posj[0];
+                	const float distanceY = posi[1] - posj[1];
+                	const float distanceZ = posi[2] - posj[2];
+
+                	const float distanceSqrX = distanceX * distanceX;
+                	const float distanceSqrY = distanceY * distanceY;
+                	const float distanceSqrZ = distanceZ * distanceZ;
+
+			const float distSqr = EPS2 + distanceSqrX + distanceSqrY + distanceSqrZ;
+                	const float distSixth = distSqr * distSqr * distSqr;
+                	const float invDistCube = 1.0f / sqrtf(distSixth);
+                	const float sts = pj.get<2>() * invDistCube * TIMESTEP;
+
+                	particles.get<VEL>(j)[0] += distanceSqrX * invDistCube  * sts;
+                	particles.get<VEL>(j)[1] += distanceSqrY * invDistCube * sts;
+                	particles.get<VEL>(j)[2] += distanceSqrZ * invDistCube * sts;
+        	}
+	}
+
+///////////////////////////////////////////////
+
+	Stopwatch watch;
+	double sumUpdate = 0;
+	double sumMove = 0;
+	for (std::size_t s = 0; s < STEPS; ++s) {
+		update(particles);
+		sumUpdate += watch.elapsedAndReset();
+		move(particles);
+		sumMove += watch.elapsedAndReset();
+	}
+	std::cout << "openfpm\t" << sumUpdate / STEPS << '\t' << sumMove / STEPS << '\n';
+
+	{
+#if FLAT_AGGREGATE
+		const auto& p0 = particles.get(0);
+		std::cout << "particle 0 pos: " << p0.get<0>() << " " << p0.get<1>() << " " << p0.get<2>() << '\n';
+#else
+		const auto& pos0 = particles.get<POS>(0);
+		std::cout << "particle 0 pos: " << pos0[0] << " " << pos0[1] << " " << pos0[2] << '\n';
+#endif
+	}
+}
--- a/example/Performance/Syncthreads_kokkos_benchmark/CMakeLists.txt
+++ b/example/Performance/Syncthreads_kokkos_benchmark/CMakeLists.txt
+# Kokkos minimally requires 3.10 right now,
+# but your project can set it higher
+cmake_minimum_required(VERSION 3.10)
+
+# Projects can safely mix languages - must have C++ support
+# Kokkos flags will only apply to C++ files
+project(Example CXX)
+
+# You need this for using Kokkos_ROOT variable
+if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.12.0")
+  message(STATUS "Setting policy CMP0074 to use <Package>_ROOT variables")
+  cmake_policy(SET CMP0074 NEW)
+endif()
+
+# Look for an installed Kokkos
+find_package(Kokkos REQUIRED)
+
+add_executable(teams team.cpp)
+
+# This is the only thing required to set up compiler/linker flags
+target_link_libraries(teams Kokkos::kokkos)
--- a/example/Performance/Syncthreads_kokkos_benchmark/Makefile
+++ b/example/Performance/Syncthreads_kokkos_benchmark/Makefile
+include example.mk
+
+### This is a trick to avoid "Command not found if you no not have NVCC compiler". In practice the normal C++ compiler is used
+### internally the example disable with the preprocessor its code if not compiled with nvcc 
+CUDA_CC=
+CUDA_CC_LINK=
+ifdef HIP
+        CUDA_CC=hipcc
+        CUDA_OPTIONS=-D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0
+        LIBS_SELECT=$(LIBS)
+        CC=hipcc
+        CUDA_CC_LINK=hipcc
+else
+	ifdef CUDA_ON_CPU
+        	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        	INCLUDE_PATH_NVCC=
+        	CUDA_CC_LINK=mpic++
+		CUDA_OPTIONS=-D__NVCC__ -DCUDART_VERSION=11000
+		LIBS_SELECT=$(LIBS)
+	else
+        	ifeq (, $(shell which nvcc))
+                	CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+                	INCLUDE_PATH_NVCC=
+                	CUDA_CC_LINK=mpic++
+			CUDA_OPTIONS=
+			LIBS_SELECT=$(LIBS)
+        	else
+                	CUDA_CC=nvcc -ccbin=mpic++
+                	CUDA_CC_LINK=nvcc -ccbin=mpic++
+			CUDA_OPTIONS=-use_fast_math  -arch=sm_61 -lineinfo
+			LIBS_SELECT=$(LIBS_NVCC)
+        	endif
+	endif
+endif
+
+
+ifeq ($(PROFILE),ON)
+        CUDA_CC=scorep --nocompiler  --cuda --mpp=mpi nvcc -ccbin=mpic++
+        CUDA_CC_LINK=scorep --nocompiler  --cuda --mpp=mpi nvcc -ccbin=mpic++
+else
+	CUDA_CC:=$(CUDA_CC)
+	CUDA_CC_LINK:=$(CUDA_CC_LINK)
+endif
+
+LDIR =
+OPT=
+
+OBJ = main.o
+
+cuda:
+sync_test: OPT += -DTEST_RUN
+sync_dlb_test: sync_dlb
+
+%.o: %.cu
+	$(CUDA_CC) -O3 $(OPT) $(CUDA_OPTIONS)  -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+
+%.o: %.cpp
+	$(CC) -O0 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
+
+cuda: $(OBJ)
+	$(CUDA_CC_LINK) -o  $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
+
+sync_dlb2: $(OBJ)
+	$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
+
+all: cuda
+
+run: sync_dlb_test
+	mpirun --oversubscribe -np 2 ./sync_dlb
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core sync_dlb
+
--- a/example/Performance/Syncthreads_kokkos_benchmark/main.cu
+++ b/example/Performance/Syncthreads_kokkos_benchmark/main.cu
+
+#ifdef __NVCC__
+
+#define PRINT_STACKTRACE
+//#define STOP_ON_ERROR
+#define OPENMPI
+//#define SE_CLASS1
+
+//#define USE_LOW_REGISTER_ITERATOR
+
+#include "Vector/vector_dist.hpp"
+#include <math.h>
+#include "Draw/DrawParticles.hpp"
+#include "util/stat/common_statistics.hpp"
+
+
+__global__ void test1_syncthreads()
+{
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+    __syncthreads();
+}
+
+
+struct ite_g
+{
+    dim3 wthr;
+    dim3 thr;
+
+    size_t nblocks()
+	{
+		return wthr.x * wthr.y * wthr.z;
+	}
+
+	size_t nthrs()
+	{
+		return thr.x * thr.y * thr.z;
+	}
+};
+
+int main(int argc, char* argv[])
+{
+
+    // initialize the library
+	openfpm_init(&argc,&argv);
+
+	openfpm::vector<double> tele_ker;
+
+	ite_g g;
+
+    	g.wthr = dim3(512*512,1,1);
+    	g.thr = dim3(8,1,1);
+
+	for (int i = 0; i < 10; i++)
+	{
+            timer t_ker;
+            t_ker.start();
+
+            CUDA_LAUNCH(test1_syncthreads,g);
+
+            t_ker.stop();
+
+	    std::cout << "TKERNEL: " << t_ker.getwct() << std::endl;
+	    
+
+
+
+	//////////////////////////////////////////////////////////////////////////////////////////////////
+
+	    tele_ker.add(t_ker.getwct());
+
+
+      ///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+	}
+
+	double tele_ker_mean;
+        double tele_ker_dev;
+	standard_deviation(tele_ker,tele_ker_mean,tele_ker_dev);
+
+	std::cout << g.wthr.x*g.wthr.y*g.wthr.z << "  " << g.thr.x << std::endl;
+	std::cout << "SYNCTHREAD LATENCY: " << tele_ker_mean / (g.wthr.x*g.wthr.y*g.wthr.z*24*g.thr.x) * 1e9 << " ns " << " error: " << tele_ker_dev << std::endl;
+
+	openfpm_finalize();
+}
+ 
+#else
+
+int main(int argc, char* argv[])
+{
+        return 0;
+}
+
+#endif
--- a/example/Performance/Syncthreads_kokkos_benchmark/team.cpp
+++ b/example/Performance/Syncthreads_kokkos_benchmark/team.cpp
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+
+
+
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc, argv);
+
+  printf("LayoutRight\n");
+
+  {
+
+
+
+    //////////////////////// TEAMS ////////////////////////
+
+    using Kokkos::TeamPolicy;
+    using Kokkos::parallel_for;
+
+    typedef TeamPolicy<Kokkos::OpenMP>::member_type member_type;
+    // Create an instance of the policy
+    int team_sz = 1;
+    int sz = 512;
+    TeamPolicy<Kokkos::OpenMP> policy (sz*sz, team_sz);
+    // Launch a kernel
+    
+    Kokkos::fence();
+    Kokkos::Timer timer;  
+
+    parallel_for (policy, KOKKOS_LAMBDA (member_type team_member) {
+        // Calculate a global thread id
+         int k = team_member.league_rank () * team_member.team_size () +
+                team_member.team_rank ();
+        // Calculate the sum of the global thread ids of this team
+	team_member.team_barrier();
+	team_member.team_barrier();
+	team_member.team_barrier();
+	team_member.team_barrier();
+	team_member.team_barrier();
+	team_member.team_barrier();
+
+        team_member.team_barrier();
+        team_member.team_barrier();
+        team_member.team_barrier();
+        team_member.team_barrier();
+        team_member.team_barrier();
+        team_member.team_barrier();
+
+        team_member.team_barrier();
+        team_member.team_barrier();
+        team_member.team_barrier();
+        team_member.team_barrier();
+        team_member.team_barrier();
+        team_member.team_barrier();
+
+        team_member.team_barrier();
+        team_member.team_barrier();
+        team_member.team_barrier();
+        team_member.team_barrier();
+        team_member.team_barrier();
+        team_member.team_barrier();
+
+         // Atomically add the value to a global value
+      });
+
+      Kokkos::fence();
+        double time = timer.seconds();
+        std::cout << "TIME: " << time / (sz*sz*team_sz*24) * 1e9  << " ns"  << std::endl; 
+
+    ///////////////////////////////////////////////////////
+
+  }
+  printf("LayoutLeft\n");
+
+  Kokkos::finalize();
+}
+