From a2ce93f1f21ab80a3ab053251813f5ab9acc0dde Mon Sep 17 00:00:00 2001
From: Incardona Pietro <incardon@mpi-cbg.de>
Date: Mon, 13 Dec 2021 18:48:54 +0100
Subject: [PATCH] Memory BW added

---
 example/Performance/memBW/Makefile | 62 ++++++++++++++++++++
 example/Performance/memBW/main.cu  | 89 ++++++++++++++++++++++++++++++
 2 files changed, 151 insertions(+)
 create mode 100644 example/Performance/memBW/Makefile
 create mode 100644 example/Performance/memBW/main.cu

diff --git a/example/Performance/memBW/Makefile b/example/Performance/memBW/Makefile
new file mode 100644
index 000000000..a5198f186
--- /dev/null
+++ b/example/Performance/memBW/Makefile
@@ -0,0 +1,62 @@
+include ../../example.mk
+
+### This is a trick to avoid "Command not found if you do not have NVCC compiler". In practice the normal C++ compiler is used
+### internally the example disable with the preprocessor its code if not compiled with nvcc
+CUDA_CC=
+CUDA_CC_LINK=
+
+CC=mpic++
+ifdef HIP
+    CUDA_CC=hipcc
+    CUDA_OPTIONS= -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0
+    LIBS_SELECT=$(LIBS)
+    CC=hipcc
+    CUDA_CC_LINK=hipcc
+else
+    ifdef CUDA_ON_CPU
+        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        INCLUDE_PATH_NVCC=
+        CUDA_CC_LINK=mpic++
+        CUDA_OPTIONS=-D__NVCC__ -DCUDART_VERSION=11000
+        LIBS_SELECT=$(LIBS)
+    else
+        ifeq (, $(shell which nvcc))
+            CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+            INCLUDE_PATH_NVCC=
+            CUDA_CC_LINK=mpic++
+            LIBS_SELECT=$(LIBS)
+        else
+            CUDA_CC=nvcc -ccbin=mpic++
+            CUDA_CC_LINK=nvcc -ccbin=mpic++
+            LIBS_SELECT=$(LIBS_NVCC)
+        endif
+    endif
+endif
+
+CC=mpic++
+
+LDIR =
+
+OBJ = main.o
+
+miniBUDE:
+
+%.o: %.cu
+	$(CUDA_CC) -g -O3 $(CUDA_OPTIONS) $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+
+%.o: %.cpp
+	$(CC) -g -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
+
+miniBUDE: $(OBJ)
+	$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
+
+all: miniBUDE
+
+run: miniBUDE
+	mpirun --oversubscribe -np 2 ./miniBUDE
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core miniBUDE
+
diff --git a/example/Performance/memBW/main.cu b/example/Performance/memBW/main.cu
new file mode 100644
index 000000000..dd08ed890
--- /dev/null
+++ b/example/Performance/memBW/main.cu
@@ -0,0 +1,89 @@
+#include "Vector/map_vector.hpp"
+#include "util/stat/common_statistics.hpp"
+
+
+/*! \brief Bandwidth test kernel: read the float[2] input of one element and fill the scalar, vector and tensor outputs
+ *
+ * Each thread processes the element p = blockIdx.x * blockDim.x + threadIdx.x and returns
+ * immediately when p is not smaller than vd_out.size().
+ *
+ * The kernel handles are taken by value: a reference parameter of a __global__ function
+ * would refer to the host-side object given at launch.
+ *
+ * \param vd_out output kernel handle, properties 0,1,2 are written
+ * \param vd_in input kernel handle, property 0 is read and then incremented by 0.01
+ *
+ */
+template<typename vector_type, typename vector_type2>
+__attribute__((always_inline)) inline __global__ void translate_fill_prop(vector_type vd_out, vector_type2 vd_in)
+{
+	auto p = blockIdx.x * blockDim.x + threadIdx.x;
+
+	// the launch grid may be rounded up to whole blocks: skip the threads past the last element
+	if (p >= vd_out.size())	{return;}
+
+	vd_out.template get<0>(p) = vd_in.template get<0>(p)[0] + vd_in.template get<0>(p)[1];
+
+	vd_out.template get<1>(p)[0] = vd_in.template get<0>(p)[0];
+	vd_out.template get<1>(p)[1] = vd_in.template get<0>(p)[1];
+
+	vd_out.template get<2>(p)[0][0] = vd_in.template get<0>(p)[0];
+	vd_out.template get<2>(p)[0][1] = vd_in.template get<0>(p)[1];
+	vd_out.template get<2>(p)[1][0] = vd_in.template get<0>(p)[0] + vd_in.template get<0>(p)[1];
+	vd_out.template get<2>(p)[1][1] = vd_in.template get<0>(p)[1] - vd_in.template get<0>(p)[0];
+
+	vd_in.template get<0>(p)[0] += 0.01f;
+	vd_in.template get<0>(p)[1] += 0.01f;
+}
+
+
+/*! \brief Launch the kernel 100 times on 16777216 elements and print the time and the bandwidth estimate of every launch
+ *
+ */
+int main(int argc, char *argv[])
+{
+	init_wrappers();
+
+	openfpm::vector_gpu<aggregate<float,float[2],float[2][2]>> out;
+	openfpm::vector_gpu<aggregate<float[2]>> in;
+
+	int nele = 16777216;
+
+	out.resize(nele);
+	in.resize(nele);
+
+	// NOTE(review): the input is filled with host-side accessors and no host-to-device copy is visible here, confirm whether one is needed
+	for (int i = 0 ; i < nele ; i++)
+	{
+		in.template get<0>(i)[0] = i;
+		in.template get<0>(i)[1] = i+100.0;
+	}
+
+	auto ite = out.getGPUIterator(256);
+
+	for (int i = 0 ; i < 100 ; i++)
+	{
+		cudaDeviceSynchronize();
+		timer t;
+		t.start();
+
+		auto vout = out.toKernel();
+		auto vin = in.toKernel();
+
+		CUDA_LAUNCH(translate_fill_prop,ite,vout,vin);
+
+		cudaDeviceSynchronize();
+
+		t.stop();
+
+		// read the timer once so that both printed lines use the same measurement
+		auto elapsed = t.getwct();
+
+		// the byte count is computed in double: nele*4*19 is already above 2^30 in int and overflows for a larger nele
+		// NOTE(review): 19 is the number of 4-byte accesses per element assumed by the original formula, confirm it
+		double bytes = (double)nele * 4 * 19;
+
+		std::cout << "Time: " << elapsed << std::endl;
+		std::cout << "BW: " << bytes / elapsed * 1e-9 << " GB/s" << std::endl;
+	}
+}
-- 
GitLab