From a2ce93f1f21ab80a3ab053251813f5ab9acc0dde Mon Sep 17 00:00:00 2001
From: Incardona Pietro <incardon@mpi-cbg.de>
Date: Mon, 13 Dec 2021 18:48:54 +0100
Subject: [PATCH] Memory BW added

---
 example/Performance/memBW/Makefile | 62 ++++++++++++++++++++++++++++++
 example/Performance/memBW/main.cu  | 62 ++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 example/Performance/memBW/Makefile
 create mode 100644 example/Performance/memBW/main.cu

diff --git a/example/Performance/memBW/Makefile b/example/Performance/memBW/Makefile
new file mode 100644
index 000000000..a5198f186
--- /dev/null
+++ b/example/Performance/memBW/Makefile
@@ -0,0 +1,62 @@
+include ../../example.mk
+
+### This is a trick to avoid "Command not found" if you do not have the NVCC compiler. In that case the normal C++ compiler is used
+### and the example disables its own code with the preprocessor when it is not compiled with nvcc
+CUDA_CC=
+CUDA_CC_LINK=
+
+### Toolchain selection: HIP -> hipcc; CUDA_ON_CPU, or no nvcc found in PATH -> mpic++ compiling the .cu as plain C++;
+### otherwise nvcc with mpic++ as host compiler. CC (used by the .cpp rule) is mpic++ unless HIP is defined.
+CC=mpic++
+ifdef HIP
+    CUDA_CC=hipcc
+    CUDA_OPTIONS= -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0
+    LIBS_SELECT=$(LIBS)
+    CC=hipcc
+    CUDA_CC_LINK=hipcc
+else
+    ifdef CUDA_ON_CPU
+        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+        INCLUDE_PATH_NVCC=
+        CUDA_CC_LINK=mpic++
+        CUDA_OPTIONS=-D__NVCC__ -DCUDART_VERSION=11000
+        LIBS_SELECT=$(LIBS)
+    else
+        ifeq (, $(shell which nvcc))
+            CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
+            INCLUDE_PATH_NVCC=
+            CUDA_CC_LINK=mpic++
+            LIBS_SELECT=$(LIBS)
+        else
+            CUDA_CC=nvcc -ccbin=mpic++
+            CUDA_CC_LINK=nvcc -ccbin=mpic++
+            LIBS_SELECT=$(LIBS_NVCC)
+        endif
+    endif
+endif
+
+LDIR =
+OBJ = main.o
+
+### Empty forward declaration: keeps the benchmark executable as the first explicit target of this file
+memBW:
+
+%.o: %.cu
+	$(CUDA_CC) -g -O3 $(CUDA_OPTIONS) $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+
+%.o: %.cpp
+	$(CC) -g -O3 $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH)
+
+memBW: $(OBJ)
+	$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)
+
+all: memBW
+
+run: memBW
+	mpirun --oversubscribe -np 2 ./memBW
+
+.PHONY: clean all run
+
+clean:
+	rm -f *.o *~ core memBW
+
diff --git a/example/Performance/memBW/main.cu b/example/Performance/memBW/main.cu
new file mode 100644
index 000000000..dd08ed890
--- /dev/null
+++ b/example/Performance/memBW/main.cu
@@ -0,0 +1,62 @@
+#include "Vector/map_vector.hpp"
+#include "util/stat/common_statistics.hpp"
+
+
+// Memory-bandwidth workload: thread p (x-dimension index) reads vd_in<0>(p)[0..1], fills properties 0,1,2
+// of vd_out at p and finally adds 0.01f to both inputs. Threads past the end of either vector exit early.
+// The kernel-side vectors are taken by value: device code cannot follow a reference to a host-side object.
+template<typename vector_type, typename vector_type2>
+__attribute__((always_inline)) inline __global__ void translate_fill_prop(vector_type vd_out, vector_type2 vd_in)
+{
+	auto p = blockIdx.x * blockDim.x + threadIdx.x;
+	if (p >= vd_out.size() || p >= vd_in.size()) {return;}
+
+	vd_out.template get<0>(p) = vd_in.template get<0>(p)[0] + vd_in.template get<0>(p)[1];
+	vd_out.template get<1>(p)[0] = vd_in.template get<0>(p)[0];
+	vd_out.template get<1>(p)[1] = vd_in.template get<0>(p)[1];
+	vd_out.template get<2>(p)[0][0] = vd_in.template get<0>(p)[0];
+	vd_out.template get<2>(p)[0][1] = vd_in.template get<0>(p)[1];
+	vd_out.template get<2>(p)[1][0] = vd_in.template get<0>(p)[0] + vd_in.template get<0>(p)[1];
+	vd_out.template get<2>(p)[1][1] = vd_in.template get<0>(p)[1] - vd_in.template get<0>(p)[0];
+	vd_in.template get<0>(p)[0] += 0.01f;
+	vd_in.template get<0>(p)[1] += 0.01f;
+}
+
+// Benchmark driver: fills `in` on the host, copies it to the device and times 100 launches of
+// translate_fill_prop, printing the wall-clock time and an estimated bandwidth for every launch.
+int main(int argc, char *argv[])
+{
+    init_wrappers();
+
+    openfpm::vector_gpu<aggregate<float,float[2],float[2][2]>> out;
+    openfpm::vector_gpu<aggregate<float[2]>> in;
+
+    int nele = 16777216;
+    out.resize(nele);
+    in.resize(nele);
+
+    for (int i = 0 ; i < nele ; i++)
+    {
+        in.template get<0>(i)[0] = i;
+        in.template get<0>(i)[1] = i+100.0;
+    }
+    in.template hostToDevice<0>();  // the kernel reads the device copy of property 0
+
+    auto ite = out.getGPUIterator(256);
+    auto vout = out.toKernel();     // loop invariant: built once, outside the timed region
+    auto vin = in.toKernel();
+
+    for (int i = 0 ; i < 100 ; i++)
+    {
+        cudaDeviceSynchronize();    // drain pending work so that only this launch is timed
+        timer t;
+        t.start();
+        CUDA_LAUNCH(translate_fill_prop,ite,vout,vin);
+        cudaDeviceSynchronize();    // wait for the kernel to finish before stopping the timer
+        t.stop();
+
+        const double elapsed = t.getwct();
+        std::cout << "Time: " << elapsed << std::endl;
+        // 19 four-byte accesses per element are assumed (TODO confirm); double math avoids int overflow
+        std::cout << "BW: " << (double)nele*4*19 / elapsed * 1e-9 << " GB/s"  << std::endl;
+    }
+}
-- 
GitLab