Skip to content
Snippets Groups Projects
Commit a2ce93f1 authored by Pietro Incardona's avatar Pietro Incardona
Browse files

Memory BW added

parent 682a215e
No related branches found
No related tags found
No related merge requests found
Pipeline #4013 passed
# Build script for the memory-bandwidth example (single translation unit: main.cu).
#
# Toolchain selection, in priority order:
#   1. HIP defined          -> everything is compiled and linked with hipcc
#   2. CUDA_ON_CPU defined  -> main.cu is compiled as plain C++ with mpic++
#   3. nvcc not in PATH     -> same plain C++ fallback as (2), without CUDA_OPTIONS
#   4. otherwise            -> nvcc with mpic++ as host compiler
#
# INCLUDE_PATH, INCLUDE_PATH_NVCC, LIBS, LIBS_NVCC, LIBS_PATH, CFLAGS and OPT are
# not defined here; presumably they come from example.mk or the environment.
include ../../example.mk

### This is a trick to avoid "Command not found" if you do not have the NVCC compiler. In practice the normal C++ compiler is used;
### internally the example disables its code with the preprocessor if not compiled with nvcc
CUDA_CC=
CUDA_CC_LINK=

# Default host compiler; the HIP branch below replaces it with hipcc.
CC=mpic++

ifdef HIP
    CUDA_CC=hipcc
    CUDA_OPTIONS= -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0
    LIBS_SELECT=$(LIBS)
    CC=hipcc
    CUDA_CC_LINK=hipcc
else
    ifdef CUDA_ON_CPU
        CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
        INCLUDE_PATH_NVCC=
        CUDA_CC_LINK=mpic++
        CUDA_OPTIONS=-D__NVCC__ -DCUDART_VERSION=11000
        LIBS_SELECT=$(LIBS)
    else
        ifeq (, $(shell which nvcc))
            CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH)
            INCLUDE_PATH_NVCC=
            CUDA_CC_LINK=mpic++
            LIBS_SELECT=$(LIBS)
        else
            CUDA_CC=nvcc -ccbin=mpic++
            CUDA_CC_LINK=nvcc -ccbin=mpic++
            LIBS_SELECT=$(LIBS_NVCC)
        endif
    endif
endif

# NOTE: CC is intentionally NOT reassigned after the conditional block, so that
# the CC=hipcc selected in the HIP branch is the one used by the %.cpp rule.

LDIR =

OBJ = main.o

# Declared first (without recipe) so that it becomes the default goal even
# though the pattern rules come before its real definition.
miniBUDE:

%.o: %.cu
	$(CUDA_CC) -g -O3 $(CUDA_OPTIONS) $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)

%.o: %.cpp
	$(CC) -g -O3 $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH)

miniBUDE: $(OBJ)
	$(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT)

all: miniBUDE

# Runs the benchmark on two (possibly oversubscribed) MPI ranks.
run: miniBUDE
	mpirun --oversubscribe -np 2 ./miniBUDE

.PHONY: clean all run

clean:
	rm -f *.o *~ core miniBUDE
#include "Vector/map_vector.hpp"
#include "util/stat/common_statistics.hpp"
/*! \brief Memory-bandwidth test kernel: one thread processes one element.
 *
 * For element p the kernel reads the two floats of property 0 of vd_in,
 * fills the scalar (property 0), the 2-vector (property 1) and the 2x2
 * tensor (property 2) of vd_out from them, and finally increments both
 * components of the input by 0.01f.
 *
 * Launch layout: 1D grid, 1D blocks; the element index is
 * blockIdx.x * blockDim.x + threadIdx.x. No shared memory is used.
 *
 * The arguments are taken BY VALUE: parameters of a __global__ function are
 * copied to the device at launch, whereas a reference would hand the kernel
 * the address of a host-side object.
 * NOTE(review): this assumes the objects returned by toKernel() are cheap,
 * copyable handles onto device storage -- confirm against the container docs.
 *
 * \param vd_out kernel-side view of the output vector (properties 0,1,2 written)
 * \param vd_in kernel-side view of the input vector (property 0 read and updated)
 */
template<typename vector_type, typename vector_type2>
__attribute__((always_inline)) inline __global__ void translate_fill_prop(vector_type vd_out, vector_type2 vd_in)
{
    auto p = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: the launch may contain more threads than elements.
    // NOTE(review): relies on the kernel-side vector exposing size() -- confirm.
    if (p >= vd_out.size())
    {return;}

    // The access sequence below is kept exactly as written (no caching of the
    // inputs in locals) so the memory traffic being measured does not change.
    vd_out.template get<0>(p) = vd_in.template get<0>(p)[0] + vd_in.template get<0>(p)[1];

    vd_out.template get<1>(p)[0] = vd_in.template get<0>(p)[0];
    vd_out.template get<1>(p)[1] = vd_in.template get<0>(p)[1];

    vd_out.template get<2>(p)[0][0] = vd_in.template get<0>(p)[0];
    vd_out.template get<2>(p)[0][1] = vd_in.template get<0>(p)[1];
    vd_out.template get<2>(p)[1][0] = vd_in.template get<0>(p)[0] + vd_in.template get<0>(p)[1];
    vd_out.template get<2>(p)[1][1] = vd_in.template get<0>(p)[1] - vd_in.template get<0>(p)[0];

    // Read-modify-write of the input so every launch also stores to vd_in
    vd_in.template get<0>(p)[0] += 0.01f;
    vd_in.template get<0>(p)[1] += 0.01f;
}
/*! \brief Memory-bandwidth benchmark driver.
 *
 * Allocates an input vector (one float[2] per element) and an output vector
 * (float, float[2], float[2][2] per element), fills the input on the host,
 * then launches translate_fill_prop 100 times. After each launch it prints
 * the wall-clock time and the resulting bandwidth in GB/s.
 *
 * \param argc unused
 * \param argv unused
 *
 * \return 0
 */
int main(int argc, char *argv[])
{
    init_wrappers();

    openfpm::vector_gpu<aggregate<float,float[2],float[2][2]>> out;
    openfpm::vector_gpu<aggregate<float[2]>> in;

    const int nele = 16777216;

    // Bytes accounted per launch: nele elements * sizeof(float) * 19 accesses.
    // The factor 19 is the per-element access count used by this benchmark's
    // accounting (kept unchanged). The product is evaluated in double because
    // in int arithmetic it already sits within a factor of two of INT_MAX.
    const double bytes_per_launch = static_cast<double>(nele) * sizeof(float) * 19;

    out.resize(nele);
    in.resize(nele);

    // Host-side initialization; bounded by nele rather than a repeated literal
    for (int i = 0 ; i < nele ; i++)
    {
        in.template get<0>(i)[0] = i;
        in.template get<0>(i)[1] = i + 100.0f;
    }

    // Upload the initialized property so the kernel does not read
    // uninitialized device memory. Done outside the timed region.
    // NOTE(review): assumes openfpm::vector_gpu provides hostToDevice<prp...>() -- confirm.
    in.template hostToDevice<0>();

    auto ite = out.getGPUIterator(256);

    for (int i = 0 ; i < 100 ; i++)
    {
        // Drain any pending work so the timer only sees this launch
        cudaDeviceSynchronize();
        timer t;
        t.start();

        auto vout = out.toKernel();
        auto vin = in.toKernel();

        CUDA_LAUNCH(translate_fill_prop,ite,vout,vin);

        // The launch is asynchronous: wait for completion before stopping the timer
        cudaDeviceSynchronize();
        t.stop();

        const double elapsed = t.getwct();

        std::cout << "Time: " << elapsed << std::endl;
        std::cout << "BW: " << bytes_per_launch / elapsed * 1e-9 << " GB/s" << std::endl;
    }

    return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment