diff --git a/example/Performance/miniBUDE/Makefile b/example/Performance/miniBUDE/Makefile index 7a1ecea66f950659d8f0c1b0973846e577dca282..f08386940b49300f35953c4a857f5ea7e8480048 100644 --- a/example/Performance/miniBUDE/Makefile +++ b/example/Performance/miniBUDE/Makefile @@ -4,68 +4,62 @@ include ../../example.mk ### internally the example disable with the preprocessor its code if not compiled with nvcc CUDA_CC= CUDA_CC_LINK= + +CC=mpic++ ifdef HIP CUDA_CC=hipcc - CUDA_OPTIONS=-D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0 + CUDA_OPTIONS= -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0 LIBS_SELECT=$(LIBS) CC=hipcc - CUDA_CC_LINK=hipcc + CUDA_CC_LINK=hipcc else ifdef CUDA_ON_CPU CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) INCLUDE_PATH_NVCC= CUDA_CC_LINK=mpic++ - CUDA_OPTIONS=-D__NVCC__ -DCUDART_VERSION=11000 -fopenmp -O3 -ffast-math -march=native -mavx - LIBS_SELECT=$(LIBS) + CUDA_OPTIONS=-DCUDA_ON_CPU -D__NVCC__ -DCUDART_VERSION=11000 + LIBS_SELECT=$(LIBS) else ifeq (, $(shell which nvcc)) CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) INCLUDE_PATH_NVCC= CUDA_CC_LINK=mpic++ - CUDA_OPTIONS= LIBS_SELECT=$(LIBS) else CUDA_CC=nvcc -ccbin=mpic++ CUDA_CC_LINK=nvcc -ccbin=mpic++ - CUDA_OPTIONS=-use_fast_math -arch=sm_61 -lineinfo LIBS_SELECT=$(LIBS_NVCC) endif endif endif - -ifeq ($(PROFILE),ON) - CUDA_CC=scorep --nocompiler --cuda --mpp=mpi nvcc -ccbin=mpic++ - CUDA_CC_LINK=scorep --nocompiler --cuda --mpp=mpi nvcc -ccbin=mpic++ -else - CUDA_CC:=$(CUDA_CC) - CUDA_CC_LINK:=$(CUDA_CC_LINK) -endif +CC=mpic++ LDIR = OPT= OBJ = main.o -miniBUDE: +sph_dlb: +sph_dlb_test: OPT += -DTEST_RUN +sph_dlb_test: sph_dlb %.o: %.cu - $(CUDA_CC) $(OPT) $(CUDA_OPTIONS) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) + $(CUDA_CC) $(CUDA_OPTIONS) $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) %.o: %.cpp - $(CC) $(OPT) -c 
--std=c++14 -o $@ $< $(INCLUDE_PATH) - -miniBUDE: $(OBJ) - $(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT) + $(CC) -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) +sph_dlb: $(OBJ) + $(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT) -all: miniBUDE +all: sph_dlb -run: miniBUDE - mpirun --oversubscribe -np 2 ./miniBUDE +run: sph_dlb_test + mpirun --oversubscribe -np 2 ./sph_dlb .PHONY: clean all run clean: - rm -f *.o *~ core miniBUDE + rm -f *.o *~ core sph_dlb diff --git a/example/Performance/miniBUDE/config.cfg b/example/Performance/miniBUDE/config.cfg new file mode 100644 index 0000000000000000000000000000000000000000..699be429e147cd40187be6ce345ef2f060f59fbc --- /dev/null +++ b/example/Performance/miniBUDE/config.cfg @@ -0,0 +1,2 @@ +[pack] +files = main.cu Makefile diff --git a/example/Performance/miniBUDE/main.cu b/example/Performance/miniBUDE/main.cu index 5ec7ddb6b55ed70388eb8975f613c3a5a35ec783..c092e680a9910efb9e5a77ba9a8dc3d70af03469 100644 --- a/example/Performance/miniBUDE/main.cu +++ b/example/Performance/miniBUDE/main.cu @@ -75,7 +75,7 @@ double getTimestamp() return tv.tv_usec + tv.tv_sec*1e6; } -void printTimings(double start, double end, double poses_per_wi) +void printTimings(double start, double end, double poses_per_wi, openfpm::vector<double> & gflops_data) { double ms = ((end-start)/params.iterations)*1e-3; @@ -88,6 +88,8 @@ void printTimings(double start, double end, double poses_per_wi) double flops = total_ops / runtime; double gflops = flops / 1e9; + gflops_data.add(gflops); + double interactions = (double)params.nposes * (double)params.natlig @@ -608,41 +610,45 @@ int main(int argc, char *argv[]) printf("Deck : %s\n", params.deckDir); float *resultsRef = (float *)malloc(params.nposes*sizeof(float)); - runCUDA(_openfpm); + // We run the benchmark 30 times to get mean and variance + for (int i = 0 ; i < 30 ; i++) + { + runCUDA(_openfpm); - // Load reference results from file - FILE* ref_energies = 
openFile(params.deckDir, FILE_REF_ENERGIES, "r", NULL); - size_t n_ref_poses = params.nposes; - if (params.nposes > REF_NPOSES) { - printf("Only validating the first %d poses.\n", REF_NPOSES); - n_ref_poses = REF_NPOSES; - } - for (size_t i = 0; i < n_ref_poses; i++) - fscanf(ref_energies, "%f", &resultsRef[i]); + // Load reference results from file + FILE* ref_energies = openFile(params.deckDir, FILE_REF_ENERGIES, "r", NULL); + size_t n_ref_poses = params.nposes; + if (params.nposes > REF_NPOSES) { + printf("Only validating the first %d poses.\n", REF_NPOSES); + n_ref_poses = REF_NPOSES; + } - fclose(ref_energies); + for (size_t i = 0; i < n_ref_poses; i++) + fscanf(ref_energies, "%f", &resultsRef[i]); - float maxdiff = -100.0f; - printf("\n Reference CUDA (diff)\n"); - for (int i = 0; i < n_ref_poses; i++) - { - if (fabs(resultsRef[i]) < 1.f && fabs(_openfpm.d_results.template get<0>(i)) < 1.f) continue; + fclose(ref_energies); - float diff = fabs(resultsRef[i] - _openfpm.d_results.template get<0>(i)) / _openfpm.d_results.template get<0>(i); - if (diff > maxdiff) { - maxdiff = diff; - // printf ("Maxdiff: %.2f (%.3f vs %.3f)\n", maxdiff, resultsRef[i], resultsCUDA[i]); - } + float maxdiff = -100.0f; + printf("\n Reference CUDA (diff)\n"); + for (int i = 0; i < n_ref_poses; i++) + { + if (fabs(resultsRef[i]) < 1.f && fabs(_openfpm.d_results.template get<0>(i)) < 1.f) continue; - if (i < 8) - printf("%7.2f vs %7.2f (%5.2f%%)\n", resultsRef[i], _openfpm.d_results.template get<0>(i), 100*diff); - } - printf("\nLargest difference was %.3f%%\n\n", maxdiff*100); + float diff = fabs(resultsRef[i] - _openfpm.d_results.template get<0>(i)) / _openfpm.d_results.template get<0>(i); + if (diff > maxdiff) { + maxdiff = diff; + // printf ("Maxdiff: %.2f (%.3f vs %.3f)\n", maxdiff, resultsRef[i], resultsCUDA[i]); + } - free(resultsRef); + if (i < 8) + printf("%7.2f vs %7.2f (%5.2f%%)\n", resultsRef[i], _openfpm.d_results.template get<0>(i), 100*diff); + } + printf("\nLargest 
difference was %.3f%%\n\n", maxdiff*100); + } + free(resultsRef); }