diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index cfcd437516c382f8e0d51aaf2262cb3b5e4d4870..48fb17ac42c1e62d639fb062df3cb19682904b5b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,117 +1,119 @@ -# docker_centos_build: -# stage: build -# tags: -# - docker -# image: 'ubuntu:20.04' -# artifacts: -# paths: -# - ./build/src/pdata -# - ./build/openfpm_numerics/src/numerics -# - ./openfpm_numerics/test -# script: -# - apt-get update -# - DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata -# - apt-get -y install cmake wget git g++ gfortran python2 python-is-python3 -# - mkdir -p /root/openfpm_dependencies/openfpm_pdata/base -# - mkdir /root/.ssh && chmod 700 /root/.ssh -# - cp id_rsa.pub /root/.ssh/id_rsa.pub && chmod 644 /root/.ssh/id_rsa.pub -# - cp id_rsa /root/.ssh/id_rsa && chmod 600 /root/.ssh/id_rsa -# - ssh-keyscan -H git.mpi-cbg.de >> ~/.ssh/known_hosts -# - ./build.sh $CI_PROJECT_DIR unused pdata 0 base -# cache: -# key: $CI_COMMIT_REF_SLUG -# paths: -# - /root/openfpm_dependencies/openfpm_pdata/base +docker_centos_build: + stage: build + tags: + - ubuntu-docker + image: 'ubuntu:20.04' + artifacts: + paths: + - ./build/src/pdata + - ./build/openfpm_numerics/src/numerics + - ./openfpm_numerics/test + script: + - apt-get update + - DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata + - apt-get -y install cmake wget git g++ gfortran python2 python-is-python3 + - mkdir -p /root/openfpm_dependencies/openfpm_pdata/base + - mkdir /root/.ssh && chmod 700 /root/.ssh + - cp id_rsa.pub /root/.ssh/id_rsa.pub && chmod 644 /root/.ssh/id_rsa.pub + - cp id_rsa /root/.ssh/id_rsa && chmod 600 /root/.ssh/id_rsa + - ssh-keyscan -H git.mpi-cbg.de >> ~/.ssh/known_hosts + - pwd + - cat base/test_file +# - ./build.sh $CI_PROJECT_DIR unused pdata 0 base &> out + cache: + key: $CI_COMMIT_REF_SLUG + paths: + - base/ -centos_build: - stage: build - tags: - - centos - artifacts: - paths: - - ./build/src/pdata - - ./build/openfpm_numerics/src/numerics - - ./openfpm_numerics/test - script: - - ./build.sh $CI_PROJECT_DIR unused pdata 0 $CI_COMMIT_REF_NAME +#centos_build: +# stage: build +# tags: +# - centos +# artifacts: +# paths: +# - ./build/src/pdata +# - ./build/openfpm_numerics/src/numerics +# - ./openfpm_numerics/test +# script: +# - ./build.sh $CI_PROJECT_DIR unused pdata 0 $CI_COMMIT_REF_NAME -centos_run: - stage: test - tags: - - centos - dependencies: - - centos_build - script: - - export OMP_NUM_THREADS=1 - - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME - - ./run.sh $CI_PROJECT_DIR unused 2 pdata 0 $CI_COMMIT_REF_NAME - - ./run.sh $CI_PROJECT_DIR unused 3 pdata 0 $CI_COMMIT_REF_NAME - - export OMP_NUM_THREADS=8 - - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME - - export OMP_NUM_THREADS=1 - - cd openfpm_numerics - - ./run.sh $CI_PROJECT_DIR unused 1 0 0 numerics $CI_COMMIT_REF_NAME - - ./run.sh $CI_PROJECT_DIR unused 2 0 0 numerics $CI_COMMIT_REF_NAME - - ./run.sh $CI_PROJECT_DIR unused 3 0 0 numerics $CI_COMMIT_REF_NAME - - ./run.sh $CI_PROJECT_DIR unused 4 0 0 numerics $CI_COMMIT_REF_NAME +#centos_run: +# stage: test +# tags: +# - centos +# dependencies: +# - centos_build +# script: +# - export OMP_NUM_THREADS=1 +# - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME +# - ./run.sh $CI_PROJECT_DIR unused 2 pdata 0 $CI_COMMIT_REF_NAME +# - ./run.sh $CI_PROJECT_DIR unused 3 pdata 0 $CI_COMMIT_REF_NAME +# - export OMP_NUM_THREADS=8 +# - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME +# - export OMP_NUM_THREADS=1 +# - cd 
openfpm_numerics +# - ./run.sh $CI_PROJECT_DIR unused 1 0 0 numerics $CI_COMMIT_REF_NAME +# - ./run.sh $CI_PROJECT_DIR unused 2 0 0 numerics $CI_COMMIT_REF_NAME +# - ./run.sh $CI_PROJECT_DIR unused 3 0 0 numerics $CI_COMMIT_REF_NAME +# - ./run.sh $CI_PROJECT_DIR unused 4 0 0 numerics $CI_COMMIT_REF_NAME -mac_build: - variables: - GIT_STRATEGY: empty - stage: build - tags: - - mac - artifacts: - paths: - - ./build/src/pdata - - ./build/openfpm_numerics/src/numerics - - ./openfpm_numerics/test - script: - - ./build.sh $CI_PROJECT_DIR unused pdata 0 $CI_COMMIT_REF_NAME +#mac_build: +# variables: +# GIT_STRATEGY: empty +# stage: build +# tags: +# - mac +# artifacts: +# paths: +# - ./build/src/pdata +# - ./build/openfpm_numerics/src/numerics +# - ./openfpm_numerics/test +# script: +# - ./build.sh $CI_PROJECT_DIR unused pdata 0 $CI_COMMIT_REF_NAME -mac_run: - stage: test - tags: - - mac - dependencies: - - mac_build - script: - - ./run.sh $CI_PROJECT_DIR unused 1 0 0 $CI_COMMIT_REF_NAME - - ./run.sh $CI_PROJECT_DIR unused 2 0 0 $CI_COMMIT_REF_NAME - - cd openfpm_numerics - - ./run.sh $CI_PROJECT_DIR unused 2 0 0 numerics $CI_COMMIT_REF_NAME +#mac_run: +# stage: test +# tags: +# - mac +# dependencies: +# - mac_build +# script: +# - ./run.sh $CI_PROJECT_DIR unused 1 0 0 $CI_COMMIT_REF_NAME +# - ./run.sh $CI_PROJECT_DIR unused 2 0 0 $CI_COMMIT_REF_NAME +# - cd openfpm_numerics +# - ./run.sh $CI_PROJECT_DIR unused 2 0 0 numerics $CI_COMMIT_REF_NAME -ubuntu_build: - stage: build - tags: - - ubuntu - artifacts: - paths: - - ./build/src/pdata - - ./build/openfpm_numerics/src/numerics - - ./openfpm_numerics/test - script: - - ./build.sh $CI_PROJECT_DIR unused pdata 0 $CI_COMMIT_REF_NAME +#ubuntu_build: +# stage: build +# tags: +# - ubuntu +# artifacts: +# paths: +# - ./build/src/pdata +# - ./build/openfpm_numerics/src/numerics +# - ./openfpm_numerics/test +# script: +# - ./build.sh $CI_PROJECT_DIR unused pdata 0 $CI_COMMIT_REF_NAME -ubuntu_run: - stage: test - tags: - - ubuntu - dependencies: - - ubuntu_build - script: - - export OMP_NUM_THREADS=1 - - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME - - ./run.sh $CI_PROJECT_DIR unused 2 pdata 0 $CI_COMMIT_REF_NAME - - ./run.sh $CI_PROJECT_DIR unused 3 pdata 0 $CI_COMMIT_REF_NAME - - export OMP_NUM_THREADS=8 - - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME - - export OMP_NUM_THREADS=1 - - cd openfpm_numerics - - ./run.sh $CI_PROJECT_DIR unused 1 0 0 numerics $CI_COMMIT_REF_NAME - - ./run.sh $CI_PROJECT_DIR unused 2 0 0 numerics $CI_COMMIT_REF_NAME - - ./run.sh $CI_PROJECT_DIR unused 3 0 0 numerics $CI_COMMIT_REF_NAME - - ./run.sh $CI_PROJECT_DIR unused 4 0 0 numerics $CI_COMMIT_REF_NAME +#ubuntu_run: +# stage: test +# tags: +# - ubuntu +# dependencies: +# - ubuntu_build +# script: +# - export OMP_NUM_THREADS=1 +# - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME +# - ./run.sh $CI_PROJECT_DIR unused 2 pdata 0 $CI_COMMIT_REF_NAME +# - ./run.sh $CI_PROJECT_DIR unused 3 pdata 0 $CI_COMMIT_REF_NAME +# - export OMP_NUM_THREADS=8 +# - ./run.sh $CI_PROJECT_DIR unused 1 pdata 0 $CI_COMMIT_REF_NAME +# - export OMP_NUM_THREADS=1 +# - cd openfpm_numerics +# - ./run.sh $CI_PROJECT_DIR unused 1 0 0 numerics $CI_COMMIT_REF_NAME +# - ./run.sh $CI_PROJECT_DIR unused 2 0 0 numerics $CI_COMMIT_REF_NAME +# - ./run.sh $CI_PROJECT_DIR unused 3 0 0 numerics $CI_COMMIT_REF_NAME +# - ./run.sh $CI_PROJECT_DIR unused 4 0 0 numerics $CI_COMMIT_REF_NAME diff --git a/CMakeLists.txt b/CMakeLists.txt index 
8b9bc5364ff2113893e7647ad7a0046b48661815..415e0170da88763c6ca05687f8cacd152a037488 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,9 @@ find_package(LAPACK) find_package(Eigen3) find_package(SuiteSparse OPTIONAL_COMPONENTS UMFPACK) find_package(Vc) -find_package(OpenMP) +if (NOT CUDA_ON_BACKEND STREQUAL "HIP") + find_package(OpenMP) +endif() if (CUDA_ON_BACKEND STREQUAL "HIP" AND NOT HIP_FOUND) find_package(HIP) endif() diff --git a/example/Numerics/Sussman_redistancing/example_sussman_disk/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_disk/main.cpp index 3725674c4cb484200a33c24bdaea03c878735bee..68e4a4c471715b682c919048aafe63dd6ad21f0c 100644 --- a/example/Numerics/Sussman_redistancing/example_sussman_disk/main.cpp +++ b/example/Numerics/Sussman_redistancing/example_sussman_disk/main.cpp @@ -1,5 +1,5 @@ // -// Created by jstark on 2020-05-18. +// Created by jstark on 2020-05-18. Updated on 2022-01-05. // /** * @file example_sussman_disk/main.cpp @@ -125,6 +125,7 @@ //! @cond [Initialization and output folder] @endcond int main(int argc, char* argv[]) { + typedef double phi_type; // initialize library openfpm_init(&argc, &argv); //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -169,7 +170,7 @@ int main(int argc, char* argv[]) * * Define the grid size in terms of number of grid points per dimension * * Create a Box that defines our domain * * Create a Ghost object that will define the extension of the ghost part - * * Create a 2D grid with two properties of type double, one for the pre-redistancing Phi_initial and one, where + * * Create a 2D grid with two properties of type phi_type, one for the pre-redistancing Phi_initial and one, where * the post-redistancing Phi_SDF should be written to. * * Set some property names (optionally. These names show up when opening the grid vtk in Paraview.) * @@ -181,10 +182,10 @@ int main(int argc, char* argv[]) // Prop1: store the initial Phi; // Prop2: here the re-initialized Phi (signed distance function) will be written to in the re-distancing step size_t sz[grid_dim] = {128, 128}; // Grid size in terms of number of grid points per dimension - Box<grid_dim, double> box({0.0, 0.0}, {5.0, 5.0}); // 2D + Box<grid_dim, phi_type> box({0.0, 0.0}, {5.0, 5.0}); // 2D Ghost<grid_dim, long int> ghost(0); - typedef aggregate<double, double> props; - typedef grid_dist_id<grid_dim, double, props > grid_in_type; + typedef aggregate<phi_type, phi_type> props; + typedef grid_dist_id<grid_dim, phi_type, props > grid_in_type; grid_in_type g_dist(sz, box, ghost); g_dist.setPropNames({"Phi_0", "Phi_SDF"}); //! @cond [Grid creation] @endcond @@ -210,7 +211,7 @@ int main(int argc, char* argv[]) */ //! @cond [Get disk] @endcond // Now we initialize the grid with a disk. Outside the disk, the value of Phi_0 will be -1, inside +1. - double radius = 1.0; // Radius of the disk + phi_type radius = 1.0; // Radius of the disk init_grid_with_disk<Phi_0_grid>(g_dist, radius, 2.5, 2.5); // Initialize disk onto grid, centered at (2.5, 2.5) g_dist.write(path_output + "/grid_disk_preRedistancing_radius" + std::to_string((int)radius) , FORMAT_BINARY); // Save the disk as vtk file @@ -247,7 +248,8 @@ int main(int argc, char* argv[]) * w.r.t the previous iteration and the residual is printed (Default: false). 
* * \p print_steadyState_iter: If true, the number of the steady-state-iteration, the corresponding change * w.r.t the previous iteration and the residual is printed (Default: false). - * * \p save_temp_grid: If true, save the temporary grid as hdf5 that can be reloaded onto a grid (Default: false). + * * \p save_temp_grid: If true, save the temporary grid every interval_check_convergence as hdf5 that can be + * reloaded onto a grid (Default: false). * * * @snippet example/Numerics/Sussman_redistancing/example_sussman_disk/main.cpp Redistancing options @@ -256,7 +258,7 @@ int main(int argc, char* argv[]) //! @cond [Redistancing options] @endcond // Now we want to convert the initial Phi into a signed distance function (SDF) with magnitude of gradient = 1. // For the initial re-distancing we use the Sussman method. First of all, we can set some redistancing options. - Redist_options redist_options; + Redist_options<phi_type> redist_options; redist_options.min_iter = 1e3; redist_options.max_iter = 1e4; @@ -293,7 +295,8 @@ int main(int argc, char* argv[]) * @snippet example/Numerics/Sussman_redistancing/example_sussman_disk/main.cpp Run redistancing */ //! @cond [Run redistancing] @endcond - RedistancingSussman<grid_in_type> redist_obj(g_dist, redist_options); // Instantiation of Sussman-redistancing class + RedistancingSussman<grid_in_type, phi_type> redist_obj(g_dist, redist_options); // Instantiation of + // Sussman-redistancing class // std::cout << "dt = " << redist_obj.get_time_step() << std::endl; // Run the redistancing. In the <> brackets provide property-index where 1.) your initial Phi is stored and 2.) // where the resulting SDF should be written to. @@ -325,9 +328,9 @@ int main(int argc, char* argv[]) // Minimum is 1 property, to which the Phi_SDF can be written // In this example we chose 3 properties. The 1st for the Phi_SDF, the 2nd for the gradient of phi and the 3rd for // the magnitude of the gradient - typedef aggregate<double, double[grid_dim], double> props_nb; - typedef vector_dist<grid_dim, double, props_nb> vd_type; - Ghost<grid_dim, double> ghost_vd(0); + typedef aggregate<phi_type, phi_type[grid_dim], phi_type> props_nb; + typedef vector_dist<grid_dim, phi_type, props_nb> vd_type; + Ghost<grid_dim, phi_type> ghost_vd(0); vd_type vd_narrow_band(0, box, bc, ghost_vd); vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"}); //! @cond [Initialize narrow band] @endcond @@ -345,14 +348,15 @@ int main(int argc, char* argv[]) * belonged to the narrow band. * * Depending on the type of the variable which you define, the width can be either set in terms of number of - * grid points (size_t), physical width (double) or extension of narrow band as physical width inside of object - * and outside the object (double, double). + * grid points (size_t), physical width (phi_type) or extension of narrow band as physical width inside of object + * and outside the object (phi_type, phi_type). * * @snippet example/Numerics/Sussman_redistancing/example_sussman_disk/main.cpp Instantiate narrow band */ //! @cond [Instantiate narrow band] @endcond size_t thickness_of_narrowBand_in_grid_points = 6; - NarrowBand<grid_in_type> narrowBand(g_dist, thickness_of_narrowBand_in_grid_points); // Instantiation of NarrowBand class + NarrowBand<grid_in_type, phi_type> narrowBand(g_dist, thickness_of_narrowBand_in_grid_points); // Instantiation of + // NarrowBand class //! 
@cond [Instantiate narrow band] @endcond /** diff --git a/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp index d17017f826bf433da5ece3fe3a3656c0611d0ec5..0f29601447fd0dae63bd04d227936c5c023caf6b 100644 --- a/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp +++ b/example/Numerics/Sussman_redistancing/example_sussman_images_2D/main.cpp @@ -1,5 +1,5 @@ // -// Created by jstark on 2020-05-18. +// Created by jstark on 2020-05-18. Updated on 2022-01-05. // /** * @file example_sussman_images_2D/main.cpp @@ -77,8 +77,8 @@ int main(int argc, char* argv[]) // std::string image_name = "triangle"; // std::string image_name = "face"; std::string image_name = "dolphin"; - - + + typedef double phi_type; // initialize library openfpm_init(&argc, &argv); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -86,7 +86,7 @@ int main(int argc, char* argv[]) std::string cwd = get_cwd(); const std::string path_output = cwd + "/output_" + image_name; // const std::string path_output = cwd + "/output_face"; - + create_directory_if_not_exist(path_output); ////////////////////////////////////////////////////////////////////////////////////////////// @@ -95,7 +95,7 @@ int main(int argc, char* argv[]) const std::string path_input ="input/"; const std::string path_to_image = path_input + image_name + ".bin"; const std::string path_to_size = path_input + "size_" + image_name + ".csv"; - + /* * in case of 2D (single image): */ @@ -127,7 +127,7 @@ int main(int argc, char* argv[]) * */ //! @cond [Refinement] @endcond - const double refinement [] = {0.5, 0.5}; // (2D) factor by which grid should be finer / coarser as + const phi_type refinement [] = {0.5, 0.5}; // (2D) factor by which grid should be finer / coarser as // underlying image in each dimension (e.g. to get isotropic grid from anisotropic image resolution) //! @cond [Refinement] @endcond /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -161,7 +161,7 @@ int main(int argc, char* argv[]) // Array with grid dimensions after refinement. This size-array will be used for the grid creation. const size_t sz[grid_dim] = {(size_t)std::round(stack_size[x] * refinement[x]), (size_t)std::round(stack_size[y] * - refinement[y])}; // 2D + refinement[y])}; // 2D //! @cond [Size] @endcond // Here we create a 2D grid that stores 2 properties: // Prop1: store the initial Phi; @@ -180,24 +180,27 @@ int main(int argc, char* argv[]) * */ //! 
@cond [Redistancing] @endcond - Box<grid_dim, double> box({0.0, 0.0}, {5.0, 5.0}); // 2D + Box<grid_dim, phi_type> box({0.0, 0.0}, {5.0, 5.0}); // 2D Ghost<grid_dim, long int> ghost(0); - typedef aggregate<double, double> props; - typedef grid_dist_id<grid_dim, double, props > grid_in_type; + typedef aggregate<phi_type, phi_type> props; + typedef grid_dist_id<grid_dim, phi_type, props > grid_in_type; grid_in_type g_dist(sz, box, ghost); g_dist.setPropNames({"Phi_0", "Phi_SDF"}); + // initialize complete grid including ghost layer with -1 + init_grid_and_ghost<Phi_0_grid>(g_dist, -1.0); + // Now we can initialize the grid with the pixel values from the image stack load_pixel_onto_grid<Phi_0_grid>(g_dist, path_to_image, stack_size); g_dist.write(path_output + "/grid_from_images_initial", FORMAT_BINARY); // Save the initial grid as vtk file - - + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Now we want to convert the initial Phi into a signed distance function (SDF) with magnitude of gradient = 1 // For the initial re-distancing we use the Sussman method // 1.) Set some redistancing options (for details see example sussman disk or sphere) - Redist_options redist_options; + Redist_options<phi_type> redist_options; redist_options.min_iter = 1e3; redist_options.max_iter = 1e4; @@ -212,7 +215,8 @@ int main(int argc, char* argv[]) redist_options.print_steadyState_iter = true; redist_options.save_temp_grid = true; - RedistancingSussman<grid_in_type> redist_obj(g_dist, redist_options); // Instantiation of Sussman-redistancing class + RedistancingSussman<grid_in_type, phi_type> redist_obj(g_dist, redist_options); // Instantiation of Sussman-redistancing + // class // std::cout << "dt = " << redist_obj.get_time_step() << std::endl; // Run the redistancing. in the <> brackets provide property-index where 1.) your initial Phi is stored and 2.) where the resulting SDF should be written to. redist_obj.run_redistancing<Phi_0_grid, Phi_SDF_grid>(); @@ -226,13 +230,14 @@ int main(int argc, char* argv[]) // Minimum is 1 property, to which the Phi_SDF can be written // In this example we chose 3 properties. The 1st for the Phi_SDF, the 2nd for the gradient of phi and the 3rd for // the magnitude of the gradient - typedef aggregate<double, Point<grid_dim, double>, double> props_nb; - typedef vector_dist<grid_dim, double, props_nb> vd_type; - Ghost<grid_dim, double> ghost_vd(0); - vd_type vd_narrow_band(0, box, bc, ghost_vd); + typedef aggregate<phi_type, Point<grid_dim, phi_type>, phi_type> props_nb; + typedef vector_dist<grid_dim, phi_type, props_nb> vd_type; + Ghost<grid_dim, phi_type> ghost_vd(0); + vd_type vd_narrow_band(0, box, bc, ghost_vd); vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"}); - NarrowBand<grid_in_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of NarrowBand class + NarrowBand<grid_in_type, phi_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of + // NarrowBand class // Get the narrow band. You can decide, if you only want the Phi_SDF saved to your particles or // if you also want the gradients or gradients and magnitude of gradient. 
@@ -244,7 +249,7 @@ int main(int argc, char* argv[]) narrowBand.get_narrow_band<Phi_SDF_grid, Phi_SDF_vd, Phi_grad_vd, Phi_magnOfGrad_vd>(g_dist, vd_narrow_band); vd_narrow_band.write(path_output + "/vd_narrow_band_images", FORMAT_BINARY); // Save particles as vtk file - + openfpm_finalize(); // Finalize openFPM library return 0; } diff --git a/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp index cbec16e6fb67a3e1a9a0496795fe25d48b24a9d2..cb75ca91b292bb25d15e4efb1a0083868f669f53 100644 --- a/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp +++ b/example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp @@ -35,7 +35,7 @@ /** * @page example_sussman_images_3D Images 3D * - * ## Include ## {#e2d_img_include} + * ## Include ## {#e3d_img_include} * * These are the header files that we need to include: * @@ -55,7 +55,7 @@ /** * @page example_sussman_images_3D Images 3D * - * ## Initialization, indices and output folder ## {#e2d_img_init} + * ## Initialization, indices and output folder ## {#e3d_img_init} * * Again we start with * * Initializing OpenFPM @@ -74,6 +74,7 @@ //! @cond [Initialization] @endcond int main(int argc, char* argv[]) { + typedef double phi_type; // initialize library openfpm_init(&argc, &argv); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -108,7 +109,7 @@ int main(int argc, char* argv[]) /** * @page example_sussman_images_3D Images 3D * - * ## Set refinement factor ## {#e2d_img_refine} + * ## Set refinement factor ## {#e3d_img_refine} * * If we want that the grid has a different resolution as the image stack, we can set a refinement factor. In the * refinement array, we define by which factor the grid resolution should be changed w.r.t. the image stack @@ -120,8 +121,8 @@ int main(int argc, char* argv[]) * */ //! @cond [Refinement] @endcond - const double refinement [] = {1.0, 1.0, 1.0}; // without refinement -// const double refinement [] = {0.8, 1.5, 2.0}; // factors by which grid should be finer as underlying image stack in each dimension (e.g. to get isotropic grid from anisotropic stack resolution) + const phi_type refinement [] = {1.0, 1.0, 1.0}; // without refinement +// const phi_type refinement [] = {0.8, 1.5, 2.0}; // factors by which grid should be finer as underlying image stack in each dimension (e.g. to get isotropic grid from anisotropic stack resolution) //! @cond [Refinement] @endcond /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // read the stack size (number of pixel values per dimension) from a binary file @@ -129,7 +130,7 @@ int main(int argc, char* argv[]) /** * @page example_sussman_images_3D Images 3D * - * ## Get size from image_size.csv ## {#e2d_img_size} + * ## Get size from image_size.csv ## {#e3d_img_size} * * Here we read the stack size (number of pixel values per dimension) from a csv file. 
Alternatively, you can * directly define the stack size as: \p std::vector<size_t> \p stack_size \p {#pixels \p in \p x, \p #pixels \p in @@ -162,7 +163,7 @@ int main(int argc, char* argv[]) /** * @page example_sussman_images_3D Images 3D * - * ## Run Sussman redistancing and get narrow band ## {#e2d_img_redist} + * ## Run Sussman redistancing and get narrow band ## {#e3d_img_redist} * * Once we have loaded the geometrical object from the 3D stack onto the grid, we can perform Sussman * redistancing and get the narrow band the same way as it is explained in detail here: @ref @@ -173,14 +174,17 @@ int main(int argc, char* argv[]) * */ //! @cond [Redistancing] @endcond - Box<grid_dim, double> box({0.0, 0.0, 0.0}, {5.0, 5.0, 5.0}); // 3D + Box<grid_dim, phi_type> box({0.0, 0.0, 0.0}, {5.0, 5.0, 5.0}); // 3D Ghost<grid_dim, long int> ghost(0); - typedef aggregate<double, double> props; - typedef grid_dist_id<grid_dim, double, props > grid_in_type; + typedef aggregate<phi_type, phi_type> props; + typedef grid_dist_id<grid_dim, phi_type, props > grid_in_type; grid_in_type g_dist(sz, box, ghost); g_dist.setPropNames({"Phi_0", "Phi_SDF"}); + // initialize complete grid including ghost layer with -1 + init_grid_and_ghost<Phi_0_grid>(g_dist, -1.0); + // Now we can initialize the grid with the pixel values from the image stack load_pixel_onto_grid<Phi_0_grid>(g_dist, path_to_zstack, stack_size); g_dist.write(path_output + "/grid_from_images_initial", FORMAT_BINARY); // Save the initial grid as vtk file @@ -190,7 +194,7 @@ int main(int argc, char* argv[]) // Now we want to convert the initial Phi into a signed distance function (SDF) with magnitude of gradient = 1 // For the initial re-distancing we use the Sussman method // 1.) Set some redistancing options (for details see example sussman disk or sphere) - Redist_options redist_options; + Redist_options<phi_type> redist_options; redist_options.min_iter = 1e3; redist_options.max_iter = 1e4; @@ -204,7 +208,8 @@ int main(int argc, char* argv[]) redist_options.print_current_iterChangeResidual = true; redist_options.print_steadyState_iter = true; redist_options.save_temp_grid = true; - RedistancingSussman<grid_in_type> redist_obj(g_dist, redist_options); // Instantiation of Sussman-redistancing class + RedistancingSussman<grid_in_type, phi_type> redist_obj(g_dist, redist_options); // Instantiation of Sussman-redistancing + // class // std::cout << "dt = " << redist_obj.get_time_step() << std::endl; // Run the redistancing. in the <> brackets provide property-index where 1.) your initial Phi is stored and 2.) where the resulting SDF should be written to. redist_obj.run_redistancing<Phi_0_grid, Phi_SDF_grid>(); @@ -218,13 +223,14 @@ int main(int argc, char* argv[]) // Minimum is 1 property, to which the Phi_SDF can be written // In this example we chose 3 properties. 
The 1st for the Phi_SDF, the 2nd for the gradient of phi and the 3rd for // the magnitude of the gradient - typedef aggregate<double, Point<grid_dim, double>, double> props_nb; - typedef vector_dist<grid_dim, double, props_nb> vd_type; - Ghost<grid_dim, double> ghost_vd(0); + typedef aggregate<phi_type, Point<grid_dim, phi_type>, phi_type> props_nb; + typedef vector_dist<grid_dim, phi_type, props_nb> vd_type; + Ghost<grid_dim, phi_type> ghost_vd(0); vd_type vd_narrow_band(0, box, bc, ghost_vd); vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"}); - NarrowBand<grid_in_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of NarrowBand class + NarrowBand<grid_in_type, phi_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of + // NarrowBand class // Get the narrow band. You can decide, if you only want the Phi_SDF saved to your particles or // if you also want the gradients or gradients and magnitude of gradient. @@ -245,7 +251,7 @@ int main(int argc, char* argv[]) /** * @page example_sussman_images_3D Images 3D * - * ## Full code ## {#e2d_img_full} + * ## Full code ## {#e3d_img_full} * * @include example/Numerics/Sussman_redistancing/example_sussman_images_3D/main.cpp */ diff --git a/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp b/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp index 966094a693f1b01130c8fdd9a9a8303bcab4e1f2..68d9d8aebc8bf21c8464af79b89974ecfde004b8 100644 --- a/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp +++ b/example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp @@ -85,6 +85,7 @@ //! @cond [Initialization and output folder] @endcond int main(int argc, char* argv[]) { + typedef double phi_type; // initialize library openfpm_init(&argc, &argv); // Set current working directory, define output paths and create folders where output will be saved @@ -129,7 +130,7 @@ int main(int argc, char* argv[]) * * Define the grid size in terms of number of grid points per dimension * * Create a Box that defines our domain * * Create a Ghost object that will define the extension of the ghost part - * * Create a 3D grid with two properties of type double, one for the pre-redistancing Phi_initial and one, where + * * Create a 3D grid with two properties of type phi_type, one for the pre-redistancing Phi_initial and one, where * the post-redistancing Phi_SDF should be written to. * * Set some property names (optionally. These names show up when opening the grid vtk in Paraview.) * @@ -141,10 +142,10 @@ int main(int argc, char* argv[]) // Prop1: store the initial Phi; // Prop2: here the re-initialized Phi (signed distance function) will be written to in the re-distancing step const size_t sz[grid_dim] = {128, 128, 128}; - Box<grid_dim, double> box({0.0, 0.0, 0.0}, {10.0, 10.0, 10.0}); + Box<grid_dim, phi_type> box({0.0, 0.0, 0.0}, {10.0, 10.0, 10.0}); Ghost<grid_dim, long int> ghost(0); - typedef aggregate<double, double> props; - typedef grid_dist_id<grid_dim, double, props > grid_in_type; + typedef aggregate<phi_type, phi_type> props; + typedef grid_dist_id<grid_dim, phi_type, props > grid_in_type; grid_in_type g_dist(sz, box, ghost); g_dist.setPropNames({"Phi_0", "Phi_SDF"}); //! @cond [Grid creation] @endcond @@ -170,7 +171,7 @@ int main(int argc, char* argv[]) */ //! @cond [Get sphere] @endcond // Now we initialize the grid with a filled sphere. Outside the sphere, the value of Phi_0 will be -1, inside +1. 
- double radius = 1.0; // Radius of the sphere + phi_type radius = 1.0; // Radius of the sphere init_grid_with_sphere<Phi_0_grid>(g_dist, radius, 5, 5, 5); // Initialize sphere onto grid, centered at (5, 5, 5) g_dist.write(path_output + "/grid_initial_sphere_preRedistancing_radius" + std::to_string((int)radius) , FORMAT_BINARY); // Save the sphere as vtk file @@ -217,7 +218,7 @@ int main(int argc, char* argv[]) //! @cond [Redistancing options] @endcond // Now we want to convert the initial Phi into a signed distance function (SDF) with magnitude of gradient = 1. // For the initial re-distancing we use the Sussman method. First of all, we can set some redistancing options. - Redist_options redist_options; + Redist_options<phi_type> redist_options; redist_options.min_iter = 1e3; redist_options.max_iter = 1e4; @@ -253,7 +254,8 @@ int main(int argc, char* argv[]) * @snippet example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp Run redistancing */ //! @cond [Run redistancing] @endcond - RedistancingSussman<grid_in_type> redist_obj(g_dist, redist_options); // Instantiation of Sussman-redistancing class + RedistancingSussman<grid_in_type, phi_type> redist_obj(g_dist, redist_options); // Instantiation of + // Sussman-redistancing class // std::cout << "dt = " << redist_obj.get_time_step() << std::endl; // Run the redistancing. in the <> brackets provide property-index where 1.) your initial Phi is stored and 2.) // where the resulting SDF should be written to. @@ -285,9 +287,9 @@ int main(int argc, char* argv[]) // Minimum is 1 property, to which the Phi_SDF can be written // In this example we chose 3 properties. The 1st for the Phi_SDF, the 2nd for the gradient of phi and the 3rd for // the magnitude of the gradient - typedef aggregate<double, Point<grid_dim, double>, double> props_nb; - typedef vector_dist<grid_dim, double, props_nb> vd_type; - Ghost<grid_dim, double> ghost_vd(0); + typedef aggregate<phi_type, Point<grid_dim, phi_type>, phi_type> props_nb; + typedef vector_dist<grid_dim, phi_type, props_nb> vd_type; + Ghost<grid_dim, phi_type> ghost_vd(0); vd_type vd_narrow_band(0, box, bc, ghost_vd); vd_narrow_band.setPropNames({"Phi_SDF", "Phi_grad", "Phi_magnOfGrad"}); //! @cond [Initialize narrow band] @endcond @@ -305,14 +307,15 @@ int main(int argc, char* argv[]) * belonged to the narrow band. * * Depending on the type of the variable which you define, the width can be either set in terms of number of - * grid points (size_t), physical width (double) or extension of narrow band as physical width inside of object - * and outside the object (double, double). + * grid points (size_t), physical width (phi_type) or extension of narrow band as physical width inside of object + * and outside the object (phi_type, phi_type). * * @snippet example/Numerics/Sussman_redistancing/example_sussman_sphere/main.cpp Instantiate narrow band */ //! @cond [Instantiate narrow band] @endcond size_t thickness_of_narrowBand_in_grid_points = 6; - NarrowBand<grid_in_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of NarrowBand class + NarrowBand<grid_in_type, phi_type> narrowBand(g_dist, redist_options.width_NB_in_grid_points); // Instantiation of + // NarrowBand class //! 
@cond [Instantiate narrow band] @endcond diff --git a/example/Performance/memBW/Makefile b/example/Performance/memBW/Makefile index 869ea6a98e15487f863e86f4f3504a1d54f32c01..e78251aa5bc7a11f8d382e013340e0dbdf8c8ffc 100644 --- a/example/Performance/memBW/Makefile +++ b/example/Performance/memBW/Makefile @@ -53,7 +53,7 @@ memBW: $(OBJ) all: memBW run: memBW - mpirun --oversubscribe -np 2 ./memBW + mpirun --oversubscribe -np 1 ./memBW .PHONY: clean all run diff --git a/example/Performance/memBW/main.cu b/example/Performance/memBW/main.cu index eb0b8c5ef0685df55efc8425163054e00b3d9ac8..ffcf93ae0fad1d97a312801b28af7f3e969abc0b 100644 --- a/example/Performance/memBW/main.cu +++ b/example/Performance/memBW/main.cu @@ -154,7 +154,7 @@ int main(int argc, char *argv[]) openfpm::vector<double> res; res.resize(100); - for (int i = 0 ; i < 110 ; i++) +/* for (int i = 0 ; i < 110 ; i++) { cudaDeviceSynchronize(); timer t; @@ -206,7 +206,7 @@ int main(int argc, char *argv[]) double dev_read_tls = 0.0; standard_deviation(res,mean_read_tls,dev_read_tls); - check_read(in,out); + check_read(in,out);*/ ////////////// @@ -407,6 +407,52 @@ int main(int argc, char *argv[]) check_read(in,out); + /////// BASE 1 core + + for (int i = 0 ; i < 110 ; i++) + { + timer t; + t.start(); + + float * out_s = (float *)out.getDeviceBuffer<0>(); + float * out_v = (float *)out.getDeviceBuffer<1>(); + float * out_m = (float *)out.getDeviceBuffer<2>(); + float * in_v = (float *)in.getDeviceBuffer<0>(); + + int stride = out.capacity(); + + auto lamb_arr_red = [out_s,out_v,out_m,in_v,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx) + { + auto p = blockIdx.x * blockDim.x + threadIdx.x; + + float a = out_s[p]; + + float b = out_v[p + 0*stride]; + float c = out_v[p + 1*stride]; + + float d = out_m[p + 0*2*stride + 0*stride]; + float e = out_m[p + 0*2*stride + 1*stride]; + float f = out_m[p + 1*2*stride + 0*stride]; + float g = out_m[p + 1*2*stride + 1*stride]; + + float h = in_v[p + 0*stride]; + in_v[p + 1*stride] = a+b+c+d+e+f+g+h; + }; + + for (int i = 0 ; i < N ; i++) + { + lamb_arr_red(i); + } + + t.stop(); + + if (i >=10) + {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;} + + std::cout << "Time ARR: " << t.getwct() << std::endl; + std::cout << "BW 1-CORE ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl; + } + /////////////////// #ifdef CUDIFY_USE_CUDA @@ -441,8 +487,8 @@ int main(int argc, char *argv[]) #endif - std::cout << "Average READ with TLS: " << mean_read_tls << " deviation: " << dev_read_tls << std::endl; - std::cout << "Average WRITE with TLS: " << mean_write_tls << " deviation: " << dev_write_tls << std::endl; +// std::cout << "Average READ with TLS: " << mean_read_tls << " deviation: " << dev_read_tls << std::endl; +// std::cout << "Average WRITE with TLS: " << mean_write_tls << " deviation: " << dev_write_tls << std::endl; std::cout << "Average READ with lamb: " << mean_read_lamb << " deviation: " << dev_read_lamb << std::endl; std::cout << "Average WRITE with lamb: " << mean_write_lamb << " deviation: " << dev_write_lamb << std::endl; diff --git a/example/Performance/memBW_multi/Makefile b/example/Performance/memBW_multi/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..0bc4b91aeba230d1d4010000a0ca2b0995086cb7 --- /dev/null +++ b/example/Performance/memBW_multi/Makefile @@ -0,0 +1,62 @@ +include ../../example.mk + +### This is a trick to avoid "Command not found if you no not have NVCC compiler". 
In practice the normal C++ compiler is used +### internally the example disable with the preprocessor its code if not compiled with nvcc +CUDA_CC= +CUDA_CC_LINK= + +CC=mpic++ +ifdef HIP + CUDA_CC=hipcc + CUDA_OPTIONS= -D__NVCC__ -D__HIP__ -DCUDART_VERSION=11000 -D__CUDACC__ -D__CUDACC_VER_MAJOR__=11 -D__CUDACC_VER_MINOR__=0 -D__CUDACC_VER_BUILD__=0 + LIBS_SELECT=$(LIBS) + CC=hipcc + CUDA_CC_LINK=hipcc +else + ifdef CUDA_ON_CPU + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + CUDA_OPTIONS=-D__NVCC__ -DCUDART_VERSION=11000 + LIBS_SELECT=$(LIBS) + else + ifeq (, $(shell which nvcc)) + CUDA_CC=mpic++ -x c++ $(INCLUDE_PATH) + INCLUDE_PATH_NVCC= + CUDA_CC_LINK=mpic++ + LIBS_SELECT=$(LIBS) + else + CUDA_CC=nvcc -ccbin=mpic++ + CUDA_CC_LINK=nvcc -ccbin=mpic++ + LIBS_SELECT=$(LIBS_NVCC) + endif + endif +endif + +CC=mpic++ + +LDIR = + +OBJ = main.o + +memBW: + +%.o: %.cu + $(CUDA_CC) -O3 -g $(CUDA_OPTIONS) $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC) + +%.o: %.cpp + $(CC) -g -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH) + +memBW: $(OBJ) + $(CUDA_CC_LINK) -o $@ $^ $(CFLAGS) $(LIBS_PATH) $(LIBS_SELECT) + +all: memBW + +run: memBW + mpirun --oversubscribe -np 2 ./memBW + +.PHONY: clean all run + +clean: + rm -f *.o *~ core memBW + diff --git a/example/Performance/memBW_multi/config.cfg b/example/Performance/memBW_multi/config.cfg new file mode 100644 index 0000000000000000000000000000000000000000..699be429e147cd40187be6ce345ef2f060f59fbc --- /dev/null +++ b/example/Performance/memBW_multi/config.cfg @@ -0,0 +1,2 @@ +[pack] +files = main.cu Makefile diff --git a/example/Performance/memBW_multi/main.cu b/example/Performance/memBW_multi/main.cu new file mode 100644 index 0000000000000000000000000000000000000000..7669858bb4d7a1dc49203a8d664e9f13ac453349 --- /dev/null +++ b/example/Performance/memBW_multi/main.cu @@ -0,0 +1,516 @@ +#ifdef __NVCC__ + +#include "Vector/map_vector.hpp" +#include "util/stat/common_statistics.hpp" + +#define NELEMENTS 67108864 + +//! 
Memory bandwidth with small calculations +template<typename vector_type, typename vector_type2> +__global__ void translate_fill_prop_write(vector_type vd_out, vector_type2 vd_in) +{ + grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y, + blockIdx.z * blockDim.z + threadIdx.z}); + + float a = vd_in.template get<0>(p)[0]; + + vd_out.template get<0>(p) = a; + + vd_out.template get<1>(p)[0] = a; + vd_out.template get<1>(p)[1] = a; + + vd_out.template get<2>(p)[0][0] = a; + vd_out.template get<2>(p)[0][1] = a; + vd_out.template get<2>(p)[1][0] = a; + vd_out.template get<2>(p)[1][1] = a; + vd_in.template get<0>(p)[1] = a; +} + +template<typename vector_type, typename vector_type2> +__global__ void translate_fill_prop_read(vector_type vd_out, vector_type2 vd_in) +{ + grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y, + blockIdx.z * blockDim.z + threadIdx.z}); + + float a = vd_out.template get<0>(p); + + float b = vd_out.template get<1>(p)[0]; + float c = vd_out.template get<1>(p)[1]; + + float d = vd_out.template get<2>(p)[0][0]; + float e = vd_out.template get<2>(p)[0][1]; + float f = vd_out.template get<2>(p)[1][0]; + float g = vd_out.template get<2>(p)[1][1]; + + float h = vd_in.template get<0>(p)[0]; + vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h; +} + + +template<typename in_type, typename out_type> +void check_write(in_type & in, out_type & out) +{ + out.template deviceToHost<0,1,2>(); + in.template deviceToHost<0>(); + + bool success = true; + auto it = in.getIterator(); + while (it.isNext()) + { + auto i = it.get(); + + float a = in.template get<0>(i)[0]; + + if (i.get(0) == 2 && i.get(1) == 2 && i.get(2) == 2) + { + success &= a != 0; + } + + success &= out.template get<0>(i) == a; + + success &= out.template get<1>(i)[0] == a; + success &= out.template get<1>(i)[1] == a; + + success &= out.template get<2>(i)[0][0] == a; + success &= out.template get<2>(i)[0][1] == a; + success &= out.template get<2>(i)[1][0] == a; + success &= out.template get<2>(i)[1][1] == a; + + success &= in.template get<0>(i)[1] == a; + + if (success == false) + { + std::cout << "FAIL " << a << " " << i.to_string() << " " << out.template get<0>(i) << " " << out.template get<1>(i)[0] + << out.template get<1>(i)[1] << " " << out.template get<2>(i)[0][0] + << out.template get<2>(i)[0][1] << " " << out.template get<2>(i)[1][0] + << out.template get<2>(i)[1][0] << " " << out.template get<2>(i)[1][1] + << std::endl; + break; + } + + ++it; + } + + if (success == false) + { + std::cout << "FAIL WRITE" << std::endl; + exit(1); + } +} + +template<typename in_type, typename out_type> +void check_read(in_type & in, out_type & out) +{ + out.template deviceToHost<0,1,2>(); + in.template deviceToHost<0>(); + + bool success = true; + auto it = in.getIterator(); + while (it.isNext()) + { + auto i = it.get(); + + float a = out.template get<0>(i); + + if (i.get(0) == 2 && i.get(1) == 2 && i.get(2) == 2) + { + success &= a != 0; + } + + float b = out.template get<1>(i)[0]; + float c = out.template get<1>(i)[1]; + + float d = out.template get<2>(i)[0][0]; + float e = out.template get<2>(i)[0][1]; + float f = out.template get<2>(i)[1][0]; + float g = out.template get<2>(i)[1][1]; + + float h = in.template get<0>(i)[0]; + + success &= in.template get<0>(i)[1] == (a+b+c+d+e+f+g+h); + + if (success == false) + { + std::cout << "FAIL READ " << i.to_string() << " " << in.template get<0>(i)[1] << " != " << a+b+c+d+e+f+g+h << std::endl; + 
exit(1); + } + + ++it; + } +} + +template<typename vector_type, typename vector_type2> +__global__ void initialize_buff(vector_type vd_out, vector_type2 vd_in) +{ + grid_key_dx<3,int> i({blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y, + blockIdx.z * blockDim.z + threadIdx.z}); + + vd_in.template get<0>(i)[0] = i.get(0) + i.get(1) + i.get(2); + vd_in.template get<0>(i)[1] = i.get(0) + i.get(1) + i.get(2)+100.0; + + vd_out.template get<0>(i) = i.get(0) + i.get(1) + i.get(2)+200.0; + + vd_out.template get<1>(i)[0] = i.get(0) + i.get(1) + i.get(2); + vd_out.template get<1>(i)[1] = i.get(0) + i.get(1) + i.get(2)+100.0; + + vd_out.template get<2>(i)[0][0] = i.get(0) + i.get(1) + i.get(2); + vd_out.template get<2>(i)[0][1] = i.get(0) + i.get(1) + i.get(2)+100.0; + vd_out.template get<2>(i)[1][0] = i.get(0) + i.get(1) + i.get(2)+200.0; + vd_out.template get<2>(i)[1][1] = i.get(0) + i.get(1) + i.get(2)+300.0; +} + +template<typename vin_type, typename vout_type> +void initialize_buf(vin_type & in, vout_type & out) +{ + auto ite = out.getGPUIterator({0,0,0},{511,511,255}); + CUDA_LAUNCH(initialize_buff,ite,out.toKernel(),in.toKernel()); +} + +int main(int argc, char *argv[]) +{ + init_wrappers(); + + grid_gpu<3,aggregate<float,float[2],float[2][2]>> out; + grid_gpu<3,aggregate<float[2]>> in; + + int nele = NELEMENTS; + + size_t sz[3] = {512,512,256}; + out.resize(sz); + in.resize(sz); + + out.setMemory(); + in.setMemory(); + + initialize_buf(in,out); + + // Read write test with TLS + + auto ite = out.getGPUIterator({0,0,0},{511,511,255}); + + openfpm::vector<double> res; + res.resize(100); + + for (int i = 0 ; i < 110 ; i++) + { + cudaDeviceSynchronize(); + timer t; + t.start(); + + + CUDA_LAUNCH(translate_fill_prop_write,ite,out.toKernel(),in.toKernel()); + + cudaDeviceSynchronize(); + + t.stop(); + + if (i >=10) + {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;} + + std::cout << "Time: " << t.getwct() << std::endl; + std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl; + } + + double mean_write_tls = 0.0; + double dev_write_tls = 0.0; + standard_deviation(res,mean_write_tls,dev_write_tls); + + check_write(in,out); + + initialize_buf(in,out); + + for (int i = 0 ; i < 110 ; i++) + { + cudaDeviceSynchronize(); + timer t; + t.start(); + + + CUDA_LAUNCH(translate_fill_prop_read,ite,out.toKernel(),in.toKernel()); + + cudaDeviceSynchronize(); + + t.stop(); + + if (i >=10) + {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;} + + std::cout << "Time: " << t.getwct() << std::endl; + std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl; + } + + double mean_read_tls = 0.0; + double dev_read_tls = 0.0; + standard_deviation(res,mean_read_tls,dev_read_tls); + + check_read(in,out); + + ////////////// + + /////////////////////////////////////////// LAMBDA ////////////////////////////////////////// + + initialize_buf(in,out); + + for (int i = 0 ; i < 110 ; i++) + { + cudaDeviceSynchronize(); + timer t; + t.start(); + + auto vd_out = out.toKernel(); + auto vd_in = in.toKernel(); + + auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx) + { + grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y, + blockIdx.z * blockDim.z + threadIdx.z}); + + float a = vd_in.template get<0>(p)[0]; + + vd_out.template get<0>(p) = a; + + vd_out.template get<1>(p)[0] = a; + vd_out.template get<1>(p)[1] = a; + + vd_out.template get<2>(p)[0][0] = a; + vd_out.template 
get<2>(p)[0][1] = a; + vd_out.template get<2>(p)[1][0] = a; + vd_out.template get<2>(p)[1][1] = a; + vd_in.template get<0>(p)[1] = a; + }; + + CUDA_LAUNCH_LAMBDA(ite, lamb); + + cudaDeviceSynchronize(); + + t.stop(); + + if (i >=10) + {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;} + + std::cout << "Time: " << t.getwct() << std::endl; + std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl; + } + + double mean_write_lamb = 0.0; + double dev_write_lamb = 0.0; + standard_deviation(res,mean_write_lamb,dev_write_lamb); + + initialize_buf(in,out); + + for (int i = 0 ; i < 110 ; i++) + { + cudaDeviceSynchronize(); + timer t; + t.start(); + + + auto vd_out = out.toKernel(); + auto vd_in = in.toKernel(); + + auto lamb = [vd_out,vd_in] __device__ (dim3 & blockIdx, dim3 & threadIdx) + { + grid_key_dx<3,int> p({blockIdx.x * blockDim.x + threadIdx.x, + blockIdx.y * blockDim.y + threadIdx.y, + blockIdx.z * blockDim.z + threadIdx.z}); + + float a = vd_out.template get<0>(p); + + float b = vd_out.template get<1>(p)[0]; + float c = vd_out.template get<1>(p)[1]; + + float d = vd_out.template get<2>(p)[0][0]; + float e = vd_out.template get<2>(p)[0][1]; + float f = vd_out.template get<2>(p)[1][0]; + float g = vd_out.template get<2>(p)[1][1]; + + float h = vd_in.template get<0>(p)[0]; + vd_in.template get<0>(p)[1] = a+b+c+d+e+f+g+h; + }; + + CUDA_LAUNCH_LAMBDA(ite, lamb); + + cudaDeviceSynchronize(); + + t.stop(); + + if (i >=10) + {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;} + + std::cout << "Time: " << t.getwct() << std::endl; + std::cout << "BW: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl; + } + + double mean_read_lamb = 0.0; + double dev_read_lamb = 0.0; + standard_deviation(res,mean_read_lamb,dev_read_lamb); + + // Array benchmark + initialize_buf(in,out); + + for (int i = 0 ; i < 110 ; i++) + { + cudaDeviceSynchronize(); + timer t; + t.start(); + + float * out_s = (float *)out.getDeviceBuffer<0>(); + float * out_v = (float *)out.getDeviceBuffer<1>(); + float * out_m = (float *)out.getDeviceBuffer<2>(); + float * in_v = (float *)in.getDeviceBuffer<0>(); + + int sz0 = sz[0]; + int sz1 = sz[1]; + int sz2 = sz[2]; + int stride = out.size(); + + auto lamb_arr_write = [out_s,out_v,out_m,in_v,sz0,sz1,sz2,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx) + { + auto p1 = blockIdx.x * blockDim.x + threadIdx.x; + auto p2 = blockIdx.y * blockDim.y + threadIdx.y; + auto p3 = blockIdx.z * blockDim.z + threadIdx.z; + + float a = in_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride]; + + out_s[p1 + p2*sz0 + p3*sz0*sz1] = a; + + out_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride] = a; + out_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride] = a; + + out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 0*stride ] = a; + out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 1*stride ] = a; + out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 0*stride ] = a; + out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 1*stride ] = a; + in_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride] = a; + }; + + CUDA_LAUNCH_LAMBDA(ite,lamb_arr_write); + + cudaDeviceSynchronize(); + + t.stop(); + + if (i >=10) + {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;} + + std::cout << "Time ARR: " << t.getwct() << std::endl; + std::cout << "BW ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl; + } + + double mean_write_arr = 0.0; + double dev_write_arr = 0.0; + standard_deviation(res,mean_write_arr,dev_write_arr); + + check_write(in,out); + + for (int i = 0 ; i < 110 ; i++) + { + cudaDeviceSynchronize(); + timer t; 
+ t.start(); + + float * out_s = (float *)out.getDeviceBuffer<0>(); + float * out_v = (float *)out.getDeviceBuffer<1>(); + float * out_m = (float *)out.getDeviceBuffer<2>(); + float * in_v = (float *)in.getDeviceBuffer<0>(); + + int sz0 = sz[0]; + int sz1 = sz[1]; + int sz2 = sz[2]; + int stride = out.size(); + + auto lamb_arr_red = [out_s,out_v,out_m,in_v,sz0,sz1,sz2,stride] __device__ (dim3 & blockIdx, dim3 & threadIdx) + { + auto p1 = blockIdx.x * blockDim.x + threadIdx.x; + auto p2 = blockIdx.y * blockDim.y + threadIdx.y; + auto p3 = blockIdx.z * blockDim.z + threadIdx.z; + + float a = out_s[p1 + p2*sz0 + p3*sz0*sz1]; + + float b = out_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride]; + float c = out_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride]; + + float d = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 0*stride]; + float e = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 0*2*stride + 1*stride]; + float f = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 0*stride]; + float g = out_m[p1 + p2*sz0 + p3*sz0*sz1 + 1*2*stride + 1*stride]; + + float h = in_v[p1 + p2*sz0 + p3*sz0*sz1 + 0*stride]; + in_v[p1 + p2*sz0 + p3*sz0*sz1 + 1*stride] = a+b+c+d+e+f+g+h; + }; + + CUDA_LAUNCH_LAMBDA(ite,lamb_arr_red); + + cudaDeviceSynchronize(); + + t.stop(); + + if (i >=10) + {res.get(i-10) = (double)nele*4*9 / t.getwct() * 1e-9;} + + std::cout << "Time ARR: " << t.getwct() << std::endl; + std::cout << "BW ARR: " << (double)nele*4*9 / t.getwct() * 1e-9 << " GB/s" << std::endl; + } + + double mean_read_arr = 0.0; + double dev_read_arr = 0.0; + standard_deviation(res,mean_read_arr,dev_read_arr); + + check_read(in,out); + + /////////////////// + + #ifdef CUDIFY_USE_CUDA + + for (int i = 0 ; i < 110 ; i++) + { + cudaDeviceSynchronize(); + timer t; + t.start(); + + float * a = (float *)in.getDeviceBuffer<0>(); + float * b = (float *)out.getDeviceBuffer<1>(); + + cudaMemcpy(a,b,2*NELEMENTS*4,cudaMemcpyDeviceToDevice); + + cudaDeviceSynchronize(); + + t.stop(); + + if (i >=10) + {res.get(i-10) = (double)nele*4*4 / t.getwct() * 1e-9;} + + std::cout << "Time: " << t.getwct() << std::endl; + std::cout << "BW: " << (double)nele*4*4 / t.getwct() * 1e-9 << " GB/s" << std::endl; + } + + double mean_read_mes = 0.0; + double dev_read_mes = 0.0; + standard_deviation(res,mean_read_mes,dev_read_mes); + + std::cout << "Average measured: " << mean_read_mes << " deviation: " << dev_read_mes << std::endl; + + #endif + + std::cout << "Average READ with TLS: " << mean_read_tls << " deviation: " << dev_read_tls << std::endl; + std::cout << "Average WRITE with TLS: " << mean_write_tls << " deviation: " << dev_write_tls << std::endl; + + std::cout << "Average READ with lamb: " << mean_read_lamb << " deviation: " << dev_read_lamb << std::endl; + std::cout << "Average WRITE with lamb: " << mean_write_lamb << " deviation: " << dev_write_lamb << std::endl; + + std::cout << "Average WRITE with array: " << mean_write_arr << " deviation: " << dev_write_arr << std::endl; + std::cout << "Average READ with array: " << mean_read_arr << " deviation: " << dev_read_arr << std::endl; +} + +#else + +int main(int argc, char *argv[]) +{ +} + +#endif + diff --git a/example/Performance/miniBUDE/main.cu b/example/Performance/miniBUDE/main.cu index d845ef21f395f59082f687ca046e392cfb301b3c..9bc27e11716006d35300e627d5871cc69c013b92 100644 --- a/example/Performance/miniBUDE/main.cu +++ b/example/Performance/miniBUDE/main.cu @@ -606,7 +606,7 @@ void loadParameters(int argc, char *argv[], OpenFPM & _openfpm) fclose(file); } -#ifndef __APPLE__ +#if !defined(__APPLE__) && 
!defined(__powerpc64__) #include <fenv.h> #include <xmmintrin.h> #include <pmmintrin.h> @@ -614,7 +614,7 @@ void loadParameters(int argc, char *argv[], OpenFPM & _openfpm) int main(int argc, char *argv[]) { -#ifndef __APPLE__ +#if !defined(__APPLE__) && !defined(__powerpc64__) _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); #endif diff --git a/example/Vector/0_simple/main.cpp b/example/Vector/0_simple/main.cpp index cb4059b2b9e83058adb008c589b98370743db236..c7f292a091c18ae83fce9b01d196fa0db6a138eb 100644 --- a/example/Vector/0_simple/main.cpp +++ b/example/Vector/0_simple/main.cpp @@ -65,9 +65,12 @@ #include "Vector/vector_dist.hpp" //! \cond [inclusion] \endcond + int main(int argc, char* argv[]) { + std::cout << is_layout_inte<memory_traits_custom_separation_of_buffer<aggregate<int>>>::value << std::endl; + /*! * \page Vector_0_simple Vector 0 simple * diff --git a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu index bb4757617af0646f583365875fea074239a3f32a..181470d623c1dc2cb7efbd777a557c247a21e9e8 100644 --- a/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu +++ b/example/Vector/7_SPH_dlb_gpu_more_opt/main.cu @@ -987,7 +987,7 @@ int main(int argc, char* argv[]) { Vcluster<> & v_cl = create_vcluster(); timer it_time; - + it_time.start(); ////// Do rebalancing every 200 timesteps it_reb++; diff --git a/openfpm_io b/openfpm_io index 3f01780ef8e9772b70db41f38df7ff8d97fc2b6b..037e4db6149cee2a40e647ddfc93f5505a52721f 160000 --- a/openfpm_io +++ b/openfpm_io @@ -1 +1 @@ -Subproject commit 3f01780ef8e9772b70db41f38df7ff8d97fc2b6b +Subproject commit 037e4db6149cee2a40e647ddfc93f5505a52721f diff --git a/src/Decomposition/CartDecomposition.hpp b/src/Decomposition/CartDecomposition.hpp index 7c52034ff6e7b6af27fa65cd5be945b8938d8e04..e639816c6515ecc0013cbb450ec0affb763aa3b7 100644 --- a/src/Decomposition/CartDecomposition.hpp +++ b/src/Decomposition/CartDecomposition.hpp @@ -157,6 +157,12 @@ public: //! This class admit a class defined on an extended domain typedef CartDecomposition_ext<dim,T,Memory,layout_base,Distribution> extended_type; + typedef ie_loc_ghost<dim, T,layout_base, Memory> ie_loc_ghost_type; + + typedef nn_prcs<dim, T,layout_base,Memory> nn_prcs_type; + + typedef ie_ghost<dim,T,Memory,layout_base> ie_ghost_type; + protected: //! bool that indicate whenever the buffer has been already transfer to device @@ -1016,6 +1022,47 @@ public: return *this; } + /*! 
\brief Copy the element + * + * \param cart element to copy + * + * \return itself + * + */ + template<typename CartDecomposition2> + CartDecomposition<dim,T,Memory, layout_base, Distribution> & operator=(const CartDecomposition2 & cart) + { + static_cast<ie_loc_ghost<dim,T,layout_base,Memory>*>(this)->operator=(static_cast<typename CartDecomposition2::ie_loc_ghost_type>(cart)); + static_cast<nn_prcs<dim,T,layout_base,Memory>*>(this)->operator=(static_cast<typename CartDecomposition2::nn_prcs_type>(cart)); + static_cast<ie_ghost<dim,T,Memory,layout_base>*>(this)->operator=(static_cast<typename CartDecomposition2::ie_ghost_type>(cart)); + + sub_domains = cart.private_get_sub_domains(); + box_nn_processor = cart.private_get_box_nn_processor(); + fine_s = cart.private_get_fine_s(); + gr = cart.private_get_gr(); + gr_dist = cart.private_get_gr_dist(); + dist = cart.private_get_dist(); + commCostSet = cart.private_get_commCostSet(); + cd = cart.private_get_cd(); + domain = cart.private_get_domain(); + sub_domains_global = cart.private_get_sub_domains_global(); + + for (size_t i = 0 ; i < dim ; i++) + { + spacing[i] = cart.private_get_spacing(i); + magn[i] = cart.private_get_magn(i); + }; + + ghost = cart.private_get_ghost(); + + bbox = cart.private_get_bbox(); + + for (size_t i = 0 ; i < dim ; i++) + {bc[i] = cart.private_get_bc(i);} + + return *this; + } + /*! \brief Copy the element, move semantic * * \param cart element to copy @@ -2099,6 +2146,16 @@ public: return sub_domains; } + /*! \brief Return the internal data structure sub_domains + * + * \return sub_domains + * + */ + const openfpm::vector<SpaceBox<dim, T>> & private_get_sub_domains() const + { + return sub_domains; + } + /*! \brief Return the internal data structure box_nn_processor * * \return box_nn_processor @@ -2109,6 +2166,16 @@ public: return box_nn_processor; } + /*! \brief Return the internal data structure box_nn_processor + * + * \return box_nn_processor + * + */ + const openfpm::vector<openfpm::vector<long unsigned int> > & private_get_box_nn_processor() const + { + return box_nn_processor; + } + /*! \brief Return the internal data structure fine_s * * \return fine_s @@ -2119,6 +2186,16 @@ public: return fine_s; } + /*! \brief Return the internal data structure fine_s + * + * \return fine_s + * + */ + const CellList<dim,T,Mem_fast<Memory,int>,shift<dim,T>> & private_get_fine_s() const + { + return fine_s; + } + /*! \brief Return the internal data structure gr * * \return gr @@ -2129,6 +2206,16 @@ public: return gr; } + /*! \brief Return the internal data structure gr + * + * \return gr + * + */ + const grid_sm<dim, void> & private_get_gr() const + { + return gr; + } + /*! \brief Return the internal data structure gr_dist * * \return gr_dist @@ -2139,6 +2226,16 @@ public: return gr_dist; } + /*! \brief Return the internal data structure gr_dist + * + * \return gr_dist + * + */ + const grid_sm<dim, void> & private_get_gr_dist() const + { + return gr_dist; + } + /*! \brief Return the internal data structure dist * * \return dist @@ -2149,6 +2246,16 @@ public: return dist; } + /*! \brief Return the internal data structure dist + * + * \return dist + * + */ + const Distribution & private_get_dist() const + { + return dist; + } + /*! \brief Return the internal data structure commCostSet * * \return commCostSet @@ -2159,6 +2266,16 @@ public: return commCostSet; } + /*! 
+	/*! \brief Return the internal data structure commCostSet
+	 *
+	 * \return commCostSet
+	 *
+	 */
+	const bool & private_get_commCostSet() const
+	{
+		return commCostSet;
+	}
+
 	/*! \brief Return the internal data structure cd
 	 *
 	 * \return cd
@@ -2169,6 +2286,16 @@ public:
 		return cd;
 	}
 
+	/*! \brief Return the internal data structure cd
+	 *
+	 * \return cd
+	 *
+	 */
+	const CellDecomposer_sm<dim, T, shift<dim,T>> & private_get_cd() const
+	{
+		return cd;
+	}
+
 	/*! \brief Return the internal data structure domain
 	 *
 	 * \return domain
@@ -2179,6 +2306,16 @@ public:
 		return domain;
 	}
 
+	/*! \brief Return the internal data structure domain
+	 *
+	 * \return domain
+	 *
+	 */
+	const ::Box<dim,T> & private_get_domain() const
+	{
+		return domain;
+	}
+
 	/*! \brief Return the internal data structure sub_domains_global
 	 *
 	 * \return sub_domains_global
@@ -2189,6 +2326,16 @@ public:
 		return sub_domains_global;
 	}
 
+	/*! \brief Return the internal data structure sub_domains_global
+	 *
+	 * \return sub_domains_global
+	 *
+	 */
+	const openfpm::vector<Box_map<dim, T>,Memory,layout_base> & private_get_sub_domains_global() const
+	{
+		return sub_domains_global;
+	}
+
 	/*! \brief Return the internal data structure spacing
 	 *
 	 * \return spacing
@@ -2199,6 +2346,37 @@ public:
 		return spacing[i];
 	}
 
+	/*! \brief Return the internal data structure spacing
+	 *
+	 * \return spacing
+	 *
+	 */
+	const T & private_get_spacing(int i) const
+	{
+		return spacing[i];
+	}
+
+	/*! \brief Return the internal data structure magn
+	 *
+	 * \return magn
+	 *
+	 */
+	T & private_get_magn(int i)
+	{
+		return magn[i];
+	}
+
+	/*! \brief Return the internal data structure magn
+	 *
+	 * \return magn
+	 *
+	 */
+	const T & private_get_magn(int i) const
+	{
+		return magn[i];
+	}
+
+
 	/*! \brief Return the internal data structure ghost
 	 *
 	 * \return ghost
@@ -2209,6 +2387,16 @@ public:
 		return ghost;
 	}
 
+	/*! \brief Return the internal data structure ghost
+	 *
+	 * \return ghost
+	 *
+	 */
+	const Ghost<dim,T> & private_get_ghost() const
+	{
+		return ghost;
+	}
+
 	/*! \brief Return the internal data structure bbox
 	 *
 	 * \return bbox
@@ -2219,6 +2407,16 @@ public:
 		return bbox;
 	}
 
+	/*! \brief Return the internal data structure bbox
+	 *
+	 * \return bbox
+	 *
+	 */
+	const ::Box<dim,T> & private_get_bbox() const
+	{
+		return bbox;
+	}
+
 	/*! \brief Return the internal data structure bc
 	 *
 	 * \return bc
@@ -2228,6 +2426,16 @@ public:
 	{
 		return bc[i];
 	}
+
+	/*! \brief Return the internal data structure bc
+	 *
+	 * \return bc
+	 *
+	 */
+	const size_t & private_get_bc(int i) const
+	{
+		return bc[i];
+	}
 };
diff --git a/src/Decomposition/ie_ghost.hpp b/src/Decomposition/ie_ghost.hpp
index 2eadea6bad6427ae702d8ffadb48510c5ccf9a6d..2e8a9633ff7efcd5ded73c6e6c2cfb6420a0fbea 100644
--- a/src/Decomposition/ie_ghost.hpp
+++ b/src/Decomposition/ie_ghost.hpp
@@ -581,6 +581,27 @@ public:
 		return *this;
 	}
 
+	//! Copy operator
+	template<typename Memory2, template<typename> class layout_base2>
+	inline ie_ghost<dim,T,Memory,layout_base> & operator=(const ie_ghost<dim,T,Memory2,layout_base2> & ie)
+	{
+		box_nn_processor_int = ie.private_get_box_nn_processor_int();
+		proc_int_box = ie.private_get_proc_int_box();
+		vb_ext = ie.private_get_vb_ext();
+		vb_int = ie.private_get_vb_int();
+		vb_int_box = ie.private_get_vb_int_box();
+		geo_cell = ie.private_geo_cell();
+		shifts = ie.private_get_shifts();
+		ids_p = ie.private_get_ids_p();
+		ids = ie.private_get_ids();
+		domain = ie.private_get_domain();
+
+		for (int i = 0 ; i < dim ; i++)
+		{bc[i] = ie.private_get_bc(i);}
+
+		return *this;
+	}
+
 	/*! \brief duplicate this structure changing layout and Memory
@@ -1267,6 +1288,16 @@ public:
 		ids.clear();
 	}
 
+	/*! \brief Return the internal data structure box_nn_processor_int
+	 *
+	 * \return box_nn_processor_int
+	 *
+	 */
+	inline const openfpm::vector< openfpm::vector< Box_proc<dim,T> > > & private_get_box_nn_processor_int() const
+	{
+		return box_nn_processor_int;
+	}
+
 	/*! \brief Return the internal data structure box_nn_processor_int
 	 *
 	 * \return box_nn_processor_int
@@ -1287,6 +1318,16 @@ public:
 		return proc_int_box;
 	}
 
+	/*! \brief Return the internal data structure proc_int_box
+	 *
+	 * \return proc_int_box
+	 *
+	 */
+	inline const openfpm::vector< Box_dom<dim,T> > & private_get_proc_int_box() const
+	{
+		return proc_int_box;
+	}
+
 	/*! \brief Return the internal data structure vb_ext
 	 *
 	 * \return vb_ext
@@ -1297,6 +1338,16 @@ public:
 		return vb_ext;
 	}
 
+	/*! \brief Return the internal data structure vb_ext
+	 *
+	 * \return vb_ext
+	 *
+	 */
+	inline const openfpm::vector<p_box<dim,T> > & private_get_vb_ext() const
+	{
+		return vb_ext;
+	}
+
 	/*! \brief Return the internal data structure vb_int
 	 *
 	 * \return vb_int
@@ -1308,6 +1359,17 @@ public:
 		return vb_int;
 	}
 
+	/*! \brief Return the internal data structure vb_int
+	 *
+	 * \return vb_int
+	 *
+	 */
+	inline const openfpm::vector<aggregate<unsigned int,unsigned int,unsigned int>,Memory,layout_base> &
+	private_get_vb_int() const
+	{
+		return vb_int;
+	}
+
 	/*! \brief Return the internal data structure vb_int_box
 	 *
 	 * \return vb_int_box
@@ -1319,6 +1381,17 @@ public:
 		return vb_int_box;
 	}
 
+	/*! \brief Return the internal data structure vb_int_box
+	 *
+	 * \return vb_int_box
+	 *
+	 */
+	inline const openfpm::vector<Box<dim,T>,Memory,layout_base> &
+	private_get_vb_int_box() const
+	{
+		return vb_int_box;
+	}
+
 	/*! \brief Return the internal data structure proc_int_box
 	 *
 	 * \return proc_int_box
@@ -1330,6 +1403,17 @@ public:
 		return geo_cell;
 	}
 
+	/*! \brief Return the internal data structure geo_cell
+	 *
+	 * \return geo_cell
+	 *
+	 */
+	inline const CellList<dim,T,Mem_fast<Memory,int>,shift<dim,T>> &
+	private_geo_cell() const
+	{
+		return geo_cell;
+	}
+
 	/*! \brief Return the internal data structure shifts
 	 *
 	 * \return shifts
@@ -1341,6 +1425,17 @@ public:
 		return shifts;
 	}
 
+	/*! \brief Return the internal data structure shifts
+	 *
+	 * \return shifts
+	 *
+	 */
+	inline const openfpm::vector<Point<dim,T>,Memory,layout_base> &
+	private_get_shifts() const
+	{
+		return shifts;
+	}
+
 	/*! \brief Return the internal data structure ids_p
 	 *
 	 * \return ids_p
@@ -1352,6 +1447,17 @@ public:
 		return ids_p;
 	}
 
+	/*! \brief Return the internal data structure ids_p
+	 *
+	 * \return ids_p
+	 *
+	 */
+	inline const openfpm::vector<std::pair<size_t,size_t>> &
+	private_get_ids_p() const
+	{
+		return ids_p;
+	}
+
 	/*! \brief Return the internal data structure ids_p
 	 *
 	 * \return ids_p
@@ -1363,6 +1469,17 @@ public:
 		return ids;
 	}
 
+	/*! \brief Return the internal data structure ids
+	 *
+	 * \return ids
+	 *
+	 */
+	inline const openfpm::vector<size_t> &
+	private_get_ids() const
+	{
+		return ids;
+	}
+
 	/*! \brief Return the internal data structure domain
 	 *
 	 * \return domain
@@ -1374,6 +1491,22 @@ public:
 		return domain;
 	}
 
+	/*! \brief Return the internal data structure domain
+	 *
+	 * \return domain
+	 *
+	 */
+	inline const Box<dim,T> &
+	private_get_domain() const
+	{
+		return domain;
+	}
+
+	size_t private_get_bc(int i) const
+	{
+		return bc[i];
+	}
+
 	/*! \brief Return the internal data structure domain
 	 *
 	 * \return domain
diff --git a/src/Grid/grid_dist_id.hpp b/src/Grid/grid_dist_id.hpp
index 1a0e885f9f6d373d33954d10ab7f1b3ddfba2783..cd006f9c87d42201f138cac99ee95086fd1c7b1f 100644
--- a/src/Grid/grid_dist_id.hpp
+++ b/src/Grid/grid_dist_id.hpp
@@ -1314,10 +1314,11 @@ public:
 	 * \param ghost Ghost part
 	 *
 	 */
-	grid_dist_id(const Decomposition & dec,
+	template<typename Decomposition2>
+	grid_dist_id(const Decomposition2 & dec,
 	             const size_t (& g_sz)[dim],
 	             const Ghost<dim,St> & ghost)
-	:domain(dec.getDomain()),ghost(ghost),ghost_int(INVALID_GHOST),dec(dec),v_cl(create_vcluster()),
+	:domain(dec.getDomain()),ghost(ghost),ghost_int(INVALID_GHOST),dec(create_vcluster()),v_cl(create_vcluster()),
 	 ginfo(g_sz),ginfo_v(g_sz)
 	{
 #ifdef SE_CLASS2
@@ -1368,10 +1369,33 @@ public:
 	:domain(dec.getDomain()),ghost_int(g),dec(create_vcluster()),v_cl(create_vcluster()),
 	 ginfo(g_sz),ginfo_v(g_sz)
 	{
-#ifdef SE_CLASS2
-		check_new(this,8,GRID_DIST_EVENT,4);
-#endif
+		InitializeCellDecomposer(g_sz,dec.periodicity());
+
+		ghost = convert_ghost(g,cd_sm);
+		this->dec = dec.duplicate(ghost);
+
+		// an empty
+		openfpm::vector<Box<dim,long int>> empty;
+
+		// Initialize structures
+		InitializeStructures(g_sz,empty,g,false);
+	}
+
+	/*! It constructs a grid of a specified size, defined on a specified Box space, forcing to follow a specified decomposition, and having a specified ghost size
+	 *
+	 * \param dec Decomposition
+	 * \param g_sz grid size on each dimension
+	 * \param g Ghost part (given in grid units)
+	 *
+	 * \warning In very rare case the ghost part can be one point bigger than the one specified
+	 *
+	 */
+	template<typename Decomposition2>
+	grid_dist_id(const Decomposition2 & dec, const size_t (& g_sz)[dim],
+			const Ghost<dim,long int> & g)
+	:domain(dec.getDomain()),ghost_int(g),dec(create_vcluster()),v_cl(create_vcluster()),
+	 ginfo(g_sz),ginfo_v(g_sz)
+	{
 		InitializeCellDecomposer(g_sz,dec.periodicity());
 
 		ghost = convert_ghost(g,cd_sm);
diff --git a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
index e2485f2451594448e57e1cba218fce7812888d71..3e516415a91f7d86224d1cd0dc125036b3f1aff0 100644
--- a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
+++ b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
@@ -190,24 +190,24 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_output )
 
 BOOST_AUTO_TEST_CASE( sgrid_gpu_test_save_and_load )
 {
-/*	auto & v_cl = create_vcluster();
+	auto & v_cl = create_vcluster();
 
 	if (v_cl.size() > 3){return;}
 
-	size_t sz[2] = {17,17};
+	size_t sz[2] = {370,370};
 	periodicity<2> bc = {PERIODIC,PERIODIC};
 
 	Ghost<2,long int> g(1);
 
 	Box<2,float> domain({0.0,0.0},{1.0,1.0});
 
-	sgrid_dist_id_gpu<2,float,aggregate<float,float,float[2]>> gdist(sz,domain,g,bc);
+	sgrid_dist_id_gpu<2,float,aggregate<float,float>> gdist(sz,domain,g,bc);
 
 	gdist.template setBackgroundValue<0>(666);
 
 	/////// GPU insert + flush
 
-	Box<2,size_t> box({1,1},{15,15});
+	Box<2,size_t> box({1,1},{350,350});
 
 	auto it = gdist.getGridIterator(box.getKP1(),box.getKP2());
 
 	/////// GPU Run kernel
@@ -224,13 +224,10 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_save_and_load )
 	{
 		data.template get<0>() = c + i + j;
 		data.template get<1>() = c + 1000 + i + j;
-
-		data.template get<2>()[0] = i;
-		data.template get<2>()[1] = j;
 	}
 	);
 
-	gdist.template flush<smax_<0>,smax_<1>,smax_<2>>(flush_type::FLUSH_ON_DEVICE);
+	gdist.template flush<smax_<0>,smax_<1>>(flush_type::FLUSH_ON_DEVICE);
 
 	gdist.template deviceToHost<0>();
 
@@ -238,9 +235,85 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_save_and_load )
 
 	// Now load
 
-	sgrid_dist_id_gpu<2,float,aggregate<float,float,float[2]>> gdist2(sz,domain,g,bc);
+	sgrid_dist_id_gpu<2,float,aggregate<float,float>> gdist2(sz,domain,g,bc);
+
+	gdist2.load("sgrid_gpu_output_hdf5");
+
+	gdist2.template ghost_get<0,1>(RUN_ON_DEVICE);
+
+	gdist2.deviceToHost<0,1>();
+	gdist.deviceToHost<0,1>();
+
+	bool match = true;
+
+
+	auto it2 = gdist2.getDomainIterator();
+
+	while (it2.isNext())
+	{
+		auto p = it2.get();
+
+		auto key = it2.getGKey(p);
+
+		auto p_xp1 = p.move(0,1);
+		auto p_xm1 = p.move(0,-1);
+		auto p_yp1 = p.move(1,1);
+		auto p_ym1 = p.move(1,-1);
+
+		auto key_xp1 = key.move(0,1);
+		auto key_xm1 = key.move(0,-1);
+		auto key_yp1 = key.move(1,1);
+		auto key_ym1 = key.move(1,-1);
+
+		if (box.isInside(key_xp1.toPoint()))
+		{
+			match &= gdist.template get<0>(p_xp1) == c + key_xp1.get(0) + key_xp1.get(1);
+
+			if (match == false)
+			{
+				std::cout << gdist.template get<0>(p_xp1) << " " << c + key_xp1.get(0) + key_xp1.get(1) << std::endl;
+				break;
+			}
+		}
+
+		if (box.isInside(key_xm1.toPoint()))
+		{
+			match &= gdist.template get<0>(p_xm1) == c + key_xm1.get(0) + key_xm1.get(1);
+
+			if (match == false)
+			{
+				std::cout << gdist.template get<0>(p_xm1) << " " << c + key_xm1.get(0) + key_xm1.get(1) << std::endl;
+				break;
+			}
+		}
+
+		if (box.isInside(key_yp1.toPoint()))
+		{
+			match &= gdist.template get<0>(p_yp1) == c + key_yp1.get(0) + key_yp1.get(1);
-	gdist2.load("sgrid_gpu_output_hdf5");*/
+			if (match == false)
+			{
+				std::cout << gdist.template get<0>(p_yp1) << " " << c + key_yp1.get(0) + key_yp1.get(1) << std::endl;
+				break;
+			}
+		}
+
+		if (box.isInside(key_ym1.toPoint()))
+		{
+			match &= gdist.template get<0>(p_ym1) == c + key_ym1.get(0) + key_ym1.get(1);
+
+			if (match == false)
+			{
+				std::cout << gdist.template get<0>(p_ym1) << " " << c + key_ym1.get(0) + key_ym1.get(1) << std::endl;
+				break;
+			}
+		}
+
+		++it2;
+	}
+
+
+	BOOST_REQUIRE_EQUAL(match,true);
 }
 
 void sgrid_ghost_get(size_t (& sz)[2],size_t (& sz2)[2])
@@ -1207,4 +1280,63 @@ BOOST_AUTO_TEST_CASE( sgrid_gpu_test_conv_background )
 	BOOST_REQUIRE_EQUAL(match,true);
 }
 
+BOOST_AUTO_TEST_CASE( grid_dense_to_sparse_conversion )
+{
+	Box<3,float> domain({0.0,0.0,0.0},{1.0,1.0,1.0});
+
+	// grid size
+	size_t sz[3];
+	sz[0] = 32;
+	sz[1] = 32;
+	sz[2] = 32;
+
+	// Ghost
+	Ghost<3,long int> g(1);
+
+	periodicity<3> pr = {PERIODIC,PERIODIC,PERIODIC};
+
+	// Distributed grid with id decomposition
+	grid_dist_id<3, float, aggregate<float,float>> g_dist(sz,domain,g,pr);
+
+	auto it = g_dist.getDomainIterator();
+
+	while (it.isNext())
+	{
+		auto p = it.get();
+		auto gkey = it.getGKey(p);
+
+		g_dist.template getProp<0>(p) = gkey.get(0) + gkey.get(1) + gkey.get(2);
+		g_dist.template getProp<1>(p) = 3.0*gkey.get(0) + gkey.get(1) + gkey.get(2);
+
+		++it;
+	}
+
+	sgrid_dist_id_gpu<3,float,aggregate<float,float>> sgdist(g_dist.getDecomposition(),sz,g);
+
+	// the first iterator is exhausted at this point, take a fresh one
+	auto it2 = g_dist.getDomainIterator();
+
+	while (it2.isNext())
+	{
+		auto p = it2.get();
+		auto gkey = it2.getGKey(p);
+
+		sgdist.template insertFlush<0>(p) = g_dist.template get<0>(p);
+
+		++it2;
+	}
+
+
+	bool check = true;
+
+	auto it3 = g_dist.getDomainIterator();
+
+	while (it3.isNext())
+	{
+		auto p = it3.get();
+		auto gkey = it3.getGKey(p);
+
+		check &= sgdist.template getProp<0>(p) == g_dist.template get<0>(p);
+
+		++it3;
+	}
+
+	BOOST_REQUIRE_EQUAL(check,true);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/Vector/cuda/vector_dist_cuda_funcs.cuh b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
index 78c4c9c5c72a508c9d2c6716643dc714c3e97e5a..eab71639cc73957b6eba5adcc66d3ed8055b6a0d 100644
--- a/src/Vector/cuda/vector_dist_cuda_funcs.cuh
+++ b/src/Vector/cuda/vector_dist_cuda_funcs.cuh
@@ -419,6 +419,17 @@ void remove_marked(vector_type & vd, const int n = 1024)
 
 	// we reuse memory. this give us the possibility to avoid allocation and make the remove faster
 
+	// Reference counter must be set to zero
+
+/*	for (int i = 0 ; i < MAX_NUMER_OF_PROPERTIES ; i++)
+	{
+		for (int j = 0 ; j < exp_tmp2[i].ref() ; j++)
+		{exp_tmp2[i].decRef();}
+	}
+
+	for (int j = 0 ; j < exp_tmp.ref() ; j++)
+	{exp_tmp.decRef();}*/
+
 	vd_pos_new.setMemory(exp_tmp);
 	vd_prp_new.setMemoryArray((CudaMemory *)&exp_tmp2);
 
@@ -436,6 +447,10 @@ void remove_marked(vector_type & vd, const int n = 1024)
 
 	vd.getPosVector().swap_nomode(vd_pos_new);
 	vd.getPropVector().swap_nomode(vd_prp_new);
+
+	// Reset the reference counters of the swapped-in vd_pos_new and vd_prp_new memory to one
+
+	vd.setReferenceCounterToOne();
 }
 
 template<unsigned int prp, typename functor, typename particles_type, typename out_type>
diff --git a/src/Vector/vector_dist.hpp b/src/Vector/vector_dist.hpp
index 41b58d3797553927fca2ca10e71fe8b4f3d47497..cdf5e3aca5e11afe4af787b3cbe8d8325ce5af47 100644
--- a/src/Vector/vector_dist.hpp
+++ b/src/Vector/vector_dist.hpp
@@ -221,6 +221,37 @@ struct cell_list_selector<vector,comp_host>
 	}
 };
 
+template<typename vector_type>
+struct decrement_memory
+{
+	//! vector
+	vector_type & v;
+
+
+	/*! \brief constructor
+	 *
+	 *
+	 * \param v vector whose property memories are decremented
+	 *
+	 */
+	inline decrement_memory(vector_type & v)
+	:v(v)
+	{};
+
+
+	/*! \brief It decrements the reference counter of the memory of each property
+	 *
+	 * \param t each member
+	 *
+	 */
+	template<typename T>
+	inline void operator()(T& t) const
+	{
+		v.template getMemory<T::value>().decRef();
+	}
+};
+
 /*! \brief Distributed vector
 *
 * This class represent a distributed vector, the distribution of the structure
@@ -624,6 +655,24 @@ public:
 #endif
 	}
 
+	/*! Set reference counter to one
+	 *
+	 */
+	void setReferenceCounterToOne()
+	{
+		for (int i = 0 ; i < v_pos.template getMemory<0>().ref() - 1; i++)
+		{
+			v_pos.template getMemory<0>().decRef();
+		}
+
+		for (int i = 0 ; i < v_prp.template getMemory<0>().ref() - 1; i++)
+		{
+			decrement_memory<decltype(v_prp)> m(v_prp);
+
+			boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(v_prp)::value_type::max_prop>>(m);
+		}
+	}
+
 	/*! \brief remove all the elements
 	 *
 	 *
diff --git a/src/Vector/vector_dist_kernel.hpp b/src/Vector/vector_dist_kernel.hpp
index cc42a53e3632d9181c5d14df1b9288bc8aee64dd..b06a0a98c23f50c681a2f77788b01d56aa8535f4 100644
--- a/src/Vector/vector_dist_kernel.hpp
+++ b/src/Vector/vector_dist_kernel.hpp
@@ -12,7 +12,7 @@ constexpr unsigned int POS_PROP = (unsigned int)-1;
 
 #ifdef CUDA_GPU
 
-#define GET_PARTICLE(vd) blockDim.x*blockIdx.x + threadIdx.x; if (blockDim.x*blockIdx.x + threadIdx.x >= static_cast<int>(vd.size_local())) {return;};
+#define GET_PARTICLE(vd) blockDim.x*blockIdx.x + threadIdx.x; if (blockDim.x*blockIdx.x + threadIdx.x >= static_cast<unsigned int>(vd.size_local())) {return;};
 
 #define GET_PARTICLE_SORT(p,NN) if (blockDim.x*blockIdx.x + threadIdx.x >= NN.get_g_m()) {return;}\
 						else{p = NN.getDomainSortIds().template get<0>(blockDim.x*blockIdx.x + threadIdx.x);}
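
The GET_PARTICLE change above only affects the bounds guard: the thread index blockDim.x*blockIdx.x + threadIdx.x is unsigned, so comparing it against static_cast<unsigned int>(vd.size_local()) avoids a signed/unsigned mismatch. Below is a minimal sketch (not part of the patch) of how the macro is typically used inside a particle kernel; the kernel name and the operation it performs are illustrative only.

// illustrative kernel, assuming a GPU vector_dist kernel-side type passed by value
template<typename vector_dist_ker>
__global__ void scale_position(vector_dist_ker vd)
{
	// GET_PARTICLE yields the particle index and makes threads past
	// vd.size_local() return immediately (the guard patched above)
	auto p = GET_PARTICLE(vd);

	vd.getPos(p)[0] *= 0.5f;
}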
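decrement_memory and setReferenceCounterToOne follow the "visit every property" idiom of looping a functor over a compile-time index range with boost::mpl::for_each_ref, where T::value inside operator() is the property index. The sketch below reuses that same idiom read-only, just summing reference counters instead of decrementing them; the struct name count_refs is hypothetical and the snippet assumes the same getMemory<i>()/ref() interface the patch itself calls.

template<typename vector_type>
struct count_refs
{
	vector_type & v;
	int total = 0;

	count_refs(vector_type & v) :v(v) {}

	template<typename T>
	void operator()(T& t)
	{
		// T::value is the property index supplied by boost::mpl::range_c
		total += v.template getMemory<T::value>().ref();
	}
};

// usage, mirroring the call in setReferenceCounterToOne():
// count_refs<decltype(v_prp)> cr(v_prp);
// boost::mpl::for_each_ref<boost::mpl::range_c<int,0,decltype(v_prp)::value_type::max_prop>>(cr);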