...
 
Commits (4)
......@@ -3,7 +3,17 @@ cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
########################### Executables
# Source list used for the performance-test build; only defined when TEST_PERFORMANCE is enabled.
if (TEST_PERFORMANCE)
# NOTE(review): CUDA_SOURCES is assigned twice in a row, so the second set() below replaces this
# list entirely. This looks like the removed and the added side of a diff shown together --
# confirm which of the two assignments is meant to stay.
set(CUDA_SOURCES SparseGridGpu/performance/Stencil_performance_tests.cu SparseGridGpu/performance/performancePlots.hpp SparseGridGpu/tests/utils/SparseGridGpu_testKernels.cuh)
# One translation unit per performance test, plus the plotting code.
set(CUDA_SOURCES SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_sparse.cu
SparseGridGpu/performance/SparseGridGpu_performance_insert_stencil.cu
SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil.cu
SparseGridGpu/performance/SparseGridGpu_performance_stencil_heat_host.cu
SparseGridGpu/performance/SparseGridGpu_performance_get_nn.cu
SparseGridGpu/performance/SparseGridGpu_performance_get_single.cu
SparseGridGpu/performance/SparseGridGpu_performance_insert_single.cu
SparseGridGpu/performance/SparseGridGpu_performance_tests.cu
SparseGridGpu/performance/SparseGridGpu_performance_insert_block.cu
SparseGridGpu/performance/SparseGridGpu_performance_heat_stencil_3d.cu
SparseGridGpu/performance/performancePlots.cpp)
endif ()
if (CUDA_FOUND)
......
/*
* SparseGridGpu_performance_get_nn.cu
*
* Created on: Sep 10, 2019
* Author: i-bird
*/
#define SCAN_WITH_CUB
#define BOOST_TEST_DYN_LINK
#define OPENFPM_DATA_ENABLE_IO_MODULE
#define DISABLE_MPI_WRITTERS
#include <boost/test/unit_test.hpp>
#include "performancePlots.hpp"
#include <iostream>
#include "SparseGridGpu/SparseGridGpu.hpp"
#include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"
extern std::string suiteURI;
extern report_sparse_grid_tests report_sparsegrid_funcs;
extern std::set<std::string> testSet;
/*! \brief Benchmark the 2D neighbourhood read kernel on a SparseGridGpu and record its throughput
 *
 * The grid is filled once with insertValues2DBlocked, then getValuesNeighbourhood2D is
 * launched `iterations` times. Every launch is bracketed by cudaDeviceSynchronize() so the
 * wall-clock timer spans the kernel execution. Mean and standard deviation of the throughput
 * (GElem/s) and the block occupancy are stored in report_sparsegrid_funcs.graphs and echoed
 * on stdout.
 *
 * \tparam blockEdgeSize edge of one block in points (also used as CUDA block edge)
 * \tparam gridEdgeSize number of blocks along each axis (also used as CUDA grid edge)
 *
 * \param testURI prefix of the report keys written by this run
 * \param i index of the run; keys are written under testURI + "(" + i + ")"
 *
 */
template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
void testGetNeighbourhood(std::string testURI, unsigned int i)
{
	auto testName = "Get single - neighbourhood avg";
	constexpr unsigned int dim = 2;
	constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
	typedef aggregate<float> AggregateT;

	const unsigned int iterations = 10;

	// Describe the run in the report
	const std::string base(testURI + "(" + std::to_string(i) + ")");
	report_sparsegrid_funcs.graphs.put(base + ".test.name","Get");
	report_sparsegrid_funcs.graphs.put(base + ".dim",dim);
	report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
	report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*blockEdgeSize);
	report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*blockEdgeSize);

	dim3 gridSize(gridEdgeSize, gridEdgeSize);
	dim3 blockSize(blockEdgeSize, blockEdgeSize);
	dim3 blockSizeBlockedInsert(1, 1);
	grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
	SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize> sparseGrid(blockGeometry);
	mgpu::ofp_context_t ctx;
	sparseGrid.template setBackgroundValue<0>(0);

	// Same offset for the fill and for every timed launch
	const int offset = 0;

	// Fill the grid once
	sparseGrid.setGPUInsertBuffer(gridSize, blockSizeBlockedInsert);
	insertValues2DBlocked<0, 1, blockEdgeSize> <<<gridSize, blockSize>>> (sparseGrid.toKernel(), offset, offset);
	sparseGrid.template flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);

	// Widen before multiplying: a product of four unsigned ints would be evaluated
	// (and could wrap around) in 32 bits before being stored
	const unsigned long long edgePoints = static_cast<unsigned long long>(gridEdgeSize) * blockEdgeSize;
	const unsigned long long numElements = edgePoints * edgePoints;

	// Throughput of each iteration, in GElem/s
	openfpm::vector<double> measures;

	for (unsigned int iter = 0; iter < iterations; ++iter)
	{
		// Drain pending GPU work so that it is not charged to this iteration
		cudaDeviceSynchronize();

		timer ts;
		ts.start();

		getValuesNeighbourhood2D<0> <<<gridSize, blockSize>>> (sparseGrid.toKernel(), offset, offset);
		cudaDeviceSynchronize();

		ts.stop();

		// NOTE(review): the factor 9 presumably counts a 3x3 neighbourhood read per point -- confirm against the kernel.
		// Kept in double: the value goes into a vector<double>, narrowing to float only loses precision
		const double gElemS = 9 * numElements / (1e9 * ts.getwct());
		measures.add(gElemS);
	}

	double mean = 0;
	double deviation = 0;
	standard_deviation(measures,mean,deviation);

	// Mean and deviation are throughputs in GElem/s
	report_sparsegrid_funcs.graphs.put(base + ".Gget.mean",mean);
	report_sparsegrid_funcs.graphs.put(base +".Gget.dev",deviation);

	std::cout << "Test: " << testName << "\n";
	std::cout << "Block: " << blockEdgeSize << "x" << blockEdgeSize << "\n";
	std::cout << "Grid: " << gridEdgeSize*blockEdgeSize << "x" << gridEdgeSize*blockEdgeSize << "\n";

	// The occupancy is measured after copying the grid back with deviceToHost()
	double dataOccupancyMean, dataOccupancyDev;
	sparseGrid.deviceToHost();
	sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);
	std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
	report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
	report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);

	std::cout << "Iterations: " << iterations << "\n";
	std::cout << "Throughput:\n\t" << mean << "GElem/s" << "\n";
}
BOOST_AUTO_TEST_SUITE(performance)

BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)

// Neighbourhood read, block edge 2, grid edge from 128 to 4096 blocks
BOOST_AUTO_TEST_CASE(testGetNeighbourhood_gridScaling_2)
{
	const std::string testURI = suiteURI + ".device.get.dense.neighbourhood.2D.2.gridScaling";
	unsigned int runIdx = 0;

	testGetNeighbourhood<2, 128>(testURI, runIdx++);
	testGetNeighbourhood<2, 256>(testURI, runIdx++);
	testGetNeighbourhood<2, 512>(testURI, runIdx++);
	testGetNeighbourhood<2, 1024>(testURI, runIdx++);
	testGetNeighbourhood<2, 2048>(testURI, runIdx++);
	testGetNeighbourhood<2, 4096>(testURI, runIdx++);

	// Register the URI in the set of executed tests
	testSet.insert(testURI);
}

// Neighbourhood read, block edge 4, grid edge from 64 to 2048 blocks
BOOST_AUTO_TEST_CASE(testGetNeighbourhood_gridScaling_4)
{
	const std::string testURI = suiteURI + ".device.get.dense.neighbourhood.2D.4.gridScaling";
	unsigned int runIdx = 0;

	testGetNeighbourhood<4, 64>(testURI, runIdx++);
	testGetNeighbourhood<4, 128>(testURI, runIdx++);
	testGetNeighbourhood<4, 256>(testURI, runIdx++);
	testGetNeighbourhood<4, 512>(testURI, runIdx++);
	testGetNeighbourhood<4, 1024>(testURI, runIdx++);
	testGetNeighbourhood<4, 2048>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Neighbourhood read, block edge 8, grid edge from 32 to 1024 blocks
BOOST_AUTO_TEST_CASE(testGetNeighbourhood_gridScaling_8)
{
	const std::string testURI = suiteURI + ".device.get.dense.neighbourhood.2D.8.gridScaling";
	unsigned int runIdx = 0;

	testGetNeighbourhood<8, 32>(testURI, runIdx++);
	testGetNeighbourhood<8, 64>(testURI, runIdx++);
	testGetNeighbourhood<8, 128>(testURI, runIdx++);
	testGetNeighbourhood<8, 256>(testURI, runIdx++);
	testGetNeighbourhood<8, 512>(testURI, runIdx++);
	testGetNeighbourhood<8, 1024>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Neighbourhood read, block edge 16, grid edge from 16 to 512 blocks
BOOST_AUTO_TEST_CASE(testGetNeighbourhood_gridScaling_16)
{
	const std::string testURI = suiteURI + ".device.get.dense.neighbourhood.2D.16.gridScaling";
	unsigned int runIdx = 0;

	testGetNeighbourhood<16, 16>(testURI, runIdx++);
	testGetNeighbourhood<16, 32>(testURI, runIdx++);
	testGetNeighbourhood<16, 64>(testURI, runIdx++);
	testGetNeighbourhood<16, 128>(testURI, runIdx++);
	testGetNeighbourhood<16, 256>(testURI, runIdx++);
	testGetNeighbourhood<16, 512>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Neighbourhood read, block edge 32, grid edge from 8 to 256 blocks
BOOST_AUTO_TEST_CASE(testGetNeighbourhood_gridScaling_32)
{
	const std::string testURI = suiteURI + ".device.get.dense.neighbourhood.2D.32.gridScaling";
	unsigned int runIdx = 0;

	testGetNeighbourhood<32, 8>(testURI, runIdx++);
	testGetNeighbourhood<32, 16>(testURI, runIdx++);
	testGetNeighbourhood<32, 32>(testURI, runIdx++);
	testGetNeighbourhood<32, 64>(testURI, runIdx++);
	testGetNeighbourhood<32, 128>(testURI, runIdx++);
	testGetNeighbourhood<32, 256>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Block edge doubles while the grid edge halves: every run covers 2048x2048 points
BOOST_AUTO_TEST_CASE(testGetNeighbourhood_blockScaling)
{
	const std::string testURI = suiteURI + ".device.get.dense.neighbourhood.2D.blockScaling";
	unsigned int runIdx = 0;

	testGetNeighbourhood<2, 1024>(testURI, runIdx++);
	testGetNeighbourhood<4, 512>(testURI, runIdx++);
	testGetNeighbourhood<8, 256>(testURI, runIdx++);
	testGetNeighbourhood<16, 128>(testURI, runIdx++);
	testGetNeighbourhood<32, 64>(testURI, runIdx++);

	testSet.insert(testURI);
}

BOOST_AUTO_TEST_SUITE_END()

BOOST_AUTO_TEST_SUITE_END()
/*
* SparseGridGpu_performance_get_single.cu
*
* Created on: Sep 9, 2019
* Author: i-bird
*/
#define SCAN_WITH_CUB
#define BOOST_TEST_DYN_LINK
#define OPENFPM_DATA_ENABLE_IO_MODULE
#define DISABLE_MPI_WRITTERS
#include <boost/test/unit_test.hpp>
#include "performancePlots.hpp"
#include <iostream>
#include "SparseGridGpu/SparseGridGpu.hpp"
#include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"
extern std::string suiteURI;
extern report_sparse_grid_tests report_sparsegrid_funcs;
extern std::set<std::string> testSet;
/*! \brief Benchmark the 2D single-point read kernel on a SparseGridGpu and record its throughput
 *
 * The grid is filled once with insertValues2DBlocked, then getValues2D is launched
 * `iterations` times. Every launch is bracketed by cudaDeviceSynchronize() so the wall-clock
 * timer spans the kernel execution. Mean and standard deviation of the throughput (GElem/s)
 * and the block occupancy are stored in report_sparsegrid_funcs.graphs and echoed on stdout.
 *
 * \tparam blockEdgeSize edge of one block in points (also used as CUDA block edge)
 * \tparam gridEdgeSize number of blocks along each axis (also used as CUDA grid edge)
 *
 * \param testURI prefix of the report keys written by this run
 * \param i index of the run; keys are written under testURI + "(" + i + ")"
 *
 */
template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
void testGetSingle(std::string testURI, unsigned int i)
{
	auto testName = "Get single";
	constexpr unsigned int dim = 2;
	constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
	typedef aggregate<float> AggregateT;

	const unsigned int iterations = 10;

	// Describe the run in the report
	const std::string base(testURI + "(" + std::to_string(i) + ")");
	report_sparsegrid_funcs.graphs.put(base + ".test.name","Get");
	report_sparsegrid_funcs.graphs.put(base + ".dim",dim);
	report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
	report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*blockEdgeSize);
	report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*blockEdgeSize);

	dim3 gridSize(gridEdgeSize, gridEdgeSize);
	dim3 blockSize(blockEdgeSize, blockEdgeSize);
	dim3 blockSizeBlockedInsert(1, 1);
	grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
	SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize> sparseGrid(blockGeometry);
	mgpu::ofp_context_t ctx;
	sparseGrid.template setBackgroundValue<0>(0);

	// Same offset for the fill and for every timed launch
	const int offset = 0;

	// Fill the grid once
	sparseGrid.setGPUInsertBuffer(gridSize, blockSizeBlockedInsert);
	insertValues2DBlocked<0, 1, blockEdgeSize> <<<gridSize, blockSize>>> (sparseGrid.toKernel(), offset, offset);
	sparseGrid.template flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);

	// Widen before multiplying: a product of four unsigned ints would be evaluated
	// (and could wrap around) in 32 bits before being stored
	const unsigned long long edgePoints = static_cast<unsigned long long>(gridEdgeSize) * blockEdgeSize;
	const unsigned long long numElements = edgePoints * edgePoints;

	// Throughput of each iteration, in GElem/s
	openfpm::vector<double> measures;

	for (unsigned int iter = 0; iter < iterations; ++iter)
	{
		// Drain pending GPU work so that it is not charged to this iteration
		cudaDeviceSynchronize();

		timer ts;
		ts.start();

		getValues2D<0> <<<gridSize, blockSize>>> (sparseGrid.toKernel(), offset, offset);
		cudaDeviceSynchronize();

		ts.stop();

		// Kept in double: the value goes into a vector<double>, narrowing to float only loses precision
		const double gElemS = numElements / (1e9 * ts.getwct());
		measures.add(gElemS);
	}

	double mean = 0;
	double deviation = 0;
	standard_deviation(measures,mean,deviation);

	// Mean and deviation are throughputs in GElem/s
	report_sparsegrid_funcs.graphs.put(base + ".Gget.mean",mean);
	report_sparsegrid_funcs.graphs.put(base +".Gget.dev",deviation);

	std::cout << "Test: " << testName << "\n";
	std::cout << "Block: " << blockEdgeSize << "x" << blockEdgeSize << "\n";
	std::cout << "Grid: " << gridEdgeSize*blockEdgeSize << "x" << gridEdgeSize*blockEdgeSize << "\n";

	// The occupancy is measured after copying the grid back with deviceToHost()
	double dataOccupancyMean, dataOccupancyDev;
	sparseGrid.deviceToHost();
	sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);
	std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
	report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
	report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);

	std::cout << "Iterations: " << iterations << "\n";
	std::cout << "Throughput:\n\t" << mean << "GElem/s" << "\n";
}
BOOST_AUTO_TEST_SUITE(performance)

BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)

// Single-point read, block edge 2, grid edge from 128 to 4096 blocks
BOOST_AUTO_TEST_CASE(testGet_gridScaling_2)
{
	const std::string testURI = suiteURI + ".device.get.dense.single.2D.2.gridScaling";
	unsigned int runIdx = 0;

	testGetSingle<2, 128>(testURI, runIdx++);
	testGetSingle<2, 256>(testURI, runIdx++);
	testGetSingle<2, 512>(testURI, runIdx++);
	testGetSingle<2, 1024>(testURI, runIdx++);
	testGetSingle<2, 2048>(testURI, runIdx++);
	testGetSingle<2, 4096>(testURI, runIdx++);

	// Register the URI in the set of executed tests
	testSet.insert(testURI);
}

// Single-point read, block edge 4, grid edge from 64 to 2048 blocks
BOOST_AUTO_TEST_CASE(testGet_gridScaling_4)
{
	const std::string testURI = suiteURI + ".device.get.dense.single.2D.4.gridScaling";
	unsigned int runIdx = 0;

	testGetSingle<4, 64>(testURI, runIdx++);
	testGetSingle<4, 128>(testURI, runIdx++);
	testGetSingle<4, 256>(testURI, runIdx++);
	testGetSingle<4, 512>(testURI, runIdx++);
	testGetSingle<4, 1024>(testURI, runIdx++);
	testGetSingle<4, 2048>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Single-point read, block edge 8, grid edge from 32 to 1024 blocks
BOOST_AUTO_TEST_CASE(testGet_gridScaling_8)
{
	const std::string testURI = suiteURI + ".device.get.dense.single.2D.8.gridScaling";
	unsigned int runIdx = 0;

	testGetSingle<8, 32>(testURI, runIdx++);
	testGetSingle<8, 64>(testURI, runIdx++);
	testGetSingle<8, 128>(testURI, runIdx++);
	testGetSingle<8, 256>(testURI, runIdx++);
	testGetSingle<8, 512>(testURI, runIdx++);
	testGetSingle<8, 1024>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Single-point read, block edge 16, grid edge from 16 to 512 blocks
BOOST_AUTO_TEST_CASE(testGet_gridScaling_16)
{
	const std::string testURI = suiteURI + ".device.get.dense.single.2D.16.gridScaling";
	unsigned int runIdx = 0;

	testGetSingle<16, 16>(testURI, runIdx++);
	testGetSingle<16, 32>(testURI, runIdx++);
	testGetSingle<16, 64>(testURI, runIdx++);
	testGetSingle<16, 128>(testURI, runIdx++);
	testGetSingle<16, 256>(testURI, runIdx++);
	testGetSingle<16, 512>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Single-point read, block edge 32, grid edge from 8 to 256 blocks
BOOST_AUTO_TEST_CASE(testGet_gridScaling_32)
{
	const std::string testURI = suiteURI + ".device.get.dense.single.2D.32.gridScaling";
	unsigned int runIdx = 0;

	testGetSingle<32, 8>(testURI, runIdx++);
	testGetSingle<32, 16>(testURI, runIdx++);
	testGetSingle<32, 32>(testURI, runIdx++);
	testGetSingle<32, 64>(testURI, runIdx++);
	testGetSingle<32, 128>(testURI, runIdx++);
	testGetSingle<32, 256>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Block edge doubles while the grid edge halves: every run covers 2048x2048 points
BOOST_AUTO_TEST_CASE(testGet_blockScaling)
{
	const std::string testURI = suiteURI + ".device.get.dense.single.2D.blockScaling";
	unsigned int runIdx = 0;

	testGetSingle<2, 1024>(testURI, runIdx++);
	testGetSingle<4, 512>(testURI, runIdx++);
	testGetSingle<8, 256>(testURI, runIdx++);
	testGetSingle<16, 128>(testURI, runIdx++);
	testGetSingle<32, 64>(testURI, runIdx++);

	testSet.insert(testURI);
}

BOOST_AUTO_TEST_SUITE_END()

BOOST_AUTO_TEST_SUITE_END()
/*
* SparseGridGpu_performance_insert_block.cu
*
* Created on: Sep 10, 2019
* Author: i-bird
*/
#define SCAN_WITH_CUB
#define BOOST_TEST_DYN_LINK
#define OPENFPM_DATA_ENABLE_IO_MODULE
#define DISABLE_MPI_WRITTERS
#include <boost/test/unit_test.hpp>
#include "performancePlots.hpp"
#include <iostream>
#include "SparseGridGpu/SparseGridGpu.hpp"
#include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"
extern std::string suiteURI;
extern report_sparse_grid_tests report_sparsegrid_funcs;
extern std::set<std::string> testSet;
/*! \brief Benchmark the blocked insert (setGPUInsertBuffer + insertValues2DBlocked + flush) on a SparseGridGpu
 *
 * After 5 untimed warm-up rounds, the insert sequence is executed `iterations` times.
 * Each timed round starts after a cudaDeviceSynchronize() and ends after another one, so
 * the wall-clock timer spans buffer setup, kernel and flush. Mean and standard deviation of
 * the throughput (MElem/s) and the block occupancy are stored in
 * report_sparsegrid_funcs.graphs and echoed on stdout.
 *
 * \tparam blockEdgeSize edge of one block in points (also used as CUDA block edge)
 * \tparam gridEdgeSize number of blocks along each axis (also used as CUDA grid edge)
 *
 * \param testURI prefix of the report keys written by this run
 * \param i index of the run; keys are written under testURI + "(" + i + ")"
 *
 */
template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
void test_insert_block(std::string testURI, unsigned int i)
{
	auto testName = "Insert (one chunk per block)";
	constexpr unsigned int dim = 2;
	constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
	typedef aggregate<float> AggregateT;

	// Describe the run in the report
	const std::string base(testURI + "(" + std::to_string(i) + ")");
	report_sparsegrid_funcs.graphs.put(base + ".test.name","InsertBlock");
	report_sparsegrid_funcs.graphs.put(base + ".name","Block insert");
	report_sparsegrid_funcs.graphs.put(base + ".dim",dim);
	report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
	report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*blockEdgeSize);
	report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*blockEdgeSize);

	const unsigned int iterations = 10;
	const unsigned int warmupRounds = 5;

	// Throughput of each iteration, in MElem/s
	openfpm::vector<double> measures;

	// Widen before multiplying: a product of four unsigned ints would be evaluated
	// (and could wrap around) in 32 bits before being stored
	const unsigned long long edgePoints = static_cast<unsigned long long>(gridEdgeSize) * blockEdgeSize;
	const unsigned long long numElements = edgePoints * edgePoints;

	dim3 gridSize(gridEdgeSize, gridEdgeSize);
	dim3 blockSize(blockEdgeSize, blockEdgeSize);
	dim3 blockSizeBlockedInsert(1, 1);
	grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
	SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize> sparseGrid(blockGeometry);
	mgpu::ofp_context_t ctx;
	sparseGrid.template setBackgroundValue<0>(0);

	// Same offset for the warm-up and for every timed round
	const int offset = 0;

	// Warmup
	for (unsigned int iter = 0; iter < warmupRounds; ++iter)
	{
		sparseGrid.setGPUInsertBuffer(gridSize, blockSizeBlockedInsert);
		insertValues2DBlocked<0, 1, blockEdgeSize> <<<gridSize, blockSize>>> (sparseGrid.toKernel(), offset, offset);
		sparseGrid.template flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);
	}

	cudaDeviceSynchronize();

	for (unsigned int iter = 0; iter < iterations; ++iter)
	{
		// Drain pending GPU work so that it is not charged to this iteration
		cudaDeviceSynchronize();

		timer ts;
		ts.start();

		sparseGrid.setGPUInsertBuffer(gridSize, blockSizeBlockedInsert);
		insertValues2DBlocked<0, 1, blockEdgeSize> <<<gridSize, blockSize>>> (sparseGrid.toKernel(), offset, offset);
		sparseGrid.template flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);
		cudaDeviceSynchronize();

		ts.stop();

		// Kept in double: the value goes into a vector<double>, narrowing to float only loses precision
		const double mElemS = numElements / (1e6 * ts.getwct());
		measures.add(mElemS);
	}

	double mean = 0;
	double deviation = 0;
	standard_deviation(measures,mean,deviation);

	// Mean and deviation are throughputs in MElem/s
	report_sparsegrid_funcs.graphs.put(base + ".Minsert.mean",mean);
	report_sparsegrid_funcs.graphs.put(base +".Minsert.dev",deviation);

	std::cout << "Test: " << testName << "\n";
	std::cout << "Block: " << blockEdgeSize << "x" << blockEdgeSize << "\n";
	std::cout << "Grid: " << gridEdgeSize*blockEdgeSize << "x" << gridEdgeSize*blockEdgeSize << "\n";

	// The occupancy is measured after copying the grid back with deviceToHost()
	double dataOccupancyMean, dataOccupancyDev;
	sparseGrid.deviceToHost();
	sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);
	std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
	report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
	report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);

	std::cout << "Iterations: " << iterations << "\n";
	// The printed values are throughputs, so the unit is MElem/s (the label used to say " s")
	std::cout << "\tInsert: " << mean << " dev: " << deviation << " MElem/s" << std::endl;
	std::cout << "Throughput:\n\t" << mean << " MElem/s\n";
}
BOOST_AUTO_TEST_SUITE(performance)

BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)

// Blocked insert, block edge 2, grid edge from 128 to 2048 blocks
BOOST_AUTO_TEST_CASE(testInsertBlocked_gridScaling_2)
{
	const std::string testURI = suiteURI + ".device.insert.dense.block.2D.2.gridScaling";
	unsigned int runIdx = 0;

	test_insert_block<2,128>(testURI, runIdx++);
	test_insert_block<2,256>(testURI, runIdx++);
	test_insert_block<2,512>(testURI, runIdx++);
	test_insert_block<2,1024>(testURI, runIdx++);
	test_insert_block<2,2048>(testURI, runIdx++);
	// The 4096-block grid is left disabled
	// test_insert_block<2,4096>(testURI, runIdx++);

	// Register the URI in the set of executed tests
	testSet.insert(testURI);
}

// Blocked insert, block edge 4, grid edge from 64 to 2048 blocks
BOOST_AUTO_TEST_CASE(testInsertBlocked_gridScaling_4)
{
	const std::string testURI = suiteURI + ".device.insert.dense.block.2D.4.gridScaling";
	unsigned int runIdx = 0;

	test_insert_block<4,64>(testURI, runIdx++);
	test_insert_block<4,128>(testURI, runIdx++);
	test_insert_block<4,256>(testURI, runIdx++);
	test_insert_block<4,512>(testURI, runIdx++);
	test_insert_block<4,1024>(testURI, runIdx++);
	test_insert_block<4,2048>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Blocked insert, block edge 8, grid edge from 32 to 1024 blocks
BOOST_AUTO_TEST_CASE(testInsertBlocked_gridScaling_8)
{
	const std::string testURI = suiteURI + ".device.insert.dense.block.2D.8.gridScaling";
	unsigned int runIdx = 0;

	test_insert_block<8,32>(testURI, runIdx++);
	test_insert_block<8,64>(testURI, runIdx++);
	test_insert_block<8,128>(testURI, runIdx++);
	test_insert_block<8,256>(testURI, runIdx++);
	test_insert_block<8,512>(testURI, runIdx++);
	test_insert_block<8,1024>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Blocked insert, block edge 16, grid edge from 16 to 512 blocks
BOOST_AUTO_TEST_CASE(testInsertBlocked_gridScaling_16)
{
	const std::string testURI = suiteURI + ".device.insert.dense.block.2D.16.gridScaling";
	unsigned int runIdx = 0;

	test_insert_block<16,16>(testURI, runIdx++);
	test_insert_block<16,32>(testURI, runIdx++);
	test_insert_block<16,64>(testURI, runIdx++);
	test_insert_block<16,128>(testURI, runIdx++);
	test_insert_block<16,256>(testURI, runIdx++);
	test_insert_block<16,512>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Blocked insert, block edge 32, grid edge from 8 to 256 blocks
BOOST_AUTO_TEST_CASE(testInsertBlocked_gridScaling_32)
{
	const std::string testURI = suiteURI + ".device.insert.dense.block.2D.32.gridScaling";
	unsigned int runIdx = 0;

	test_insert_block<32,8>(testURI, runIdx++);
	test_insert_block<32,16>(testURI, runIdx++);
	test_insert_block<32,32>(testURI, runIdx++);
	test_insert_block<32,64>(testURI, runIdx++);
	test_insert_block<32,128>(testURI, runIdx++);
	test_insert_block<32,256>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Block edge doubles while the grid edge halves: every run covers 4096x4096 points
BOOST_AUTO_TEST_CASE(testInsertBlocked_blockScaling)
{
	const std::string testURI = suiteURI + ".device.insert.dense.block.2D.blockScaling";
	unsigned int runIdx = 0;

	test_insert_block<2,2048>(testURI, runIdx++);
	test_insert_block<4,1024>(testURI, runIdx++);
	test_insert_block<8,512>(testURI, runIdx++);
	test_insert_block<16,256>(testURI, runIdx++);
	test_insert_block<32,128>(testURI, runIdx++);

	testSet.insert(testURI);
}

BOOST_AUTO_TEST_SUITE_END()

BOOST_AUTO_TEST_SUITE_END()
/*
* SparseGridGpu_performance_insert_single.cu
*
* Created on: Sep 10, 2019
* Author: i-bird
*/
#define SCAN_WITH_CUB
#define BOOST_TEST_DYN_LINK
#define OPENFPM_DATA_ENABLE_IO_MODULE
#define DISABLE_MPI_WRITTERS
#include <boost/test/unit_test.hpp>
#include "performancePlots.hpp"
#include <iostream>
#include "SparseGridGpu/SparseGridGpu.hpp"
#include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"
extern std::string suiteURI;
extern report_sparse_grid_tests report_sparsegrid_funcs;
extern std::set<std::string> testSet;
template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
void testInsertSingle(std::string testURI, unsigned int i)
{
auto testName = "Insert single (one chunk per element)";
constexpr unsigned int dim = 2;
// constexpr unsigned int blockEdgeSize = 8;
constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
typedef aggregate<float> AggregateT;
unsigned int iterations = 10;
bool prePopulateGrid = true;
// std::string base("performance.SparseGridGpu(" + std::to_string(i) + ").insertSingle");
std::string base(testURI + "(" + std::to_string(i) + ")");
report_sparsegrid_funcs.graphs.put(base + ".test.name","InsertSingle");
report_sparsegrid_funcs.graphs.put(base + ".dim",dim);
report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*blockEdgeSize);
report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*blockEdgeSize);
dim3 gridSize(gridEdgeSize, gridEdgeSize);
dim3 blockSize(blockEdgeSize, blockEdgeSize);
grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize> sparseGrid(blockGeometry);
mgpu::ofp_context_t ctx;
sparseGrid.template setBackgroundValue<0>(0);
if (prePopulateGrid)
{
// Pre-populate grid
sparseGrid.setGPUInsertBuffer(gridSize, blockSize);
insertValues2D<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), 0, 0);
sparseGrid.template flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
cudaDeviceSynchronize();
///
}
for (unsigned int iter=0; iter<5; ++iter)
{
auto offset = 0;
sparseGrid.setGPUInsertBuffer(gridSize, blockSize);
insertValues2D<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), offset, offset);
sparseGrid.template flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
cudaDeviceSynchronize();
}
unsigned long long numElements = gridEdgeSize*blockEdgeSize*gridEdgeSize*blockEdgeSize;
openfpm::vector<double> measures;
for (unsigned int iter=0; iter<iterations; ++iter)
{
auto offset = 0;
cudaDeviceSynchronize();
timer ts;
ts.start();
sparseGrid.setGPUInsertBuffer(gridSize, blockSize);
insertValues2D<0> << < gridSize, blockSize >> > (sparseGrid.toKernel(), offset, offset);
sparseGrid.template flush < smax_ < 0 >> (ctx, flush_type::FLUSH_ON_DEVICE);
cudaDeviceSynchronize();
ts.stop();
float mElemS = numElements / (1e6 * ts.getwct());
measures.add(mElemS);
}
double mean = 0;
double deviation = 0;
standard_deviation(measures,mean,deviation);
report_sparsegrid_funcs.graphs.put(base + ".Minsert.mean",mean);
report_sparsegrid_funcs.graphs.put(base +".Minsert.dev",deviation);
// All times above are in ms
std::cout << "Test: " << testName << "\n";
std::cout << "Block: " << blockEdgeSize << "x" << blockEdgeSize << "\n";
std::cout << "Grid: " << gridEdgeSize*blockEdgeSize << "x" << gridEdgeSize*blockEdgeSize << "\n";
double dataOccupancyMean, dataOccupancyDev;
sparseGrid.deviceToHost();
sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);
std::cout << "Iterations: " << iterations << "\n";
std::cout << "Throughput:\n\t" << mean << "M/s" << "\n";
}
BOOST_AUTO_TEST_SUITE(performance)

BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)

// Per-element insert, block edge 2, grid edge from 128 to 1024 blocks
BOOST_AUTO_TEST_CASE(testInsert_gridScaling_2)
{
	const std::string testURI = suiteURI + ".device.insert.dense.single.2D.2.gridScaling";
	unsigned int runIdx = 0;

	testInsertSingle<2, 128>(testURI, runIdx++);
	testInsertSingle<2, 256>(testURI, runIdx++);
	testInsertSingle<2, 512>(testURI, runIdx++);
	testInsertSingle<2, 1024>(testURI, runIdx++);

	// Register the URI in the set of executed tests
	testSet.insert(testURI);
}

// Per-element insert, block edge 4, grid edge from 64 to 512 blocks
BOOST_AUTO_TEST_CASE(testInsert_gridScaling_4)
{
	const std::string testURI = suiteURI + ".device.insert.dense.single.2D.4.gridScaling";
	unsigned int runIdx = 0;

	testInsertSingle<4, 64>(testURI, runIdx++);
	testInsertSingle<4, 128>(testURI, runIdx++);
	testInsertSingle<4, 256>(testURI, runIdx++);
	testInsertSingle<4, 512>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Per-element insert, block edge 8, grid edge from 32 to 256 blocks
BOOST_AUTO_TEST_CASE(testInsert_gridScaling_8)
{
	const std::string testURI = suiteURI + ".device.insert.dense.single.2D.8.gridScaling";
	unsigned int runIdx = 0;

	testInsertSingle<8, 32>(testURI, runIdx++);
	testInsertSingle<8, 64>(testURI, runIdx++);
	testInsertSingle<8, 128>(testURI, runIdx++);
	testInsertSingle<8, 256>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Block edge doubles while the grid edge halves: every run covers 2048x2048 points
BOOST_AUTO_TEST_CASE(testInsert_blockScaling)
{
	const std::string testURI = suiteURI + ".device.insert.dense.single.2D.blockScaling";
	unsigned int runIdx = 0;

	testInsertSingle<2, 1024>(testURI, runIdx++);
	testInsertSingle<4, 512>(testURI, runIdx++);
	testInsertSingle<8, 256>(testURI, runIdx++);

	testSet.insert(testURI);
}

BOOST_AUTO_TEST_SUITE_END()

BOOST_AUTO_TEST_SUITE_END()
/*
* SparseGridGpu_performance_insert_stencil.cu
*
* Created on: Sep 10, 2019
* Author: i-bird
*/
#define SCAN_WITH_CUB
#define BOOST_TEST_DYN_LINK
#define OPENFPM_DATA_ENABLE_IO_MODULE
#define DISABLE_MPI_WRITTERS
#include <boost/test/unit_test.hpp>
#include "performancePlots.hpp"
#include <iostream>
#include "SparseGridGpu/SparseGridGpu.hpp"
#include "SparseGridGpu/tests/utils/SparseGridGpu_util_test.cuh"
extern std::string suiteURI;
extern report_sparse_grid_tests report_sparsegrid_funcs;
extern std::set<std::string> testSet;
/*! \brief Benchmark applyStencils in STENCIL_MODE_INSERT with a HeatStencil on a SparseGridGpu
 *
 * The grid is initialised with insertConstantValue, a single value of 100 is inserted at the
 * centre point with insertOneValue, and findNeighbours() is called. After 5 untimed warm-up
 * rounds (applyStencils + flush), applyStencils is timed `iterations` times. Mean and standard
 * deviation of the throughput in GElem/s and in GFlops/s (GElem/s times StencilT::flops), and
 * the block occupancy, are stored in report_sparsegrid_funcs.graphs and echoed on stdout.
 *
 * \tparam blockEdgeSize edge of one block in points (also used as CUDA block edge)
 * \tparam gridEdgeSize number of blocks along each axis (also used as CUDA grid edge)
 *
 * \param testURI prefix of the report keys written by this run
 * \param i index of the run; keys are written under testURI + "(" + i + ")"
 *
 */
template<unsigned int blockEdgeSize, unsigned int gridEdgeSize>
void testInsertStencil(std::string testURI, unsigned int i)
{
	auto testName = "Insert stencil";
	constexpr unsigned int dim = 2;
	constexpr unsigned int chunkSize = IntPow<blockEdgeSize,dim>::value;
	typedef aggregate<float> AggregateT;
	typedef HeatStencil<dim,0,0> StencilT;

	const unsigned int iterations = 10;
	const unsigned int warmupRounds = 5;

	// Describe the run in the report
	const std::string base(testURI + "(" + std::to_string(i) + ")");
	report_sparsegrid_funcs.graphs.put(base + ".test.name","StencilInsertN");
	report_sparsegrid_funcs.graphs.put(base + ".dim",dim);
	report_sparsegrid_funcs.graphs.put(base + ".blockSize",blockEdgeSize);
	report_sparsegrid_funcs.graphs.put(base + ".gridSize.x",gridEdgeSize*blockEdgeSize);
	report_sparsegrid_funcs.graphs.put(base + ".gridSize.y",gridEdgeSize*blockEdgeSize);

	dim3 gridSize(gridEdgeSize, gridEdgeSize);
	dim3 blockSize(blockEdgeSize, blockEdgeSize);
	grid_smb<dim, blockEdgeSize> blockGeometry(gridSize);
	SparseGridGpu<dim, AggregateT, blockEdgeSize, chunkSize> sparseGrid(blockGeometry);
	mgpu::ofp_context_t ctx;
	sparseGrid.template setBackgroundValue<0>(0);

	// Initialize the grid
	sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
	CUDA_LAUNCH_DIM3((insertConstantValue<0>),gridSize, blockSize,sparseGrid.toKernel(), 0);
	sparseGrid.template flush<sRight_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);

	// Insert the value 100 at the centre point of the grid
	sparseGrid.setGPUInsertBuffer(gridSize, dim3(1));
	dim3 sourcePt(gridSize.x * blockEdgeSize / 2, gridSize.y * blockEdgeSize / 2, 0);
	insertOneValue<0> <<<gridSize, blockSize>>> (sparseGrid.toKernel(), sourcePt, 100);
	sparseGrid.template flush<sRight_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);

	sparseGrid.findNeighbours(); // Pre-compute the neighbours pos for each block!

	// Widen before multiplying: a product of four unsigned ints would be evaluated
	// (and could wrap around) in 32 bits before being stored
	const unsigned long long edgePoints = static_cast<unsigned long long>(gridEdgeSize) * blockEdgeSize;
	const unsigned long long numElements = edgePoints * edgePoints;

	// Warmup
	for (unsigned int iter = 0; iter < warmupRounds; ++iter)
	{
		sparseGrid.template applyStencils<StencilT>(STENCIL_MODE_INSERT, 0.1);
		sparseGrid.template flush<smax_<0>>(ctx, flush_type::FLUSH_ON_DEVICE);
	}

	// Throughput of each iteration, in GElem/s and GFlops/s
	openfpm::vector<double> gElemSMeasures;
	openfpm::vector<double> gFlopsSMeasures;

	for (unsigned int iter = 0; iter < iterations; ++iter)
	{
		// Synchronize BEFORE starting the timer, so that GPU work still pending
		// from the previous step is not charged to this measurement
		cudaDeviceSynchronize();

		timer ts;
		ts.start();

		sparseGrid.template applyStencils<StencilT>(STENCIL_MODE_INSERT, 0.1);
		cudaDeviceSynchronize();

		ts.stop();

		// Kept in double: the values go into vector<double>, narrowing to float only loses precision
		const double gElemS = numElements / (1e9 * ts.getwct());
		const double gFlopsS = gElemS * StencilT::flops;

		gElemSMeasures.add(gElemS);
		gFlopsSMeasures.add(gFlopsS);
	}

	double elemMean=0, elemDeviation=0;
	standard_deviation(gElemSMeasures, elemMean, elemDeviation);
	report_sparsegrid_funcs.graphs.put(base + ".GElems.mean",elemMean);
	report_sparsegrid_funcs.graphs.put(base +".GElems.dev",elemDeviation);

	double flopsMean=0, flopsDeviation=0;
	standard_deviation(gFlopsSMeasures, flopsMean, flopsDeviation);
	report_sparsegrid_funcs.graphs.put(base + ".GFlops.mean",flopsMean);
	report_sparsegrid_funcs.graphs.put(base +".GFlops.dev",flopsDeviation);

	std::cout << "Test: " << testName << "\n";
	std::cout << "Block: " << blockEdgeSize << "x" << blockEdgeSize << "\n";
	std::cout << "Grid: " << gridEdgeSize*blockEdgeSize << "x" << gridEdgeSize*blockEdgeSize << "\n";

	// The occupancy is measured after copying the grid back with deviceToHost()
	double dataOccupancyMean, dataOccupancyDev;
	sparseGrid.deviceToHost();
	sparseGrid.measureBlockOccupancy(dataOccupancyMean, dataOccupancyDev);
	std::cout << "Data Occupancy: " << dataOccupancyMean << " dev:" << dataOccupancyDev << std::endl;
	report_sparsegrid_funcs.graphs.put(base + ".dataOccupancy.mean",dataOccupancyMean);
	report_sparsegrid_funcs.graphs.put(base +".dataOccupancy.dev",dataOccupancyDev);

	std::cout << "Iterations: " << iterations << "\n";
	std::cout << "Throughput:\n\t" << elemMean << " GElem/s dev: " << elemDeviation << " GElem/s" << std::endl
	          << "\t" << flopsMean << " GFlops/s dev: " << flopsDeviation << " GFlops/s" << std::endl;
}
BOOST_AUTO_TEST_SUITE(performance)

BOOST_AUTO_TEST_SUITE(SparseGridGpu_test)

// Stencil insert, block edge 8, grid edge from 64 to 1024 blocks.
// Runs the same configurations as testStencilHeatInsert_gridScaling_8, under a different URI
BOOST_AUTO_TEST_CASE(testStencilHeatInsert_gridScaling)
{
	const std::string testURI = suiteURI + ".device.stencilInsert.dense.N.2D.gridScaling";
	unsigned int runIdx = 0;

	testInsertStencil<8, 64>(testURI, runIdx++);
	testInsertStencil<8, 128>(testURI, runIdx++);
	testInsertStencil<8, 256>(testURI, runIdx++);
	testInsertStencil<8, 512>(testURI, runIdx++);
	testInsertStencil<8, 1024>(testURI, runIdx++);

	// Register the URI in the set of executed tests
	testSet.insert(testURI);
}

// Stencil insert, block edge 8, grid edge from 64 to 1024 blocks
BOOST_AUTO_TEST_CASE(testStencilHeatInsert_gridScaling_8)
{
	const std::string testURI = suiteURI + ".device.stencilInsert.dense.N.2D.8.gridScaling";
	unsigned int runIdx = 0;

	testInsertStencil<8, 64>(testURI, runIdx++);
	testInsertStencil<8, 128>(testURI, runIdx++);
	testInsertStencil<8, 256>(testURI, runIdx++);
	testInsertStencil<8, 512>(testURI, runIdx++);
	testInsertStencil<8, 1024>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Stencil insert, block edge 16, grid edge from 32 to 512 blocks
BOOST_AUTO_TEST_CASE(testStencilHeatInsert_gridScaling_16)
{
	const std::string testURI = suiteURI + ".device.stencilInsert.dense.N.2D.16.gridScaling";
	unsigned int runIdx = 0;

	testInsertStencil<16, 32>(testURI, runIdx++);
	testInsertStencil<16, 64>(testURI, runIdx++);
	testInsertStencil<16, 128>(testURI, runIdx++);
	testInsertStencil<16, 256>(testURI, runIdx++);
	testInsertStencil<16, 512>(testURI, runIdx++);

	testSet.insert(testURI);
}

// Block edge doubles while the grid edge halves: every run covers 4096x4096 points
BOOST_AUTO_TEST_CASE(testStencilHeatInsert_blockScaling)
{
	const std::string testURI = suiteURI + ".device.stencilInsert.dense.N.2D.blockScaling";
	unsigned int runIdx = 0;

	testInsertStencil<4, 1024>(testURI, runIdx++);
	testInsertStencil<8, 512>(testURI, runIdx++);
	testInsertStencil<16, 256>(testURI, runIdx++);
	testInsertStencil<32, 128>(testURI, runIdx++);

	testSet.insert(testURI);
}

BOOST_AUTO_TEST_SUITE_END()

BOOST_AUTO_TEST_SUITE_END()
This diff is collapsed.
This diff is collapsed.
......@@ -10,6 +10,8 @@
#include <boost/lexical_cast.hpp>
#include <boost/filesystem.hpp>
#include <boost/property_tree/ptree.hpp>
#include <boost/property_tree/xml_parser.hpp>
static void addUpdtateTime(GoogleChart & cg, int np)
{
......