Skip to content
Snippets Groups Projects
Commit ef72e544 authored by Pietro Incardona's avatar Pietro Incardona
Browse files

Separate read and write

parent 75d61202
No related branches found
No related tags found
No related merge requests found
Pipeline #4034 passed
......@@ -5,7 +5,7 @@
//! Memory bandwidth with small calculations
template<typename vector_type, typename vector_type2>
inline __global__ void translate_fill_prop(vector_type vd_out, vector_type2 vd_in)
inline __global__ void translate_fill_prop_write(vector_type vd_out, vector_type2 vd_in)
{
auto p = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -27,12 +27,34 @@ inline __global__ void translate_fill_prop(vector_type vd_out, vector_type2 vd_i
}
template<typename vector_type, typename vector_type2>
inline __global__ void translate_fill_prop_read(vector_type vd_out, vector_type2 vd_in)
{
auto p = blockIdx.x * blockDim.x + threadIdx.x;
float a = vd_out.template get<0>(p);
float b = vd_out.template get<1>(p)[0];
float c = vd_out.template get<1>(p)[1];
float d = vd_out.template get<2>(p)[0][0];
float e = vd_out.template get<2>(p)[0][1];
float f = vd_out.template get<2>(p)[1][0];
float g = vd_out.template get<2>(p)[1][1];
float h = vd_in.template get<0>(p)[0];
float i = vd_in.template get<0>(p)[1];
vd_in.template get<0>(p)[0] += a+b+c+d;
vd_in.template get<0>(p)[1] += e+f+g+h+i;
}
int main(int argc, char *argv[])
{
init_wrappers();
openfpm::vector_gpu<aggregate<double,double[2],double[2][2]>> out;
openfpm::vector_gpu<aggregate<double[2]>> in;
openfpm::vector_gpu<aggregate<float,float[2],float[2][2]>> out;
openfpm::vector_gpu<aggregate<float[2]>> in;
int nele = 16777216;
......@@ -52,29 +74,54 @@ int main(int argc, char *argv[])
for (int i = 0 ; i < 101 ; i++)
{
cudaDeviceSynchronize();
timer t;
t.start();
cudaDeviceSynchronize();
timer t;
t.start();
CUDA_LAUNCH(translate_fill_prop_write,ite,out.toKernel(),in.toKernel());
CUDA_LAUNCH(translate_fill_prop,ite,out.toKernel(),in.toKernel());
cudaDeviceSynchronize();
t.stop();
if (i >=1)
{res.get(i-1) = nele*4*13 / t.getwct() * 1e-9;}
std::cout << "Time: " << t.getwct() << std::endl;
std::cout << "BW: " << nele*4*13 / t.getwct() * 1e-9 << " GB/s" << std::endl;
}
double mean_write = 0.0;
double dev_write = 0.0;
standard_deviation(res,mean_write,dev_write);
for (int i = 0 ; i < 101 ; i++)
{
cudaDeviceSynchronize();
timer t;
t.start();
CUDA_LAUNCH(translate_fill_prop_read,ite,out.toKernel(),in.toKernel());
cudaDeviceSynchronize();
t.stop();
t.stop();
if (i >=1)
{res.get(i-1) = nele*8*11 / t.getwct() * 1e-9;}
if (i >=1)
{res.get(i-1) = nele*4*13 / t.getwct() * 1e-9;}
std::cout << "Time: " << t.getwct() << std::endl;
std::cout << "BW: " << nele*8*11 / t.getwct() * 1e-9 << " GB/s" << std::endl;
std::cout << "Time: " << t.getwct() << std::endl;
std::cout << "BW: " << nele*4*13 / t.getwct() * 1e-9 << " GB/s" << std::endl;
}
double mean = 0.0;
double dev = 0.0;
standard_deviation(res,mean,dev);
double mean_read = 0.0;
double dev_read = 0.0;
standard_deviation(res,mean_read,dev_read);
std::cout << "Average: " << mean << " deviation: " << dev << std::endl;
std::cout << "Average READ: " << mean_read << " deviation: " << dev_read << std::endl;
std::cout << "Average WRITE: " << mean_write << " deviation: " << dev_write << std::endl;
}
#else
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment