Fixing CUDIFY

ef8e58f2 · Pietro Incardona · f7b843e7 · ef8e58f2 · ef8e58f2 · ccdd64f9
Commit ef8e58f2 authored 3 years ago by Pietro Incardona
--- a/example/Performance/memBW/Makefile
+++ b/example/Performance/memBW/Makefile
@@ -42,7 +42,7 @@ OBJ = main.o
 memBW:

 %.o: %.cu
-	$(CUDA_CC) -g -O3 $(CUDA_OPTIONS) $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)
+	$(CUDA_CC) -O3 $(CUDA_OPTIONS) $(OPT) -c --std=c++14 -o $@ $< $(INCLUDE_PATH_NVCC)

 %.o: %.cpp
 	$(CC) -g -O3 $(OPT) -g -c --std=c++14 -o $@ $< $(INCLUDE_PATH)

--- a/example/Performance/memBW/main.cu
+++ b/example/Performance/memBW/main.cu
@@ -53,18 +53,18 @@ __global__ void translate_fill_prop_write_array(float * vd_out_scal,
 {
 	auto p = blockIdx.x * blockDim.x + threadIdx.x;

-	float a = vd_in_vec[p* + 0*stride];
-	float b = vd_in_vec[p* + 1*stride];
+	float a = vd_in_vec[p + 0*stride];

-	vd_out_scal[p] = a + b;
+	vd_out_scal[p] = a;

 	vd_out_vec[p + 0*stride] = a;
-	vd_out_vec[p + 1*stride] = b;
+	vd_out_vec[p + 1*stride] = a;

 	vd_out_mat[p + 0*2*stride + 0*stride ] = a;
-	vd_out_mat[p + 0*2*stride + 1*stride ] = b;
-	vd_out_mat[p + 1*2*stride + 0*stride ] = a + b;
-	vd_out_mat[p + 1*2*stride + 1*stride ] = b - a;
+	vd_out_mat[p + 0*2*stride + 1*stride ] = a;
+	vd_out_mat[p + 1*2*stride + 0*stride ] = a;
+	vd_out_mat[p + 1*2*stride + 1*stride ] = a;
+	vd_in_vec[p + 1*stride] = a;
 }


@@ -184,6 +184,18 @@ int main(int argc, char *argv[])

    initialize_buf(in,out);

+
+for (int j = 0 ; j < 100 ; j++)
+{
+
+    for (int i = 0 ; i < 16777216; i++)
+    {
+	    out.get<2>(i)[1][0] = in.get<0>(i)[1];
+    }
+}
+
+    return 0;
+
    // Read write test with TLS

    auto ite = out.getGPUIterator(256);
@@ -344,6 +356,38 @@ int main(int argc, char *argv[])
    double dev_read_lamb = 0.0;
    standard_deviation(res,mean_read_lamb,dev_read_lamb);

+    // Array benchmark
+
+    for (int i = 0 ; i < 110 ; i++)
+    {
+        cudaDeviceSynchronize();
+        timer t;
+        t.start();
+
+	float * out_s = (float *)out.getDeviceBuffer<0>();
+	float * out_v = (float *)out.getDeviceBuffer<1>();
+	float * out_m = (float *)out.getDeviceBuffer<2>();
+	float * in_v = (float *)in.getDeviceBuffer<0>();
+
+        CUDA_LAUNCH(translate_fill_prop_write_array,ite,out_s,out_v,out_m,in_v,out.capacity());
+
+        cudaDeviceSynchronize();
+
+        t.stop();
+
+        if (i >=10)
+        {res.get(i-10) = nele*4*9 / t.getwct() * 1e-9;}
+
+        std::cout << "Time ARR: " << t.getwct() << std::endl;
+        std::cout << "BW ARR: " << nele*4*9 / t.getwct() * 1e-9 << " GB/s"  << std::endl;
+    }
+
+    double mean_write_arr = 0.0;
+    double dev_write_arr = 0.0;
+    standard_deviation(res,mean_write_arr,dev_write_arr);
+
+    ///////////////////
+
    #ifdef CUDIFY_USE_CUDA

    for (int i = 0 ; i < 110 ; i++)
@@ -382,6 +426,7 @@ int main(int argc, char *argv[])
    std::cout << "Average READ with lamb: " << mean_read_lamb << "  deviation: " << dev_read_lamb << std::endl;
    std::cout << "Average WRITE with lamb: " << mean_write_lamb << "  deviation: " << dev_write_lamb << std::endl;

+    std::cout << "Average WRITE with array: " << mean_write_arr << "  deviation: " << dev_write_arr << std::endl;
 }

 #else

--- a/openfpm_devices @ ccdd64f9
+++ b/openfpm_devices @ ccdd64f9
-Subproject commit f95b31b6d0af0a3c60a2643840854feba90abb1e
+Subproject commit ccdd64f9dee09116426e685ec44debfab255aa0a
--- a/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
+++ b/src/Grid/tests/sgrid_dist_id_gpu_unit_tests.cu
@@ -5,6 +5,7 @@
 #include <boost/test/unit_test.hpp>
 #include "Grid/grid_dist_id.hpp"

+
 BOOST_AUTO_TEST_SUITE( sgrid_gpu_test_suite )

 template<unsigned int p>