diff --git a/src/Grid/grid_sm.hpp b/src/Grid/grid_sm.hpp
index d2875f062df3df08f78da9c3c055413298a75c0c..a0588a177acc4edaedaa77916bb1f39c39ae0737 100755
--- a/src/Grid/grid_sm.hpp
+++ b/src/Grid/grid_sm.hpp
@@ -146,8 +146,8 @@ bool has_work_gpu(ite_gpu<dim> & ite)
 //! Declaration grid_sm
 template<unsigned int N, typename T> class grid_sm;
 
-template<unsigned int dim, typename T2, typename T>
-ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, size_t n_thr = default_kernel_wg_threads_);
+template<unsigned int dim, typename grid_sm_type, typename T>
+ite_gpu<dim> getGPUIterator_impl(const grid_sm_type & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, size_t n_thr = default_kernel_wg_threads_); // forward declaration carrying the default for n_thr; the grid type is a free template parameter (grid_sm_type), the definition appears later in this header
 
 //! Declaration print_warning_on_adjustment
 template <unsigned int dim, typename linearizer> class print_warning_on_adjustment;
@@ -834,8 +834,8 @@ public:
 };
 
 
-template<unsigned int dim, typename T2, typename T>
-ite_gpu<dim> getGPUIterator_impl(const grid_sm<dim,T2> & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, const size_t n_thr)
+template<unsigned int dim, typename grid_sm_type, typename T>
+ite_gpu<dim> getGPUIterator_impl(const grid_sm_type & g1, const grid_key_dx<dim,T> & key1, const grid_key_dx<dim,T> & key2, const size_t n_thr)
 {
 	size_t tot_work = 1;
 	for (size_t i = 0 ; i < dim ; i++)
diff --git a/src/NN/CellList/CellListIterator_test.hpp b/src/NN/CellList/CellListIterator_test.hpp
index 5e8266d0a98968f8f80ea02f5da1f264ffd19c7b..e20740ee174e8de948343a8d30c9b2ccc95d4f4e 100644
--- a/src/NN/CellList/CellListIterator_test.hpp
+++ b/src/NN/CellList/CellListIterator_test.hpp
@@ -12,6 +12,11 @@
 #include "NN/CellList/ParticleIt_Cells.hpp"
 #include "NN/CellList/ParticleItCRS_Cells.hpp"
 
+#ifdef OPENFPM_PDATA
+#include "VCluster/VCluster.hpp"
+#endif
+
+
 /*! \brief Fill the cell-list with particles in the box 0.0,1.0
  *
  * \param k Number of particles
@@ -96,6 +101,18 @@ BOOST_AUTO_TEST_CASE( celllist_lin_and_iterator_test )
 
 BOOST_AUTO_TEST_CASE( celllist_hilb_and_iterator_test )
 {
+#ifdef OPENFPM_PDATA
+
+	auto & v_cl = create_vcluster();
+
+	std::string c2 = std::string("openfpm_data/test_data/NN_hilb_keys");
+
+#else
+
+	std::string c2 = std::string("test_data/NN_hilb_keys");
+
+#endif
+
 	///////// INPUT DATA //////////
 
 	const size_t dim = 3;
@@ -145,7 +162,7 @@ BOOST_AUTO_TEST_CASE( celllist_hilb_and_iterator_test )
 
 	openfpm::vector<size_t> keys_old;
 
-	keys_old.load("test_data/NN_hilb_keys");
+	keys_old.load(c2);
 
 	for (size_t i = 0; i < keys_old.size(); i++)
 	{
diff --git a/src/Vector/vector_test_util.hpp b/src/Vector/vector_test_util.hpp
index 5a69757d1c2180ca377b7bc88d09f5cacbccf92e..6641954d64d9ad5c2324b39f71adb285d18b1308 100644
--- a/src/Vector/vector_test_util.hpp
+++ b/src/Vector/vector_test_util.hpp
@@ -27,7 +27,7 @@ typedef Point_test<float> P;
 
 #include "timer.hpp"
 
-std::vector<Point_orig<float>> allocate_stl()
+static std::vector<Point_orig<float>> allocate_stl()
 {
 	std::vector<Point_orig<float>> v_stl_test;
 
@@ -76,7 +76,7 @@ openfpm::vector<T> allocate_openfpm_primitive(size_t n, size_t fill)
 	return v;
 }
 
-openfpm::vector<Point_test<float>> allocate_openfpm_fill(size_t n, size_t fill)
+static openfpm::vector<Point_test<float>> allocate_openfpm_fill(size_t n, size_t fill)
 {
 	Point_test<float> pt;
 	openfpm::vector<Point_test<float>> v_send;
@@ -175,7 +175,7 @@ template<typename vector> vector allocate_openfpm(size_t n_ele)
 	return v_ofp_test;
 }
 
-openfpm::vector<Point_test_prp<float>> allocate_openfpm_prp(size_t n_ele)
+static openfpm::vector<Point_test_prp<float>> allocate_openfpm_prp(size_t n_ele)
 {
 	openfpm::vector<Point_test_prp<float>> v_ofp_test;
 
@@ -242,7 +242,7 @@ openfpm::vector<Point_test_prp<float>> allocate_openfpm_prp(size_t n_ele)
 }
 
 
-openfpm::vector< aggregate<float,float,float,float,float[3],float[3][3],openfpm::vector<int>> > allocate_openfpm_aggregate_with_complex(size_t n_ele)
+static openfpm::vector< aggregate<float,float,float,float,float[3],float[3][3],openfpm::vector<int>> > allocate_openfpm_aggregate_with_complex(size_t n_ele)
 {
 	//! [Create add and access]
 	openfpm::vector< aggregate<float,float,float,float,float[3],float[3][3],openfpm::vector<int>> > v_ofp_test;
diff --git a/src/config/config_cmake.h.in b/src/config/config_cmake.h.in
index 2c956fc29ec06f81a30e381865d2c5fc795e31ee..22ec292a80f2ca2c78713de8d6f26144550b2211 100644
--- a/src/config/config_cmake.h.in
+++ b/src/config/config_cmake.h.in
@@ -189,4 +189,4 @@ ${DEFINE_TEST_COVERAGE_MODE}
 /* Version number of package */
 #define VERSION "1.0.0"
 
-#define OPENFPM_PDATA
+#define OPENFPM_DATA
diff --git a/src/main.cpp b/src/main.cpp
index fd69236807133256957bcabaf2831de402ceb62f..c53fa51257b449c64b01abe824e8729f15877de3 100755
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -8,6 +8,8 @@
 
 #define DISABLE_MPI_WRITTERS
 
+#ifndef NO_INIT_AND_MAIN
+
 // initialization function:
 bool init_unit_test()
 {
@@ -23,6 +25,10 @@ int main(int argc, char* argv[])
 	return boost::unit_test::unit_test_main( &init_unit_test, argc, argv );
 }
 
+#include "unit_test_init_cleanup.hpp"
+
+#endif
+
 #include <boost/fusion/include/mpl.hpp>
 
 #include <iostream>
@@ -58,4 +64,4 @@ int main(int argc, char* argv[])
 #ifdef PERFORMANCE_TEST
 #include "performance.hpp"
 #endif
-#include "unit_test_init_cleanup.hpp"
+
diff --git a/src/unit_test_init_cleanup.hpp b/src/unit_test_init_cleanup.hpp
index 5157a1345f0aa8afbd74514aedb10d3f97395989..64b0c580b8443817abd07e58ce99c44f7a50c232 100644
--- a/src/unit_test_init_cleanup.hpp
+++ b/src/unit_test_init_cleanup.hpp
@@ -8,6 +8,8 @@
 #ifndef UNIT_TEST_INIT_CLEANUP_HPP_
 #define UNIT_TEST_INIT_CLEANUP_HPP_
 
+#include "util/cudify/cudify.hpp"
+
 //! boost unit test fixation (start procedure to call before testing)
 struct ut_start
 {
diff --git a/src/util/cuda/segreduce_ofp.cuh b/src/util/cuda/segreduce_ofp.cuh
index f4732fac998cdb903c80697194e257ccc81dd6e2..9a0c3764108e2e0e009f3d2e2a62fb470b7a20df 100644
--- a/src/util/cuda/segreduce_ofp.cuh
+++ b/src/util/cuda/segreduce_ofp.cuh
@@ -83,6 +83,7 @@ __global__ void realign_output(output_it out, out_tmp_type out_tmp, segs_type se
         for ( ; i < num_segments - 1; i++)
         {
             int j = segments[i];
+            output[i] = init;
             if (j == segments[i+1]) {continue;}
             output[i] = input[j];
             ++j;