Commit f281e69b authored by incardon's avatar incardon

Included vcluster log system

parent 452e41e6
......@@ -25,6 +25,25 @@ AC_PROG_CXX
AC_CANONICAL_HOST
###### Check for verbose
# --enable-verbose defines the VERBOSE_TEST preprocessor symbol; the sources
# test it with #ifdef. Any value other than "yes" leaves it undefined.
AC_MSG_CHECKING([whether to build with verbose print])
verbose=no
AC_ARG_ENABLE([verbose],
    [AS_HELP_STRING([--enable-verbose],
                    [enable verbose print (def=no)])],
    [verbose="$enableval"])
AC_MSG_RESULT([$verbose])
if test x"$verbose" = x"yes"; then
    AC_DEFINE([VERBOSE_TEST],[],[Verbose test])
fi
###### Check for debug compilation
AC_MSG_CHECKING(whether to build with debug information)
......
......@@ -191,7 +191,7 @@ CUDA_CFLAGS = -I -I -I/usr/local/cuda-5.5/include
CUDA_LIBS = -L -L -L/usr/local/cuda-5.5/lib64 -lcuda -lcudart
CXX = mpic++
CXXDEPMODE = depmode=gcc3
CXXFLAGS = --std=c++11 -march=native -mtune=native -Wno-unused-local-typedefs -Wextra -Wno-unused-parameter -g3 -Wall -O0
CXXFLAGS = --std=c++11 -march=native -mtune=native -Wno-unused-local-typedefs -Wextra -Wno-unused-parameter -Wall -O3 -g3 -funroll-loops
CYGPATH_W = echo
DEFAULT_LIB = -lrt
DEFS = -DHAVE_CONFIG_H
......@@ -213,7 +213,7 @@ LTLIBOBJS =
MAKEINFO = ${SHELL} /home/i-bird/Desktop/MOSAIC/OpenFPM_project/OpenFPM_vcluster/missing makeinfo
MKDIR_P = /usr/bin/mkdir -p
NVCC = /usr/local/cuda-5.5/bin/nvcc
NVCCFLAGS = -g -O0
NVCCFLAGS = -O3
NVCC_EXIST = yes
OBJEXT = o
PACKAGE = full-package-name
......
......@@ -14,6 +14,11 @@ void init_global_v_cluster(int *argc, char ***argv)
global_v_cluster = new Vcluster(argc,argv);
}
/*! \brief Destroy the global Vcluster instance
 *
 * Deletes the object pointed to by global_v_cluster and resets the pointer.
 * Resetting it means a repeated call is a no-op (delete on a null pointer
 * does nothing) instead of a double delete on a dangling pointer.
 *
 */
void delete_global_v_cluster()
{
	delete global_v_cluster;
	global_v_cluster = nullptr;
}
// Global MPI initialization
bool global_mpi_init = false;
......
......@@ -16,6 +16,7 @@
#include "util/check_no_pointers.hpp"
#include "util.hpp"
#endif
#include "util/Vcluster_log.hpp"
#define MSG_LENGTH 1024
#define MSG_SEND_RECV 1025
......@@ -75,6 +76,8 @@ class exec_exception: public std::exception
class Vcluster
{
Vcluster_log log;
//! NBX has a potential pitfall that must be addressed
//! NBX sends all the messages and then probes for incoming messages
//! If there is an incoming message it receive it producing
......@@ -172,6 +175,9 @@ public:
{
map_scatter.get(i) = 1;
}
// open the log file
log.openLog(rank);
}
#ifdef DEBUG
......@@ -494,6 +500,7 @@ public:
{
req.add();
MPI_SAFE_CALL(MPI_Issend(ptr[i], sz[i], MPI_BYTE, prc[i], SEND_SPARSE + NBX_cnt, MPI_COMM_WORLD,&req.last()));
log.logSend(prc[i]);
}
}
......@@ -503,9 +510,12 @@ public:
bool reached_bar_req = false;
MPI_Request bar_req;
log.start(10);
// Wait that all the send are acknowledge
do
{
// flag that notify that this processor reach the barrier
// Barrier request
......@@ -523,6 +533,9 @@ public:
// Get the pointer to receive the message
void * ptr = msg_alloc(msize,0,0,stat_t.MPI_SOURCE,rid,ptr_arg);
// Log the receiving request
log.logRecv(stat_t);
rid++;
MPI_SAFE_CALL(MPI_Recv(ptr,msize,MPI_BYTE,stat_t.MPI_SOURCE,SEND_SPARSE+NBX_cnt,MPI_COMM_WORLD,&stat_t));
......@@ -545,13 +558,18 @@ public:
// Check if all processor reach the async barrier
if (reached_bar_req)
MPI_Test(&bar_req,&flag,MPI_STATUSES_IGNORE);
{MPI_SAFE_CALL(MPI_Test(&bar_req,&flag,MPI_STATUSES_IGNORE))};
// produce a report if communication get stuck
log.NBXreport(NBX_cnt,req);
} while (flag == false);
// Remove the executed request
req.clear();
stat.clear();
log.clear();
// Circular counter
NBX_cnt = (NBX_cnt + 1) % 1024;
......
......@@ -171,6 +171,7 @@ template<unsigned int ip> void test()
#ifdef VERBOSE_TEST
t.stop();
double clk = t.getwct();
double clk_max = clk;
......
......@@ -18,6 +18,13 @@
BOOST_AUTO_TEST_SUITE( VCluster_test )
BOOST_AUTO_TEST_CASE (Vcluster_robustness)
{
	// Run execute() on the global cluster object without doing anything else first.
	// NOTE(review): presumably checks that execute() copes with an empty request queue - confirm
	Vcluster & cluster = *global_v_cluster;

	cluster.execute();
}
BOOST_AUTO_TEST_CASE( VCluster_use_reductions)
{
Vcluster & vcl = *global_v_cluster;
......
/*
* Vcluster_log.hpp
*
* Created on: Jul 11, 2015
* Author: Pietro Incardona
*/
#ifndef VCLUSTER_LOG_HPP_
#define VCLUSTER_LOG_HPP_
#include <fstream>
#include <iostream>
#include <sstream>
#include "timer.hpp"
#ifdef VERBOSE_TEST
/*! \brief Vcluster log
 *
 * Records the sends and receives logged during a communication round and,
 * once the round has been running for at least "log_delay" seconds, writes
 * a report of their status to a per-processor log file.
 *
 */
class Vcluster_log
{
	// timer started by start() and restarted after every report
	timer t;

	// delay of the log (seconds to wait before a report is written);
	// initialized so it is never read uninitialized if start() was not called
	size_t log_delay = 0;

	// processor id given to openLog()
	size_t rank = 0;

	// Receive status vector
	openfpm::vector<MPI_Status> r_log;

	// Send processors vector
	openfpm::vector<size_t> s_log;

	// log file
	std::ofstream f;

public:

	/*! \brief Start to count the seconds
	 *
	 * The log report is generated only after "log_delay" seconds
	 *
	 * \param log_delay number of seconds to wait before NBXreport writes a report
	 *
	 */
	void start(size_t log_delay)
	{
		this->log_delay = log_delay;
		t.start();
	}

	/*! \brief Create the log file
	 *
	 * The file is named "vcluster_log_" followed by the processor id
	 *
	 * \param rank processor id
	 *
	 */
	void openLog(size_t rank)
	{
		this->rank = rank;

		std::stringstream str;
		str << "vcluster_log_" << rank;
		f.open(str.str());
	}

	/*! \brief Record the status of a received message
	 *
	 * \param stat status of the incoming message (a copy is stored)
	 *
	 */
	void logRecv(MPI_Status & stat)
	{
		r_log.add(stat);
	}

	/*! \brief Record the destination of a send
	 *
	 * Sends must be logged in the same order as the requests passed to NBXreport
	 *
	 * \param prc destination processor
	 *
	 */
	void logSend(size_t prc)
	{
		s_log.add(prc);
	}

	/*! \brief This function write a report for the NBX communication strategy
	 *
	 * Nothing is written until "log_delay" seconds have elapsed since start()
	 * or since the previous report.
	 *
	 * \param nbx NBX counter, printed in the report
	 * \param req send requests, expected to match one-to-one the logged sends
	 *
	 */
	void NBXreport(size_t nbx, openfpm::vector<MPI_Request> & req)
	{
		// req and s_log must match
		if (req.size() != s_log.size())
			std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " req.size() != s_log.size() " << req.size() << "!=" << s_log.size() << "\n" ;

		// if it is waiting for at least log_delay seconds
		// Write a deadlock status report
		if (t.getwct() >= log_delay)
		{
			f << "=============================== NBX ==================================\n";

			f << "NBX counter: " << nbx << "\n";
			f << "\n";

			// Print the send requests and their status
			for (size_t i = 0 ; i < req.size() ; i++)
			{
				int flag = 0;
				MPI_Status stat;
				MPI_SAFE_CALL(MPI_Request_get_status(req.get(i),&flag,&stat));

				// Only the completion flag is reported: stat is left untouched
				// while the request is pending, and its tag field is not
				// defined for send requests, so printing it would emit garbage
				f << "Send to: ";

				// never index s_log out of range if the two vectors mismatch
				if (i < s_log.size())
					f << s_log.get(i);
				else
					f << "unknown";

				f << (flag ? " completed" : " pending") << "\n";
			}

			f << "\n";

			// Print the receive request and their status
			for (size_t j = 0 ; j < r_log.size() ; j++)
			{
				f << "Received from: " << r_log.get(j).MPI_SOURCE << " with tag " << r_log.get(j).MPI_TAG << "\n";
			}

			f << "======================================================================\n";
			f.flush();

			// restart the count, the next report comes after another log_delay seconds
			t.reset();
		}
	}

	/*! \brief Clear all the logged status
	 *
	 * Removes every logged send and receive
	 *
	 */
	void clear()
	{
		r_log.clear();
		s_log.clear();
	}
};
#else
/*! \brief Vcluster log
 *
 * Stub object, it does nothing
 *
 * It must expose every member function of the VERBOSE_TEST version,
 * because the callers use the log unconditionally.
 *
 */
class Vcluster_log
{
public:
	inline void start(size_t log_delay) {}
	inline void openLog(size_t rank) {}
	// kept for backward compatibility, the logging version has no such member
	inline void logStatus(MPI_Status & stat) {}
	inline void logRecv(MPI_Status & stat) {}
	inline void logSend(size_t prc) {}
	inline void NBXreport(size_t nbx, openfpm::vector<MPI_Request> & req) {}
	inline void clear() {}
};
#endif
#endif /* VCLUSTER_LOG_HPP_ */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment