2015-02-26 09:02:31 -06:00
|
|
|
|
2015-03-23 16:03:14 -05:00
|
|
|
#include <iostream>
|
2015-02-26 09:02:31 -06:00
|
|
|
#include <libgen.h>
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <sys/stat.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
#include <dlfcn.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <sstream>
|
|
|
|
#include <fstream>
|
|
|
|
#include <iomanip>
|
|
|
|
|
|
|
|
#ifdef _DMTCP
|
|
|
|
#include "dmtcpaware.h"
|
|
|
|
#endif
|
|
|
|
|
2015-06-01 10:28:29 -05:00
|
|
|
#include "trick/DMTCP.hh"
|
|
|
|
#include "trick/SimObject.hh"
|
|
|
|
#include "trick/exec_proto.h"
|
|
|
|
#include "trick/message_proto.h"
|
|
|
|
#include "trick/Executive.hh"
|
|
|
|
#include "trick/CheckPointRestart.hh"
|
2015-02-26 09:02:31 -06:00
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
|
|
|
Trick::DMTCP * the_dmtcp ;
|
|
|
|
|
|
|
|
Trick::DMTCP::DMTCP() {
|
|
|
|
|
|
|
|
int num_classes = 0 ;
|
|
|
|
|
|
|
|
checkpoint_now = false ;
|
|
|
|
dmtcp_ckpt_complete = 0;
|
|
|
|
|
|
|
|
class_map["dmtcp_pre"] = num_classes ;
|
|
|
|
class_to_queue[num_classes++] = &pre_queue ;
|
|
|
|
|
|
|
|
class_map["dmtcp_post"] = num_classes ;
|
|
|
|
class_to_queue[num_classes++] = &post_queue ;
|
|
|
|
|
|
|
|
class_map["dmtcp_restart"] = num_classes ;
|
|
|
|
class_to_queue[num_classes++] = &restart_queue ;
|
|
|
|
|
|
|
|
the_dmtcp = this ;
|
|
|
|
}
|
|
|
|
|
|
|
|
int Trick::DMTCP::init() {
|
|
|
|
|
|
|
|
#ifdef _DMTCP
|
|
|
|
if( dmtcpIsEnabled() )
|
|
|
|
dmtcpInstallHooks(dmtcp_pre_checkpoint, dmtcp_post_checkpoint, dmtcp_restart);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
call_dmtcp();
|
|
|
|
|
|
|
|
return 0 ;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int Trick::DMTCP::freeze() {
|
|
|
|
call_dmtcp();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int Trick::DMTCP::pre() {
|
|
|
|
JobData * curr_job ;
|
|
|
|
|
|
|
|
pre_queue.reset_curr_index() ;
|
|
|
|
|
|
|
|
while ( (curr_job = pre_queue.get_next_job()) != NULL ) {
|
|
|
|
curr_job->call() ;
|
|
|
|
}
|
|
|
|
return 0 ;
|
|
|
|
}
|
|
|
|
|
|
|
|
int Trick::DMTCP::post() {
|
|
|
|
JobData * curr_job ;
|
|
|
|
|
|
|
|
renameRestartScript();
|
|
|
|
|
|
|
|
post_queue.reset_curr_index() ;
|
2016-11-08 10:25:07 +01:00
|
|
|
|
2015-02-26 09:02:31 -06:00
|
|
|
while ( (curr_job = post_queue.get_next_job()) != NULL )
|
|
|
|
curr_job->call() ;
|
|
|
|
|
|
|
|
return 0 ;
|
|
|
|
}
|
|
|
|
|
|
|
|
int Trick::DMTCP::restart() {
|
|
|
|
JobData * curr_job ;
|
|
|
|
|
|
|
|
restart_queue.reset_curr_index() ;
|
|
|
|
|
|
|
|
while ( (curr_job = restart_queue.get_next_job()) != NULL )
|
|
|
|
curr_job->call() ;
|
|
|
|
|
|
|
|
while ( !dmtcp_checkpoint_jobs_queue.empty())
|
|
|
|
dmtcp_checkpoint_jobs_queue.pop();
|
|
|
|
|
|
|
|
return 0 ;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int Trick::DMTCP::call_dmtcp() {
|
|
|
|
|
|
|
|
if ( checkpoint_now == true ) {
|
|
|
|
|
|
|
|
dmtcp_ckpt_complete = 0;
|
|
|
|
sleep(1); // this sleep() is required to give flag
|
|
|
|
// enough time to refresh in Trick View
|
|
|
|
// it does not cause overruns
|
|
|
|
|
|
|
|
#ifdef _DMTCP
|
|
|
|
if( dmtcpIsEnabled() )
|
|
|
|
dmtcpRunCommand('c') ;
|
|
|
|
#endif
|
|
|
|
checkpoint_now = false ;
|
|
|
|
}
|
|
|
|
return 0 ;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Trick::DMTCP::dmtcpSetCheckpointNow() {
|
|
|
|
checkpoint_now = true ;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string Trick::DMTCP::splitFilename( const string& str ) {
|
|
|
|
size_t found;
|
|
|
|
std::string filename;
|
|
|
|
|
|
|
|
found=str.find_last_of("/\\");
|
|
|
|
filename = str.substr(found+1);
|
|
|
|
|
|
|
|
return filename;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Trick::DMTCP::isSpecialCharacter( const std::string& str) {
|
|
|
|
|
|
|
|
size_t found = str.find_first_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890_.-#%@=:^" );
|
|
|
|
|
|
|
|
if (found!=string::npos)
|
|
|
|
return true;
|
|
|
|
else
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Trick::DMTCP::dmtcpJobQueue( std::string file_name ) {
|
|
|
|
dmtcp_checkpoint_jobs_queue.push( file_name ) ;
|
|
|
|
|
|
|
|
if ( (dmtcp_checkpoint_jobs_queue.size() >= 2) and (strcmp(file_name.c_str(),"dmtcp_chkpnt_safestore") ) )
|
|
|
|
dmtcp_checkpoint_jobs_queue.pop() ;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
std::string Trick::DMTCP::getScriptName() {
|
|
|
|
std::string restart_script_name ;
|
|
|
|
#ifdef _DMTCP
|
|
|
|
std::stringstream default_script_name ;
|
|
|
|
|
|
|
|
// Default script name
|
|
|
|
default_script_name << "dmtcp_chkpnt_" << std::fixed << std::setprecision(2) << exec_get_sim_time() ;
|
|
|
|
|
|
|
|
// Script name from memory manager wrapper (input file or gui)
|
|
|
|
std::string mmw_script_name = dmtcp_checkpoint_jobs_queue.top();
|
|
|
|
|
|
|
|
// User specified a new directory
|
|
|
|
if ( strstr(mmw_script_name.c_str(),"/") != NULL ){
|
|
|
|
std::string stripped_name = splitFilename( mmw_script_name );
|
|
|
|
std::cout << "DMTCP checkpoints are written to the --ckptdir directory (defined in $TRICK_HOME/bin/s_define_dmtcp).\n";
|
|
|
|
mmw_script_name = stripped_name;
|
|
|
|
}
|
|
|
|
|
|
|
|
// User specifed a valid script name (or is using default script name)
|
|
|
|
if ( ( mmw_script_name == default_script_name.str() ) or ( !isSpecialCharacter( mmw_script_name ) ) )
|
|
|
|
restart_script_name = mmw_script_name;
|
|
|
|
|
2016-11-08 10:25:07 +01:00
|
|
|
// User specifed an invalid script name
|
2015-02-26 09:02:31 -06:00
|
|
|
else {
|
|
|
|
std::cout << mmw_script_name.c_str() << " is not a valid name. The default DMTCP script name will be used: " << default_script_name.str() << endl;
|
|
|
|
restart_script_name = default_script_name.str();
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
return restart_script_name;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Trick::DMTCP::dmtcpRenameCmd() {
|
|
|
|
#ifdef _DMTCP
|
|
|
|
std::stringstream dmtcp_rename_command;
|
|
|
|
std::stringstream no_env_dmtcp_rename_command;
|
|
|
|
std::stringstream trick_restart_script;
|
|
|
|
std::stringstream dmtcp_restart_script;
|
|
|
|
std::stringstream no_env_dmtcp_restart_script;
|
|
|
|
std::string mmw_filename = getScriptName();
|
|
|
|
|
|
|
|
const DmtcpLocalStatus * lsp = dmtcpGetLocalStatus() ;
|
|
|
|
|
|
|
|
if (lsp->genId) {
|
|
|
|
dmtcp_restart_script << lsp->env << "/dmtcp_restart_script_" << lsp->uniquePidStr << "_" << std::setfill('0') << std::setw(5) << lsp->genId << ".sh";
|
|
|
|
no_env_dmtcp_restart_script << "./dmtcp_restart_script_" << lsp->uniquePidStr << "_" << std::setfill('0') << std::setw(5) << lsp->genId << ".sh";
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
dmtcp_restart_script << lsp->env << "/dmtcp_restart_script_" << lsp->uniquePidStr << ".sh";
|
|
|
|
no_env_dmtcp_restart_script << "./dmtcp_restart_script_" << lsp->uniquePidStr << ".sh";
|
|
|
|
}
|
|
|
|
|
|
|
|
trick_restart_script << lsp->env << "/" << mmw_filename;
|
|
|
|
dmtcp_rename_command << "mv " << dmtcp_restart_script.str() << " " << trick_restart_script.str();
|
|
|
|
no_env_dmtcp_rename_command << "mv " << no_env_dmtcp_restart_script.str() << " " << trick_restart_script.str() << "; rm -f ./dmtcp_restart_script.sh";
|
|
|
|
|
|
|
|
ifstream dmtcp_file( dmtcp_restart_script.str().c_str() );
|
|
|
|
if ( dmtcp_file.good() ) {
|
|
|
|
dmtcpSystemCmd( dmtcp_rename_command.str() );
|
|
|
|
}
|
|
|
|
|
|
|
|
// There is a bug in DMTCP that causes the DMTCP Coordinator (see dmtcp::DmtcpCoordinator::writeRestartScript())
|
2016-11-08 10:25:07 +01:00
|
|
|
// to not recognize ENV_VAR_CHECKPOINT_DIR when dumping a checkpoint from a restarted checkpoint.
|
2015-02-26 09:02:31 -06:00
|
|
|
// As a result, DMTCP writes the restart script to the current directory.
|
|
|
|
// This code just checks to see if it's in the current dir, if so, move it to the ENV_VAR_CHECKPOINT_DIR with the new script name.
|
|
|
|
ifstream no_env_dmtcp_file( no_env_dmtcp_restart_script.str().c_str() );
|
|
|
|
if ( no_env_dmtcp_file.good() ) {
|
|
|
|
dmtcpSystemCmd( no_env_dmtcp_rename_command.str() );
|
|
|
|
}
|
|
|
|
|
|
|
|
dmtcpSafestoreDir();
|
|
|
|
|
|
|
|
// Don't print out "dumped" message until checkpont file (ckpt_*.dmtcp) is written
|
|
|
|
std::stringstream dmtcp_temp;
|
|
|
|
dmtcp_temp << lsp->env << "/ckpt_" << lsp->uniquePidStr << "_" << std::setfill('0') << std::setw(5) << lsp->genId << "/ckpt_S_main_" << getenv("TRICK_HOST_CPU") << ".exe_" << lsp->uniquePidStr << ".dmtcp.temp";
|
|
|
|
|
|
|
|
while( access( dmtcp_temp.str().c_str(),F_OK ) == 0 ) {}
|
|
|
|
|
|
|
|
dmtcp_ckpt_complete = 1;
|
|
|
|
|
|
|
|
message_publish(MSG_INFO, "Dumped DMTCP Checkpoint: %s\n", mmw_filename.c_str()) ;
|
|
|
|
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
void Trick::DMTCP::dmtcpSystemCmd( const string& str ) {
|
|
|
|
std::string real_system_name ;
|
|
|
|
|
|
|
|
void* dlhandle ;
|
|
|
|
void (*real_system_ptr)(const char *) = NULL ;
|
|
|
|
|
|
|
|
dlhandle = dlopen( NULL, RTLD_LAZY) ;
|
|
|
|
real_system_name = "_real_system" ;
|
|
|
|
real_system_ptr = (void (*)(const char *))dlsym( dlhandle , real_system_name.c_str()) ;
|
|
|
|
|
|
|
|
if ( real_system_ptr != NULL )
|
|
|
|
(*real_system_ptr)( str.c_str() ) ;
|
2016-11-08 10:25:07 +01:00
|
|
|
else
|
2015-02-26 09:02:31 -06:00
|
|
|
system( str.c_str() );
|
|
|
|
}
|
|
|
|
|
|
|
|
void Trick::DMTCP::dmtcpSafestoreDir() {
|
|
|
|
#ifdef _DMTCP
|
|
|
|
|
|
|
|
std::string restart_script_name;
|
|
|
|
char* checkpoint_dirname ;
|
|
|
|
char* long_checkpoint_name ;
|
|
|
|
char rm_safestore_dir[512] ;
|
|
|
|
const DmtcpLocalStatus * lsp = dmtcpGetLocalStatus() ;
|
|
|
|
|
|
|
|
long_checkpoint_name = strdup(lsp->checkpointFilename) ;
|
|
|
|
|
|
|
|
// For DMTCP Safestore checkpoints, only keep newest ckpt_* directory
|
|
|
|
if ( restart_script_name.c_str() == "dmtcp_chkpnt_safestore" ) {
|
|
|
|
|
|
|
|
checkpoint_dirname = dirname(long_checkpoint_name) ;
|
|
|
|
|
|
|
|
// If there's a new ckpt_* directory, remove the previous version
|
|
|
|
if ( strcmp(prev_checkpoint_dirname, checkpoint_dirname) ) {
|
|
|
|
sprintf( rm_safestore_dir, "cd %s; rm -rf %s", lsp->env, prev_checkpoint_dirname);
|
|
|
|
dmtcpSystemCmd( rm_safestore_dir );
|
|
|
|
}
|
|
|
|
|
|
|
|
strcpy( prev_checkpoint_dirname, checkpoint_dirname );
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
strcpy( prev_checkpoint_dirname, "" );
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void Trick::DMTCP::dmtcpCleanup() {
|
|
|
|
#ifdef _DMTCP
|
|
|
|
char dmtcp_restart_script_sh[1024];
|
|
|
|
const DmtcpLocalStatus * lsp = dmtcpGetLocalStatus() ;
|
|
|
|
|
|
|
|
sprintf( dmtcp_restart_script_sh, "%s/dmtcp_restart_script.sh", lsp->env );
|
|
|
|
unlink(dmtcp_restart_script_sh) ;
|
2016-11-08 10:25:07 +01:00
|
|
|
|
2015-02-26 09:02:31 -06:00
|
|
|
if ( !dmtcp_checkpoint_jobs_queue.empty() )
|
|
|
|
dmtcp_checkpoint_jobs_queue.pop();
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
void Trick::DMTCP::renameRestartScript() {
|
|
|
|
#ifdef _DMTCP
|
|
|
|
if( dmtcpIsEnabled() ) {
|
|
|
|
dmtcpRenameCmd();
|
|
|
|
dmtcpCleanup();
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int Trick::DMTCP::write_s_job_execution( FILE * fp ) {
|
|
|
|
|
|
|
|
if ( fp == NULL ) {
|
|
|
|
return(0) ;
|
|
|
|
}
|
|
|
|
|
|
|
|
fprintf(fp, "\n===================================================================================================\n") ;
|
|
|
|
fprintf(fp, "DMTCP :\n\n") ;
|
|
|
|
|
|
|
|
write_non_sched_queue(fp, &pre_queue) ;
|
|
|
|
write_non_sched_queue(fp, &post_queue) ;
|
|
|
|
write_non_sched_queue(fp, &restart_queue) ;
|
|
|
|
|
|
|
|
return 0 ;
|
|
|
|
}
|
|
|
|
|
|
|
|
int Trick::DMTCP::instrument_job_before(Trick::JobData * instrument_job __attribute__((unused)) ) {
|
|
|
|
return 0 ;
|
|
|
|
}
|
|
|
|
int Trick::DMTCP::instrument_job_after(Trick::JobData * instrument_job __attribute__((unused)) ) {
|
|
|
|
return 0 ;
|
|
|
|
}
|
|
|
|
int Trick::DMTCP::instrument_job_remove(std::string in_job __attribute__((unused)) ) {
|
|
|
|
return 0 ;
|
|
|
|
}
|