Note the process exit status of Monte Carlo runs

Closes #481
This commit is contained in:
Derek Bankieris 2019-06-21 14:02:03 -05:00
parent 00302208b5
commit 40cf5c0b29
6 changed files with 38 additions and 15 deletions

View File

@ -43,6 +43,10 @@ namespace Trick {
*/
class Executive : public Trick::Scheduler {
public:
/** gets #except_return */
virtual int get_except_return() const;
protected:
/** Attempts to attach a debugger in the event a signal shuts down the simulation.\n */
bool attach_debugger; /**< trick_units(--) */

View File

@ -39,11 +39,12 @@ namespace Trick {
/** Details the manner in which this run exited. */
enum ExitStatus {
MC_RUN_INCOMPLETE, /**< not completed */
MC_RUN_COMPLETE, /**< completed with no errors */
MC_RUN_COMPLETE, /**< process completed with exit status zero */
MC_RUN_FAILED, /**< process completed with non-zero exit status */
MC_RUN_DUMPED_CORE, /**< core dumped */
MC_RUN_TIMED_OUT, /**< timed out */
MC_CANT_CREATE_OUTPUT_DIR, /**< could not write output files */
MC_PROBLEM_PARSING_INPUT, /**< problem parseing monte carlo input */
MC_PROBLEM_PARSING_INPUT, /**< problem parsing monte carlo input */
MC_UNRECOGNIZED_RETURN_CODE /**< unrecognized return code */
};
@ -338,9 +339,12 @@ namespace Trick {
/** Runs to be dispatched. */
std::deque <Trick::MonteRun *> runs; /**< \n trick_io(**) trick_units(--) */
/** Failed runs. */
/** Runs whose slave child process completed with a non-zero exit status. */
std::deque <Trick::MonteRun *> failed_runs; /**< \n trick_io(**) trick_units(--) */
/** Runs whose slave child process terminated with an error. */
std::deque <Trick::MonteRun *> error_runs; /**< \n trick_io(**) trick_units(--) */
/** Valid ranges. */
std::vector <Trick::MonteRange *> run_ranges; /**< \n trick_io(**) trick_units(--) */

View File

@ -399,3 +399,6 @@ int Trick::Executive::set_current_version(std::string version) {
return(0) ;
}
int Trick::Executive::get_except_return() const {
return except_return;
}

View File

@ -9,6 +9,8 @@
#include "trick/message_type.h"
#include "trick/exec_proto.h"
extern Trick::Executive * the_exec ;
void Trick::MonteCarlo::set_enabled(bool in_enabled) {
this->enabled = in_enabled;
}
@ -282,7 +284,7 @@ int Trick::MonteCarlo::shutdown() {
if (enabled && is_slave()) {
connection_device.port = master_port;
if (tc_connect(&connection_device) == TC_SUCCESS) {
int exit_status = MonteRun::MC_RUN_COMPLETE;
int exit_status = the_exec->get_except_return() ? MonteRun::MC_RUN_FAILED : MonteRun::MC_RUN_COMPLETE;
if (verbosity >= MC_ALL) {
message_publish(MSG_INFO, "Monte [%s:%d] Sending run exit status to master: %d\n",
machine_name.c_str(), slave_id, exit_status) ;
@ -322,9 +324,12 @@ void Trick::MonteCarlo::handle_retry(MonteSlave& slave, MonteRun::ExitStatus exi
/** @par Detailed Design: */
void Trick::MonteCarlo::resolve_run(MonteSlave& slave, MonteRun::ExitStatus exit_status) {
if (exit_status != MonteRun::MC_RUN_COMPLETE) {
if (exit_status == MonteRun::MC_RUN_FAILED) {
failed_runs.push_back(slave.current_run);
}
else if (exit_status != MonteRun::MC_RUN_COMPLETE) {
error_runs.push_back(slave.current_run);
}
/** <li> Update the bookkeeping. */
struct timeval time_val;

View File

@ -1,4 +1,3 @@
#include <sys/time.h>
#include "trick/MonteCarlo.hh"
@ -67,9 +66,9 @@ void Trick::MonteCarlo::print_statistics(FILE** fp) {
"No Permission to Output Directory", "Bad Input" } ;
fprintf(*fp,
"\nMonte Carlo complete: %u runs (%zu successful) (%zu errors) (%u out of range)\n",
num_runs, num_results - failed_runs.size(), failed_runs.size(),
num_runs - num_results);
"\nMonte Carlo complete: %u runs (%zu successful) (%zu non-zero exit status) (%zu errors) (%u out of range)\n",
num_runs, num_results - failed_runs.size() - error_runs.size(), failed_runs.size(),
error_runs.size(), num_runs - num_results);
fprintf(*fp, "\nMachine work unit breakdown:\n");
fprintf(*fp, "----------------------------------------------------------------------\n");
@ -100,10 +99,17 @@ void Trick::MonteCarlo::print_statistics(FILE** fp) {
fprintf(*fp, "Efficency (speedup / num slaves): %.2lf%%\n", efficency);
if (failed_runs.size()) {
fprintf(*fp, "\nError Summary\n");
for (std::vector<MonteRun *>::size_type j = 0; j < failed_runs.size(); ++j) {
fprintf(*fp, "RUN_%05d exit status = %s (%d)\n", failed_runs[j]->id,
exit_status_string[failed_runs[j]->exit_status], failed_runs[j]->exit_status);
fprintf(*fp, "\nThe following runs completed with a non-zero process exit status:\n");
for (const MonteRun* run : failed_runs) {
fprintf(*fp, "RUN_%05d\n", run->id);
}
}
if (error_runs.size()) {
fprintf(*fp, "\nThe following runs failed to complete:\n");
for (const MonteRun* run : error_runs) {
fprintf(*fp, "RUN_%05d MonteRun::ExitStatus = %s (%d)\n", run->id,
exit_status_string[run->exit_status], run->exit_status);
}
}
}

View File

@ -119,7 +119,8 @@ void Trick::MonteCarlo::handle_run_data(Trick::MonteSlave& slave) {
switch (exit_status) {
case MonteRun::MC_RUN_COMPLETE:
resolve_run(slave, MonteRun::MC_RUN_COMPLETE);
case MonteRun::MC_RUN_FAILED:
resolve_run(slave, static_cast<MonteRun::ExitStatus>(exit_status));
run_queue(&master_post_queue, "in master_post queue") ;
break;
@ -154,7 +155,7 @@ void Trick::MonteCarlo::handle_run_data(Trick::MonteSlave& slave) {
break;
/**
* <li> Timeouts and are redispatched. However, we must first check to
* <li> Timeouts are redispatched. However, we must first check to
* see if this run has already been processed in #check_timeouts, which
* can occur when the master determines that a slave has timed out, and
* then that slave itself reports a timeout. </ul>