mirror of
https://github.com/nasa/trick.git
synced 2025-01-18 18:56:31 +00:00
Merge pull request #835 from nasa/481
Note the process exit status of Monte Carlo runs
This commit is contained in:
commit
7b16743b39
@ -43,6 +43,10 @@ namespace Trick {
|
||||
*/
|
||||
|
||||
class Executive : public Trick::Scheduler {
|
||||
public:
|
||||
/** gets #except_return */
|
||||
virtual int get_except_return() const;
|
||||
|
||||
protected:
|
||||
/** Attempts to attach a debugger in the event a signal shuts down the simulation.\n */
|
||||
bool attach_debugger; /**< trick_units(--) */
|
||||
|
@ -39,11 +39,12 @@ namespace Trick {
|
||||
/** Details the manner in which this run exited. */
|
||||
enum ExitStatus {
|
||||
MC_RUN_INCOMPLETE, /**< not completed */
|
||||
MC_RUN_COMPLETE, /**< completed with no errors */
|
||||
MC_RUN_COMPLETE, /**< process completed with exit status zero */
|
||||
MC_RUN_FAILED, /**< process completed with non-zero exit status */
|
||||
MC_RUN_DUMPED_CORE, /**< core dumped */
|
||||
MC_RUN_TIMED_OUT, /**< timed out */
|
||||
MC_CANT_CREATE_OUTPUT_DIR, /**< could not write output files */
|
||||
MC_PROBLEM_PARSING_INPUT, /**< problem parseing monte carlo input */
|
||||
MC_PROBLEM_PARSING_INPUT, /**< problem parsing monte carlo input */
|
||||
MC_UNRECOGNIZED_RETURN_CODE /**< unrecognized return code */
|
||||
};
|
||||
|
||||
@ -338,9 +339,12 @@ namespace Trick {
|
||||
/** Runs to be dispatched. */
|
||||
std::deque <Trick::MonteRun *> runs; /**< \n trick_io(**) trick_units(--) */
|
||||
|
||||
/** Failed runs. */
|
||||
/** Runs whose slave child process completed with a non-zero exit status. */
|
||||
std::deque <Trick::MonteRun *> failed_runs; /**< \n trick_io(**) trick_units(--) */
|
||||
|
||||
/** Runs whose slave child process terminated with an error. */
|
||||
std::deque <Trick::MonteRun *> error_runs; /**< \n trick_io(**) trick_units(--) */
|
||||
|
||||
/** Valid ranges. */
|
||||
std::vector <Trick::MonteRange *> run_ranges; /**< \n trick_io(**) trick_units(--) */
|
||||
|
||||
|
@ -399,3 +399,6 @@ int Trick::Executive::set_current_version(std::string version) {
|
||||
return(0) ;
|
||||
}
|
||||
|
||||
/**
 * Accessor for the Executive's except_return member.
 * @return the current value of except_return, unmodified.
 */
int Trick::Executive::get_except_return() const {
|
||||
return except_return;
|
||||
}
|
||||
|
@ -9,6 +9,8 @@
|
||||
#include "trick/message_type.h"
|
||||
#include "trick/exec_proto.h"
|
||||
|
||||
extern Trick::Executive * the_exec ;
|
||||
|
||||
/**
 * Sets this MonteCarlo object's enabled flag.
 * @param in_enabled value to store in the enabled member
 */
void Trick::MonteCarlo::set_enabled(bool in_enabled) {
|
||||
this->enabled = in_enabled;
|
||||
}
|
||||
@ -282,7 +284,7 @@ int Trick::MonteCarlo::shutdown() {
|
||||
if (enabled && is_slave()) {
|
||||
connection_device.port = master_port;
|
||||
if (tc_connect(&connection_device) == TC_SUCCESS) {
|
||||
int exit_status = MonteRun::MC_RUN_COMPLETE;
|
||||
int exit_status = the_exec->get_except_return() ? MonteRun::MC_RUN_FAILED : MonteRun::MC_RUN_COMPLETE;
|
||||
if (verbosity >= MC_ALL) {
|
||||
message_publish(MSG_INFO, "Monte [%s:%d] Sending run exit status to master: %d\n",
|
||||
machine_name.c_str(), slave_id, exit_status) ;
|
||||
@ -322,9 +324,12 @@ void Trick::MonteCarlo::handle_retry(MonteSlave& slave, MonteRun::ExitStatus exi
|
||||
|
||||
/** @par Detailed Design: */
|
||||
void Trick::MonteCarlo::resolve_run(MonteSlave& slave, MonteRun::ExitStatus exit_status) {
|
||||
if (exit_status != MonteRun::MC_RUN_COMPLETE) {
|
||||
if (exit_status == MonteRun::MC_RUN_FAILED) {
|
||||
failed_runs.push_back(slave.current_run);
|
||||
}
|
||||
else if (exit_status != MonteRun::MC_RUN_COMPLETE) {
|
||||
error_runs.push_back(slave.current_run);
|
||||
}
|
||||
|
||||
/** <li> Update the bookkeeping. */
|
||||
struct timeval time_val;
|
||||
|
@ -1,4 +1,3 @@
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "trick/MonteCarlo.hh"
|
||||
@ -67,9 +66,9 @@ void Trick::MonteCarlo::print_statistics(FILE** fp) {
|
||||
"No Permission to Output Directory", "Bad Input" } ;
|
||||
|
||||
fprintf(*fp,
|
||||
"\nMonte Carlo complete: %u runs (%zu successful) (%zu errors) (%u out of range)\n",
|
||||
num_runs, num_results - failed_runs.size(), failed_runs.size(),
|
||||
num_runs - num_results);
|
||||
"\nMonte Carlo complete: %u runs (%zu successful) (%zu non-zero exit status) (%zu errors) (%u out of range)\n",
|
||||
num_runs, num_results - failed_runs.size() - error_runs.size(), failed_runs.size(),
|
||||
error_runs.size(), num_runs - num_results);
|
||||
|
||||
fprintf(*fp, "\nMachine work unit breakdown:\n");
|
||||
fprintf(*fp, "----------------------------------------------------------------------\n");
|
||||
@ -100,10 +99,17 @@ void Trick::MonteCarlo::print_statistics(FILE** fp) {
|
||||
fprintf(*fp, "Efficency (speedup / num slaves): %.2lf%%\n", efficency);
|
||||
|
||||
if (failed_runs.size()) {
|
||||
fprintf(*fp, "\nError Summary\n");
|
||||
for (std::vector<MonteRun *>::size_type j = 0; j < failed_runs.size(); ++j) {
|
||||
fprintf(*fp, "RUN_%05d exit status = %s (%d)\n", failed_runs[j]->id,
|
||||
exit_status_string[failed_runs[j]->exit_status], failed_runs[j]->exit_status);
|
||||
fprintf(*fp, "\nThe following runs completed with a non-zero process exit status:\n");
|
||||
for (const MonteRun* run : failed_runs) {
|
||||
fprintf(*fp, "RUN_%05d\n", run->id);
|
||||
}
|
||||
}
|
||||
|
||||
if (error_runs.size()) {
|
||||
fprintf(*fp, "\nThe following runs failed to complete:\n");
|
||||
for (const MonteRun* run : error_runs) {
|
||||
fprintf(*fp, "RUN_%05d MonteRun::ExitStatus = %s (%d)\n", run->id,
|
||||
exit_status_string[run->exit_status], run->exit_status);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -119,7 +119,8 @@ void Trick::MonteCarlo::handle_run_data(Trick::MonteSlave& slave) {
|
||||
switch (exit_status) {
|
||||
|
||||
case MonteRun::MC_RUN_COMPLETE:
|
||||
resolve_run(slave, MonteRun::MC_RUN_COMPLETE);
|
||||
case MonteRun::MC_RUN_FAILED:
|
||||
resolve_run(slave, static_cast<MonteRun::ExitStatus>(exit_status));
|
||||
run_queue(&master_post_queue, "in master_post queue") ;
|
||||
break;
|
||||
|
||||
@ -154,7 +155,7 @@ void Trick::MonteCarlo::handle_run_data(Trick::MonteSlave& slave) {
|
||||
break;
|
||||
|
||||
/**
|
||||
* <li> Timeouts and are redispatched. However, we must first check to
|
||||
* <li> Timeouts are redispatched. However, we must first check to
|
||||
* see if this run has already been processed in #check_timeouts, which
|
||||
* can occur when the master determines that a slave has timed out, and
|
||||
* then that slave itself reports a timeout. </ul>
|
||||
|
Loading…
Reference in New Issue
Block a user