Note the process exit status of Monte Carlo runs

Closes #481
2025-05-31 06:31:13 +00:00 · 2019-06-21 14:02:03 -05:00 · 2019-06-21 14:02:03 -05:00 · 40cf5c0b29
commit 40cf5c0b29
parent 00302208b5
6 changed files with 38 additions and 15 deletions
--- a/include/trick/Executive.hh
+++ b/include/trick/Executive.hh
@@ -43,6 +43,10 @@ namespace Trick {
     */

    class Executive : public Trick::Scheduler {
+        public:
+            /** Returns #except_return; a Monte Carlo slave reports its run as MC_RUN_FAILED when this is non-zero. */
+            virtual int get_except_return() const;
+
        protected:
            /** Attempts to attach a debugger in the event a signal shuts down the simulation.\n */
            bool attach_debugger;            /**< trick_units(--) */
--- a/include/trick/MonteCarlo.hh
+++ b/include/trick/MonteCarlo.hh
@@ -39,11 +39,12 @@ namespace Trick {
        /** Details the manner in which this run exited. */
        enum ExitStatus {
            MC_RUN_INCOMPLETE,          /**< not completed */
-            MC_RUN_COMPLETE,            /**< completed with no errors */
+            MC_RUN_COMPLETE,            /**< process completed with exit status zero */
+            MC_RUN_FAILED,              /**< process completed with non-zero exit status */
            MC_RUN_DUMPED_CORE,         /**< core dumped */
            MC_RUN_TIMED_OUT,           /**< timed out */
            MC_CANT_CREATE_OUTPUT_DIR,  /**< could not write output files */
-            MC_PROBLEM_PARSING_INPUT,   /**< problem parseing monte carlo input */
+            MC_PROBLEM_PARSING_INPUT,   /**< problem parsing monte carlo input */
            MC_UNRECOGNIZED_RETURN_CODE /**< unrecognized return code */
        };

@@ -338,9 +339,12 @@ namespace Trick {
        /** Runs to be dispatched. */
        std::deque <Trick::MonteRun *> runs;                 /**< \n trick_io(**) trick_units(--) */

-        /** Failed runs. */
+        /** Runs whose slave child process completed with a non-zero exit status. */
        std::deque <Trick::MonteRun *> failed_runs;          /**< \n trick_io(**) trick_units(--) */

+        /** Runs that ended with any exit status other than MC_RUN_COMPLETE or MC_RUN_FAILED (e.g. core dump, timeout). */
+        std::deque <Trick::MonteRun *> error_runs;           /**< \n trick_io(**) trick_units(--) */
+
        /** Valid ranges. */
        std::vector <Trick::MonteRange *> run_ranges;        /**< \n trick_io(**) trick_units(--) */

--- a/trick_source/sim_services/Executive/Executive.cpp
+++ b/trick_source/sim_services/Executive/Executive.cpp
@@ -399,3 +399,6 @@ int Trick::Executive::set_current_version(std::string version) {
    return(0) ;
 }

+int Trick::Executive::get_except_return() const {
+    return except_return;
+}
--- a/trick_source/sim_services/MonteCarlo/MonteCarlo_funcs.cpp
+++ b/trick_source/sim_services/MonteCarlo/MonteCarlo_funcs.cpp
@@ -9,6 +9,8 @@
 #include "trick/message_type.h"
 #include "trick/exec_proto.h"

+extern Trick::Executive * the_exec ;
+
 void Trick::MonteCarlo::set_enabled(bool in_enabled) {
    this->enabled = in_enabled;
 }
@@ -282,7 +284,7 @@ int Trick::MonteCarlo::shutdown() {
    if (enabled && is_slave()) {
        connection_device.port = master_port;
        if (tc_connect(&connection_device) == TC_SUCCESS) {
-            int exit_status = MonteRun::MC_RUN_COMPLETE;
+            int exit_status = the_exec->get_except_return() ? MonteRun::MC_RUN_FAILED : MonteRun::MC_RUN_COMPLETE;
            if (verbosity >= MC_ALL) {
                message_publish(MSG_INFO, "Monte [%s:%d] Sending run exit status to master: %d\n",
                                machine_name.c_str(), slave_id, exit_status) ;
@@ -322,9 +324,12 @@ void Trick::MonteCarlo::handle_retry(MonteSlave& slave, MonteRun::ExitStatus exi

 /** @par Detailed Design: */
 void Trick::MonteCarlo::resolve_run(MonteSlave& slave, MonteRun::ExitStatus exit_status) {
-    if (exit_status != MonteRun::MC_RUN_COMPLETE) {
+    if (exit_status == MonteRun::MC_RUN_FAILED) {
        failed_runs.push_back(slave.current_run);
    }
+    else if (exit_status != MonteRun::MC_RUN_COMPLETE) {
+        error_runs.push_back(slave.current_run);
+    }

    /** <li> Update the bookkeeping. */
    struct timeval time_val;
--- a/trick_source/sim_services/MonteCarlo/MonteCarlo_master_shutdown.cpp
+++ b/trick_source/sim_services/MonteCarlo/MonteCarlo_master_shutdown.cpp
@@ -1,4 +1,3 @@
-
 #include <sys/time.h>

 #include "trick/MonteCarlo.hh"
@@ -67,9 +66,9 @@ void Trick::MonteCarlo::print_statistics(FILE** fp) {
       "No Permission to Output Directory", "Bad Input" } ;

    fprintf(*fp,
-      "\nMonte Carlo complete: %u runs (%zu successful) (%zu errors) (%u out of range)\n",
-      num_runs, num_results - failed_runs.size(), failed_runs.size(),
-      num_runs - num_results);
+      "\nMonte Carlo complete: %u runs (%zu successful) (%zu non-zero exit status) (%zu errors) (%u out of range)\n",
+      num_runs, num_results - failed_runs.size() - error_runs.size(), failed_runs.size(),
+      error_runs.size(), num_runs - num_results);

    fprintf(*fp, "\nMachine work unit breakdown:\n");
    fprintf(*fp, "----------------------------------------------------------------------\n");
@@ -100,10 +99,17 @@ void Trick::MonteCarlo::print_statistics(FILE** fp) {
    fprintf(*fp, "Efficency (speedup / num slaves): %.2lf%%\n", efficency);

    if (failed_runs.size()) {
-        fprintf(*fp, "\nError Summary\n");
-        for (std::vector<MonteRun *>::size_type j = 0; j < failed_runs.size(); ++j) {
-            fprintf(*fp, "RUN_%05d exit status = %s (%d)\n", failed_runs[j]->id,
-              exit_status_string[failed_runs[j]->exit_status], failed_runs[j]->exit_status);
+        fprintf(*fp, "\nThe following runs completed with a non-zero process exit status:\n");
+        for (const MonteRun* run : failed_runs) {
+            fprintf(*fp, "RUN_%05d\n", run->id);
+        }
+    }
+
+    if (error_runs.size()) {
+        fprintf(*fp, "\nThe following runs failed to complete:\n");
+        for (const MonteRun* run : error_runs) {
+            fprintf(*fp, "RUN_%05d MonteRun::ExitStatus = %s (%d)\n", run->id,
+              exit_status_string[run->exit_status], run->exit_status);
        }
    }
 }
--- a/trick_source/sim_services/MonteCarlo/MonteCarlo_receive_results.cpp
+++ b/trick_source/sim_services/MonteCarlo/MonteCarlo_receive_results.cpp
@@ -119,7 +119,8 @@ void Trick::MonteCarlo::handle_run_data(Trick::MonteSlave& slave) {
    switch (exit_status) {

        case MonteRun::MC_RUN_COMPLETE:
-            resolve_run(slave, MonteRun::MC_RUN_COMPLETE);
+        case MonteRun::MC_RUN_FAILED:
+            resolve_run(slave, static_cast<MonteRun::ExitStatus>(exit_status));
            run_queue(&master_post_queue, "in master_post queue") ;
            break;

@@ -154,7 +155,7 @@ void Trick::MonteCarlo::handle_run_data(Trick::MonteSlave& slave) {
            break;

        /**
-         * <li> Timeouts and are redispatched. However, we must first check to
+         * <li> Timeouts are redispatched. However, we must first check to
         * see if this run has already been processed in #check_timeouts, which
         * can occur when the master determines that a slave has timed out, and
         * then that slave itself reports a timeout. </ul>