Dequeue timed-out runs if the slave later returns

Fixes #569
This commit is contained in:
Derek Bankieris 2018-02-28 14:13:05 -06:00
parent 4f0093ddd6
commit 27bf0b030b
3 changed files with 20 additions and 22 deletions

View File

@ -397,8 +397,6 @@ Trick::MonteRun *Trick::MonteCarlo::get_next_dispatch() {
/** <ul><li> While there are remaining runs: */ /** <ul><li> While there are remaining runs: */
while (!runs.empty()) { while (!runs.empty()) {
MonteRun *curr_run = runs.front(); MonteRun *curr_run = runs.front();
/** <ul><li> If this run hasn't been dispatched before: */
if (curr_run->num_tries == 0) {
/** <li> If it is in range, return it. </ul>*/ /** <li> If it is in range, return it. </ul>*/
if (in_range(curr_run)) { if (in_range(curr_run)) {
return curr_run; return curr_run;
@ -409,13 +407,6 @@ Trick::MonteRun *Trick::MonteCarlo::get_next_dispatch() {
} }
prepare_run(curr_run); prepare_run(curr_run);
} }
/**
* <li> If this run has been dispatched before, it may have been requeued due to a slave timeout for which the slave
* later returned results. In such a case, do not dispatch it again (return NULL). Otherwise, return it.
*/
} else if (curr_run->exit_status == MonteRun::INCOMPLETE) {
return curr_run;
}
} }
return NULL; return NULL;
} }

View File

@ -81,7 +81,14 @@ void Trick::MonteCarlo::handle_run_data(Trick::MonteSlave& slave) {
} }
/** /**
* <ul><li> This run may have already been resolved by another slave if * <ul><li> Try to remove this run from the queue in case it was requeued by #check_timeouts.
* This covers the case in which the master determines that a slave has timed out, requeues
* the run, and then the slave reports results.
*/
dequeue_run(slave.current_run);
/**
* <li> This run may have already been resolved by another slave if
* this slave was marked as having timed out. If that is the case, * this slave was marked as having timed out. If that is the case,
* discard these results. * discard these results.
*/ */

View File

@ -16,7 +16,7 @@ int Trick::MonteCarlo::execute_as_slave() {
/** <ul><li> On a blocking read, wait for a MonteSlave::Command from the master. */ /** <ul><li> On a blocking read, wait for a MonteSlave::Command from the master. */
if (tc_accept(&listen_device, &connection_device) != TC_SUCCESS) { if (tc_accept(&listen_device, &connection_device) != TC_SUCCESS) {
if (verbosity >= ERROR) { if (verbosity >= ERROR) {
message_publish(MSG_ERROR, "Monte [%s:%d] Lost connection to Master.\nShutting down.\n", message_publish(MSG_ERROR, "Monte [%s:%d] Lost connection to Master. Shutting down.\n",
machine_name.c_str(), slave_id) ; machine_name.c_str(), slave_id) ;
} }
slave_shutdown(); slave_shutdown();
@ -24,7 +24,7 @@ int Trick::MonteCarlo::execute_as_slave() {
int command; int command;
if (tc_read(&connection_device, (char *)&command, (int)sizeof(command)) != (int)sizeof(command)) { if (tc_read(&connection_device, (char *)&command, (int)sizeof(command)) != (int)sizeof(command)) {
if (verbosity >= ERROR) { if (verbosity >= ERROR) {
message_publish(MSG_ERROR, "Monte [%s:%d] Lost connection to Master while receiving instructions.\nShutting down.\n", message_publish(MSG_ERROR, "Monte [%s:%d] Lost connection to Master while receiving instructions. Shutting down.\n",
machine_name.c_str(), slave_id) ; machine_name.c_str(), slave_id) ;
} }
slave_shutdown(); slave_shutdown();
@ -44,7 +44,7 @@ int Trick::MonteCarlo::execute_as_slave() {
case MonteSlave::SHUTDOWN: case MonteSlave::SHUTDOWN:
/** <li> MonteSlave::SHUTDOWN: Call #slave_shutdown. */ /** <li> MonteSlave::SHUTDOWN: Call #slave_shutdown. */
if (verbosity >= INFORMATIONAL) { if (verbosity >= INFORMATIONAL) {
message_publish(MSG_INFO, "Monte [%s:%d] Shutdown command received from Master.\nShutting down.\n", message_publish(MSG_INFO, "Monte [%s:%d] Shutdown command received from Master. Shutting down.\n",
machine_name.c_str(), slave_id) ; machine_name.c_str(), slave_id) ;
} }
slave_shutdown(); slave_shutdown();
@ -60,7 +60,7 @@ int Trick::MonteCarlo::execute_as_slave() {
default: default:
/** <li> default: Call #slave_shutdown. */ /** <li> default: Call #slave_shutdown. */
if (verbosity >= ERROR) { if (verbosity >= ERROR) {
message_publish(MSG_ERROR, "Monte [%s:%d] Unrecognized command %d received from Master.\nShutting down.\n", message_publish(MSG_ERROR, "Monte [%s:%d] Unrecognized command %d received from Master. Shutting down.\n",
machine_name.c_str(), slave_id, command) ; machine_name.c_str(), slave_id, command) ;
} }
slave_shutdown(); slave_shutdown();