Dequeue timed-out runs if the slave later returns

Fixes #569
This commit is contained in:
Derek Bankieris 2018-02-28 14:13:05 -06:00
parent 4f0093ddd6
commit 27bf0b030b
3 changed files with 20 additions and 22 deletions

View File

@ -397,8 +397,6 @@ Trick::MonteRun *Trick::MonteCarlo::get_next_dispatch() {
/** <ul><li> While there are remaining runs: */
while (!runs.empty()) {
MonteRun *curr_run = runs.front();
/** <ul><li> If this run hasn't been dispatched before: */
if (curr_run->num_tries == 0) {
/** <li> If it is in range, return it. </ul>*/
if (in_range(curr_run)) {
return curr_run;
@ -409,13 +407,6 @@ Trick::MonteRun *Trick::MonteCarlo::get_next_dispatch() {
}
prepare_run(curr_run);
}
/**
* <li> If this run has been dispatched before, it may have been requeued due to a slave timeout for which the slave
* later returned results. In such a case, do not dispatch it again (return NULL). Otherwise, return it.
*/
} else if (curr_run->exit_status == MonteRun::INCOMPLETE) {
return curr_run;
}
}
return NULL;
}

View File

@ -81,7 +81,14 @@ void Trick::MonteCarlo::handle_run_data(Trick::MonteSlave& slave) {
}
/**
* <ul><li> This run may have already been resolved by another slave if
* <ul><li> Try to remove this run from the queue in case it was requeued by #check_timeouts.
* This covers the case in which the master determines that a slave has timed out, requeues
* the run, and then the slave reports results.
*/
dequeue_run(slave.current_run);
/**
* <li> This run may have already been resolved by another slave if
* this slave was marked as having timed out. If that is the case,
* discard these results.
*/

View File

@ -16,7 +16,7 @@ int Trick::MonteCarlo::execute_as_slave() {
/** <ul><li> On a blocking read, wait for a MonteSlave::Command from the master. */
if (tc_accept(&listen_device, &connection_device) != TC_SUCCESS) {
if (verbosity >= ERROR) {
message_publish(MSG_ERROR, "Monte [%s:%d] Lost connection to Master.\nShutting down.\n",
message_publish(MSG_ERROR, "Monte [%s:%d] Lost connection to Master. Shutting down.\n",
machine_name.c_str(), slave_id) ;
}
slave_shutdown();
@ -24,7 +24,7 @@ int Trick::MonteCarlo::execute_as_slave() {
int command;
if (tc_read(&connection_device, (char *)&command, (int)sizeof(command)) != (int)sizeof(command)) {
if (verbosity >= ERROR) {
message_publish(MSG_ERROR, "Monte [%s:%d] Lost connection to Master while receiving instructions.\nShutting down.\n",
message_publish(MSG_ERROR, "Monte [%s:%d] Lost connection to Master while receiving instructions. Shutting down.\n",
machine_name.c_str(), slave_id) ;
}
slave_shutdown();
@ -44,7 +44,7 @@ int Trick::MonteCarlo::execute_as_slave() {
case MonteSlave::SHUTDOWN:
/** <li> MonteSlave::SHUTDOWN: Call #slave_shutdown. */
if (verbosity >= INFORMATIONAL) {
message_publish(MSG_INFO, "Monte [%s:%d] Shutdown command received from Master.\nShutting down.\n",
message_publish(MSG_INFO, "Monte [%s:%d] Shutdown command received from Master. Shutting down.\n",
machine_name.c_str(), slave_id) ;
}
slave_shutdown();
@ -60,7 +60,7 @@ int Trick::MonteCarlo::execute_as_slave() {
default:
/** <li> default: Call #slave_shutdown. */
if (verbosity >= ERROR) {
message_publish(MSG_ERROR, "Monte [%s:%d] Unrecognized command %d received from Master.\nShutting down.\n",
message_publish(MSG_ERROR, "Monte [%s:%d] Unrecognized command %d received from Master. Shutting down.\n",
machine_name.c_str(), slave_id, command) ;
}
slave_shutdown();