946 lines
30 KiB
C++
Raw Normal View History

2015-02-26 09:02:31 -06:00
/*
PURPOSE: (Monte Carlo simulation.)
*/
#ifndef MONTECARLO_HH
#define MONTECARLO_HH
2015-02-26 09:02:31 -06:00
#include <deque>
#include <vector>
#include "MonteVar.hh"
#include "sim_services/Executive/include/Executive.hh"
#include "sim_services/include/RemoteShell.hh"
#include "trick_utils/comm/include/tc.h"
/* Supply a default HOST_NAME_MAX only when nothing included so far has defined it. */
#if !defined(HOST_NAME_MAX)
#  define HOST_NAME_MAX 128
#endif
namespace Trick {
/**
 * Represents a particular iteration in a Monte Carlo simulation. In addition to some bookkeeping information, a run
 * contains the variable values specific to this iteration.
 *
 * @author Alex Lin
 * @author Donna Panter
 * @author Derek Bankieris
 *
 * @date August 2010
 */
class MonteRun {

    public:

        /**
         * Details the manner in which this run exited. Enumerators are implicitly numbered from zero in the
         * order listed, so new values should only be appended.
         */
        enum ExitStatus {
            INCOMPLETE,  /**< not completed */
            COMPLETE,    /**< completed with no errors */
            CORED,       /**< core dumped */
            TIMEDOUT,    /**< timed out */
            NO_PERM,     /**< could not write output files */
            BAD_INPUT,   /**< problem parsing monte carlo input */
            UNKNOWN      /**< unrecognized return code */
        };

        /** Unique identifier sequentially assigned, starting at zero, by the master. */
        unsigned int id; /**< \n trick_units(--) */

        /** Number of times this run has been dispatched. */
        unsigned int num_tries; /**< \n trick_units(--) */

        /** Time at which this run began. */
        double start_time; /**< \n trick_units(--) */

        /** Time at which this run ended. */
        double end_time; /**< \n trick_units(--) */

        /** Variable values specific to this Monte Carlo iteration. */
        std::vector <std::string> variables; /**< \n trick_units(--) */

        /** Manner in which this run exited. */
        ExitStatus exit_status; /**< \n trick_units(--) */

        /**
         * Constructs a MonteRun with the specified id. The try counter and both time stamps start at zero,
         * #variables starts empty, and #exit_status starts as #INCOMPLETE.
         *
         * @param in_id unique identifier
         */
        MonteRun(unsigned int in_id) :
            id(in_id),
            num_tries(0),
            start_time(0),
            end_time(0),
            exit_status(INCOMPLETE) {}
};
/**
 * Models one slave taking part in a Monte Carlo simulation.
 *
 * @see @ref MonteCarloSlaves "Slaves"
 *
 * @author Alex Lin
 * @author Donna Panter
 * @author Derek Bankieris
 *
 * @date August 2010
 */
class MonteSlave {

    public:

        /** Operational states of a slave. */
        enum State {
            UNINITIALIZED,          /**< newly created */
            INITIALIZING,           /**< starting up */
            READY,                  /**< awaiting new run */
            RUNNING,                /**< processing a run */
            STOPPING,               /**< stopping after current run */
            STOPPED,                /**< not accepting new runs */
            FINISHED,               /**< completed all runs */
            UNRESPONSIVE_RUNNING,   /**< timed out and in a running state */
            UNRESPONSIVE_STOPPING,  /**< timed out and in a stopping state */
            DISCONNECTED            /**< lost connection */
        };

        /** Commands the master may send to a slave. */
        enum Command {
            PROCESS_RUN,  /**< process a new run */
            SHUTDOWN,     /**< kill any executing run, call shutdown jobs, and shutdown cleanly */
            DIE           /**< kill any executing run, do not call shutdown jobs, and exit */
        };

        /** Master-assigned unique identifier. */
        unsigned int id; /**< \n trick_units(--) */

        /** Current operational state. */
        State state; /**< \n trick_units(--) */

        /** Name of the machine hosting this slave. */
        std::string machine_name; /**< \n trick_units(--) */

        /** Port on which this slave listens for dispatches. */
        unsigned int port; /**< \n trick_units(--) */

        /** The run most recently dispatched to this slave. */
        MonteRun *current_run; /**< \n trick_units(--) */

        /** Count of runs dispatched to this slave. */
        unsigned int num_dispatches; /**< \n trick_units(--) */

        /** Count of results this slave has returned. */
        unsigned int num_results; /**< \n trick_units(--) */

        /** Total cpu time used. */
        double cpu_time; /**< \n trick_units(--) */

        /** Shell command used to start this slave. */
        Trick::RemoteShell remote_shell; /**< \n trick_units(--) */

        /**
         * User-defined shell command used to start this slave when #remote_shell is
         * Trick::TRICK_USER_REMOTE_SHELL.
         */
        std::string user_remote_shell; /**< \n trick_units(--) */

        /** Optional arguments passed to the remote shell command. */
        std::string remote_shell_args; /**< \n trick_units(--) */

        /** Speed multiplier of this slave's machine. */
        double multiplier; /**< \n trick_units(--) */

        /** Pathname of the remote program directory. */
        std::string sim_path; /**< \n trick_units(--) */

        /** Name of the remote program. */
        std::string S_main_name; /**< \n trick_units(--) */

        /**
         * Setter taking the remote program name; defined outside this header.
         * NOTE(review): presumably assigns #S_main_name — confirm against the implementation.
         */
        void set_S_main_name(std::string name); /**< \n trick_units(--) */

        /**
         * Constructs a MonteSlave for the given machine. An empty name falls back to "localhost".
         * The slave starts #UNINITIALIZED with zeroed counters, no current run, Trick::TRICK_SSH as
         * its remote shell, and a #multiplier of one.
         *
         * @param name the slave's machine's name
         */
        MonteSlave(std::string name = "localhost") :
            id(0),
            state(UNINITIALIZED),
            machine_name(name.empty() ? std::string("localhost") : name),
            port(0),
            current_run(NULL),
            num_dispatches(0),
            num_results(0),
            cpu_time(0),
            remote_shell(Trick::TRICK_SSH),
            multiplier(1) {}
};
/**
 * Specifies a range of valid run numbers.
 *
 * @see MonteRun::id
 *
 * @author Alex Lin
 * @author Donna Panter
 * @author Derek Bankieris
 *
 * @date August 2010
 */
class MonteRange {

    protected:

        /** Starting run number. */
        unsigned int start; /**< \n trick_units(--) */

        /** Ending run number. Never less than #start once constructed. */
        unsigned int end; /**< \n trick_units(--) */

    public:

        /**
         * Constructs a MonteRange with the specified inclusive end points. Specifying an end value that is less
         * than the start value results in a range which includes only the start value.
         *
         * @param in_start starting run number
         * @param in_end ending run number
         */
        MonteRange(unsigned int in_start, unsigned int in_end) :
            start(in_start),
            // Clamp an inverted range so that it collapses onto the start value.
            end(in_end < in_start ? in_start : in_end) {}

        /**
         * Gets #start. Declared const so it can be called on a const MonteRange.
         *
         * @return the starting run number
         */
        unsigned int get_start() const {
            return start;
        }

        /**
         * Gets #end. Declared const so it can be called on a const MonteRange.
         *
         * @return the ending run number
         */
        unsigned int get_end() const {
            return end;
        }
};
/**
* Represents a Monte Carlo simulation. Derives from Trick::Scheduler and holds the configuration, run and slave
* bookkeeping, communication devices, and job queues declared below.
*
* @see @ref MonteCarloPage "Monte Carlo"
*
* @author Alex Lin
* @author Donna Panter
* @author Derek Bankieris
*
* @date August 2010
*/
class MonteCarlo : public Trick::Scheduler {
friend class InputProcessor;
#ifndef SWIG
friend void init_attrTrick__MonteCarlo();
#endif
public:
/** Verbosity of message reporting. */
enum Verbosity {
NONE, /**< report no messages */
ERROR, /**< report error messages */
INFORMATIONAL, /**< report error and informational messages, no warning messages */
ALL /**< report all messages (error, informational & warning) */
};
private:
/* Internal helpers. Their contracts are defined in the implementation file, which is not visible from this header. */
int run_queue(Trick::ScheduledJobQueue* queue, std::string in_string) ;
int open_file(std::string file_name, FILE** file_ptr) ;
void write_to_run_files(std::string file_name) ;
int initialize_sockets() ;
int construct_run_directory() ;
void shutdown_slaves() ;
void print_statistics(FILE** fp) ;
void dryrun() ;
void initialize_slave(Trick::MonteSlave* slave_to_init) ;
void default_slave_dispatch_pre_text(Trick::MonteSlave*, std::string &buffer) ;
protected:
/** Indicates whether or not this is a Monte Carlo simulation. */
bool enabled; /**< \n trick_units(--) */
/**
* Indicates whether or not this is a dry run. A dry run executes pre run jobs only. Post run jobs and the runs
* themselves are not executed.
*/
bool dry_run; /**< \n trick_units(--) */
/**
* Indicates whether or not the localhost should be treated as a remote machine. This determines if slaves running
* locally use remote shells.
*/
bool localhost_as_remote; /**< \n trick_units(--) */
/**
* Indicates how much automation should be employed in forming the commands used to dispatch slaves.
* - A value of <code>true</code> indicates that Trick should form only the core of the command, which consists of
* S_main executable with the proper slave-specific arguments. #custom_pre_text and #custom_post_text will then be
* prepended and appended to the core, respectively, before executing the command. #remote_shell_args,
* #user_cmd_string, MonteSlave::remote_shell, and MonteSlave::machine_name are ignored.
* - A value of <code>false</code> indicates that Trick should use the shell specified by MonteSlave::remote_shell
* with the #remote_shell_args, followed by the slave's machine name, #user_cmd_string, navigation to the correct
* directory, and finally the core command described above. #custom_pre_text and #custom_post_text are ignored.
*/
bool custom_slave_dispatch; /**< \n trick_units(--) */
/** Maximum time to wait for a run to complete. Defaults to 120 seconds. */
double timeout; /**< \n trick_units(s) */
/** Maximum number of times that a run may be dispatched. Defaults to two. Specify zero for no limit. */
unsigned int max_tries; /**< \n trick_units(--) */
/** Options to be passed to the remote shell when spawning new slaves. */
std::string user_cmd_string; /**< \n trick_units(--) */
/** Text to be prepended to the core slave dispatch command as described in #custom_slave_dispatch. */
std::string custom_pre_text; /**< \n trick_units(--) */
/** Text to be appended to the core slave dispatch command as described in #custom_slave_dispatch. */
std::string custom_post_text; /**< \n trick_units(--) */
/** Highest level of messages to report. */
Verbosity verbosity; /**< \n trick_units(--) */
/** Defaults to false, in which case port numbers are found randomly. When true, the user-provided port numbers are used. */
bool default_port_flag; /**< \n trick_units(--) */
/** Device over which connections are accepted. */
TCDevice listen_device; /**< \n trick_units(--) */
/** Device over which data is sent and received. */
TCDevice connection_device; /**< \n trick_units(--) */
/** Device over which connections are accepted between the Slave child and Master. */
TCDevice data_listen_device; /**< \n trick_units(--) */
/** Device over which data is sent and received between Slave child and Master. */
TCDevice data_connection_device; /**< \n trick_units(--) */
/** Runs to be dispatched. */
std::deque <Trick::MonteRun *> runs; /**< \n trick_units(--) */
/** Failed runs. */
std::deque <Trick::MonteRun *> failed_runs; /**< \n trick_units(--) */
/** Valid ranges. */
std::vector <Trick::MonteRange *> run_ranges; /**< \n trick_units(--) */
/** Variables. */
std::vector <Trick::MonteVar *> variables; /**< \n trick_units(--) */
/** Slaves. */
std::vector <Trick::MonteSlave *> slaves; /**< \n trick_units(--) */
/** Number of slaves. Exists for Variable Server access. */
int num_slaves; /**< \n trick_units(--) */
/** List of slave pointers. Exists for Variable Server access. */
Trick::MonteSlave **slaves_head; /**< \n trick_units(--) */
/** Current run dispatched. */
unsigned int current_run; /**< \n trick_units(--) */
/** User-specified number of runs. */
unsigned int num_runs; /**< \n trick_units(--) */
/** Number of runs in range. */
unsigned int actual_num_runs; /**< \n trick_units(--) */
/** Number of results. */
unsigned int num_results; /**< \n trick_units(--) */
/** Time simulation began. */
double start_time; /**< \n trick_units(--) */
/** Time simulation ended. */
double end_time; /**< \n trick_units(--) */
/** Port on which the master is listening. This value is unspecified for the master. */
unsigned int master_port; /**< \n trick_units(--) */
/** Port on which the master is listening for data. This value is unspecified for the master. */
unsigned int data_port; /**< \n trick_units(--) */
/** Unique identifier. This value is zero for the master. */
unsigned int slave_id; /**< \n trick_units(--) */
/** Name of the machine on which this simulation is running. */
std::string machine_name; /**< \n trick_units(--) */
/** Run data file. */
FILE *run_data_file; /**< \n trick_io(**) */
/** Run header file. */
FILE *run_header_file; /**< \n trick_io(**) */
/** Run directory. */
std::string run_directory; /**< \n trick_units(--) */
/** NOTE(review): presumably the job currently being executed — confirm against the implementation. */
Trick::JobData * curr_job ; /**< trick_io(**) */
/** Return code to be returned by Executive::init(), Executive::loop(), and Executive::shutdown() \n */
int except_return ; /**< trick_io(**) */
/** File name of exceptions caught in init() and loop()\n */
std::string except_file ; /**< trick_io(**) */
/** Error message of exceptions caught in init() and loop()\n */
std::string except_message ; /**< trick_io(**) */
/** Jobs to be run by the master during initialization. */
Trick::ScheduledJobQueue master_init_queue; /**< \n trick_units(--) */
/** Jobs to be run by the master before each run. */
Trick::ScheduledJobQueue master_pre_queue; /**< \n trick_units(--) */
/** Jobs to be run by the master after each run. */
Trick::ScheduledJobQueue master_post_queue; /**< \n trick_units(--) */
/** Jobs to be run by the master during shutdown. */
Trick::ScheduledJobQueue master_shutdown_queue; /**< \n trick_units(--) */
/** Jobs to be run by the slave during initialization. */
Trick::ScheduledJobQueue slave_init_queue; /**< \n trick_units(--) */
/** Jobs to be run by the slave before each run. */
Trick::ScheduledJobQueue slave_pre_queue; /**< \n trick_units(--) */
/** Jobs to be run by the slave after each run. */
Trick::ScheduledJobQueue slave_post_queue; /**< \n trick_units(--) */
/** Jobs to be run by the slave during shutdown. */
Trick::ScheduledJobQueue slave_shutdown_queue; /**< \n trick_units(--) */
public:
/** Constructs a new MonteCarlo. */
MonteCarlo();
/** Destroys this MonteCarlo. */
~MonteCarlo();
/**
* S_define level job. Sends sims through master/slave logic if monte carlo is enabled.
*
* @return 0 on success
*/
int execute_monte();
/**
* Sets #enabled.
*
* @see @ref MonteCarloEnabling "Enabling Monte Carlo"
*/
void set_enabled(bool enabled);
/**
* Gets #enabled.
*
* @see @ref MonteCarloEnabling "Enabling Monte Carlo"
*/
bool get_enabled();
/**
* Sets #dry_run.
*
* @see @ref MonteCarloDryRun "Dry Run"
*/
void set_dry_run(bool dry_run);
/**
* Gets #dry_run.
*
* @see @ref MonteCarloDryRun "Dry Run"
*/
bool get_dry_run();
/**
* Returns true if executive is running as the slave,
* based on value of slave_id (which is > 0 for slave).
*/
bool is_slave();
/**
* Returns true if executive is running as the master,
* based on the value of slave_id (which is 0 for master).
*/
bool is_master();
/**
* Sets #localhost_as_remote.
*/
void set_localhost_as_remote(bool localhost_as_remote);
/**
* Gets #localhost_as_remote.
*/
bool get_localhost_as_remote();
/**
* Sets #custom_slave_dispatch.
*/
void set_custom_slave_dispatch(bool custom_slave_dispatch);
/**
* Gets #custom_slave_dispatch.
*/
bool get_custom_slave_dispatch();
/**
* Sets #timeout.
*/
void set_timeout(double timeout);
/**
* Gets #timeout.
*/
double get_timeout();
/**
* Sets #max_tries.
*/
void set_max_tries(unsigned int max_tries);
/**
* Gets #max_tries.
*/
unsigned int get_max_tries();
/**
* Sets #user_cmd_string.
*/
void set_user_cmd_string(std::string user_cmd_string);
/**
* Gets #user_cmd_string.
*/
std::string get_user_cmd_string();
/**
* Sets #custom_pre_text.
*/
void set_custom_pre_text(std::string custom_pre_text);
/**
* Gets #custom_pre_text.
*/
std::string get_custom_pre_text();
/**
* Sets #custom_post_text.
*/
void set_custom_post_text(std::string custom_post_text);
/**
* Gets #custom_post_text.
*/
std::string get_custom_post_text();
/**
* Sets #verbosity.
*/
void set_verbosity(Verbosity verbosity);
/**
* Gets #verbosity.
*/
Verbosity get_verbosity();
/**
* Sets #num_runs.
*
* @see @ref MonteCarloRuns "Specifying the Number of Runs"
*/
void set_num_runs(unsigned int num_runs);
/**
* Gets #num_runs.
*
* @see @ref MonteCarloRuns "Specifying the Number of Runs"
*/
unsigned int get_num_runs();
/**
* Gets #num_results.
*/
unsigned int get_num_results();
/**
* Gets #slave_id.
*/
unsigned int get_slave_id();
/**
* Adds the specified range to the list of valid ranges.
*
* @param start the starting run's id
* @param end the ending run's id (defaults to 0 when omitted)
*
* @see MonteRun::id
* @see MonteRange
* @see @ref MonteCarloRanges "Specifying Valid Ranges"
*/
void add_range(unsigned int start, unsigned int end = 0);
/**
* Determines if the specified run falls within a valid range.
*
* @param run the run in question
*
* @see @ref MonteCarloRanges "Specifying Valid Ranges"
*/
bool in_range(Trick::MonteRun *run);
/**
* Copies the current vector of valid run ranges into the argument vector.
*
* @param ranges the vector into which the ranges will be copied
*
* @see @ref MonteCarloRanges "Specifying Valid Ranges"
*/
void get_ranges(std::vector<MonteRange *> &ranges);
/**
* Adds the specified variable.
*
* @param variable the variable to add
*
* @see @ref MonteCarloVariables "Adding Variables"
*/
void add_variable(Trick::MonteVar *variable);
/**
* Adds a new slave with the specified machine name.
*
* @param machine_name the target machine's name
*
* @see @ref MonteCarloAddingSlaves "Adding Slaves"
*/
void add_slave(std::string machine_name);
/**
* Adds the specified slave.
*
* @param slave the slave to add
*
* @see @ref MonteCarloAddingSlaves "Adding Slaves"
*/
void add_slave(Trick::MonteSlave *slave);
/**
* Starts the slave with the specified id if it exists. Starting a slave puts it into a state in which it
* continuously accepts and processes runs from the master.
*
* @param id the id of the slave to start
*
* @see MonteSlave::id
*/
void start_slave(unsigned int id);
/**
* Stops the slave with the specified id if it exists. Stopping a slave puts it into a state in which it will not
* accept new runs from the master.
*
* @param id the id of the slave to stop
*
* @see MonteSlave::id
*/
void stop_slave(unsigned int id);
/**
* Disables the slave at initialization of the Master. Must be called before master_init is called
* (i.e. in input file, default_data jobs, or initialization jobs with a phase number = 0)
*
* @param name the name of the slave to disable
* @param disabled true if the slave should be disabled, false if the slave should be used.
*/
void disable_slave(std::string name, bool disabled);
/**
* Processes command line arguments specific to Monte Carlo simulations.
*
* @return 0 on success
*/
int process_sim_args();
/**
* Performs a Monte Carlo specific shutdown before calling Executive::shutdown().
*
* @return 0 on success
*/
int shutdown();
/** Gets #current_run being processed
*
* @return the current run number
*/
unsigned int get_current_run() ;
/** Sets the #current_run being processed
*
* NOTE(review): the parameter is a signed int although #current_run is unsigned; how a negative
* value is handled is decided by the implementation — confirm there.
*
* @param run_num the number to set the run
*/
void set_current_run(int run_num) ;
/** Retrieves the #data_connection_device
*
* @return the address of the data_connection_device
*/
TCDevice* get_data_connection_device();
/** Allows the user to set the port number for
* the listen_device
*
* @param port_number number for the port
*/
void set_listen_device_port(int port_number) ;
/** Allows the user to set the port number for
* the data_listen_device
*
* @param port_number number for the port
*/
void set_data_listen_device_port(int port_number) ;
/** Allows the user to set the port number for
* the connection_device
*
* @param port_number number for the port
*/
void set_connection_device_port(int port_number) ;
/** Allows the user to set the port number for
* the data_connection_device
*
* @param port_number number for the port
*/
void set_data_connection_device_port(int port_number) ;
/** Allows the user to get the port number for
* the listen_device
*
* @return the port number
*/
int get_listen_device_port() ;
/** Allows the user to get the port number for
* the data_listen_device
*
* @return the port number
*/
int get_data_listen_device_port() ;
/** Allows the user to get the port number for
* the connection_device
*
* @return the port number
*/
int get_connection_device_port() ;
/** Allows the user to get the port number for
* the data_connection_device
*
* @return the port number
*/
int get_data_connection_device_port() ;
#if 0
/**
Overload default implementation of Scheduler::add_sim_object
(currently compiled out by the enclosing #if 0)
*/
virtual int add_sim_object( Trick::SimObject * in_object ) ;
#endif
protected:
/**
* Initializes sockets.
*
* @param listen_device the device to initialize
*
* @return 0 on success
*/
int socket_init(TCDevice *listen_device);
/**
* Initializes the master.
*
* @return 0 on success
*/
int master_init();
/**
* Begins Monte Carlo simulation as the master.
*
* @return 0 on success
*/
int master();
/**
* Spawns all uninitialized slaves.
*
* @see MonteSlave::state
*/
void spawn_slaves();
/** Receives from any slaves that are ready to return results. */
void receive_results();
/** Receives the results from the slave */
void receive_slave_results() ;
/** NOTE(review): presumably reads the machine name reported by the given slave — confirm against the implementation. */
void read_machine_name(MonteSlave *curr_slave);
/** NOTE(review): presumably marks the given slave as MonteSlave::DISCONNECTED — confirm against the implementation. */
void set_disconnected_state(MonteSlave *curr_slave);
/** NOTE(review): presumably reads the port reported by the given slave — confirm against the implementation. */
void read_slave_port(MonteSlave *curr_slave);
/**
* Handles the retrying of the current run of the specified slave with the specified exit status.
*
* @param slave the slave processing the run
* @param exit_status the exit status of the run
*
* @see max_tries
*/
void handle_retry(MonteSlave *slave, MonteRun::ExitStatus exit_status);
/**
* Resolves the current run of the specified slave with the specified exit status.
*
* @param slave the slave processing the run
* @param exit_status the exit status of the run
*/
void resolve_run(MonteSlave *slave, MonteRun::ExitStatus exit_status);
/** Checks dispatched runs for timeouts. */
void check_timeouts();
/**
* Gets a slave that is ready for a new dispatch.
*
* @return a ready slave, or <code>NULL</code> if there is none
*/
MonteSlave *get_ready_slave();
/**
* Gets the slave with the specified id.
*
* @param id the slave's id
*
* @return the slave, or <code>NULL</code> if no such slave exists
*
* @see MonteSlave::id
*/
MonteSlave *get_slave(unsigned int id);
/**
* Gets the index within #slaves for the slave with the specified id.
*
* @param id the slave's id
*
* @return the slave's index, or -1 if no such slave exists
*
* @see MonteSlave::id
*/
int get_slave_index(unsigned int id);
/**
* Gets the next run to be dispatched.
*
* @return the next run, or <code>NULL</code> if there is none
*/
MonteRun *get_next_dispatch();
/**
* Prepares the specified run for dispatch.
*
* @param run the run to initialize
*/
int prepare_run(MonteRun *run);
/** Removes the specified run, if present, from #runs. */
void dequeue_run(MonteRun *run);
/**
* Dispatches the specified run to the specified slave.
*
* @param run the run to dispatch
* @param slave the target slave
*/
void dispatch_run_to_slave(MonteRun *run, MonteSlave *slave);
/** Updates the #num_slaves and #slaves_head to reflect the #slaves. */
void sync_slaves_head();
/** Updates #actual_num_runs. */
void update_actual_num_runs();
/** Shuts down the master. */
void master_shutdown();
/**
* Initializes the slave.
*
* @return 0 on success
*/
int slave_init();
/**
* Begins Monte Carlo simulation as a slave.
*
* @return 0 on success
*/
int slave();
/** Processes an incoming run. */
int slave_process_run();
/** Shuts down the slave. */
void slave_shutdown();
/** Kills the slave. */
void slave_die();
/** Kills the current run. */
void slave_kill_run();
/*
* Job instrumentation hooks and job-execution report writer.
* NOTE(review): presumably these implement the Trick::Scheduler interface — confirm against the base class.
*/
int instrument_job_before(Trick::JobData* instrument_job);
int instrument_job_after(Trick::JobData* instrument_job);
int instrument_job_remove(std::string in_job);
int write_s_job_execution(FILE* fp);
/**
* Determines if the specified strings are equivalent, ignoring case.
*
* @param string1 the first string
* @param string2 the second string
*
* @return the case-insensitive equivalency
*/
bool equals_ignore_case(std::string string1, std::string string2); // I am appalled by having to write this myself.
};
};
#endif