Mirror of https://github.com/nasa/trick.git, synced 2024-12-18 20:57:55 +00:00

Commit 9099792947
* Provide MonteCarloGenerate capability

  Intermediate commit; this squash represents all of Isaac Reaves' work during his Fall 2022 Pathways internship tour [skip ci]

* TrickOps: Add phase, [min-max] range, and overhaul YAML verification (see the hypothetical YAML sketch after this message)

  * Add new "phase:" mechanism to TrickOps Runs and Builds to support project-specific constraints on build and run ordering
    - phase defaults to zero if not specified and must be between -1000 and 1000 if given
    - jobs can now optionally be requested by their phase or phase range
    - See trickops/README.md for details
  * Add [min-max] notation capability to run: entries and compare: entries
    - [min-max] ranges define a set of runs using a common numbering scheme in the YAML file, greatly reducing YAML file size for monte-carlo and other zero-padded run-numbering use cases
    - See trickops/README.md for details
  * YAML parsing changes
    - Overhaul the logic which verifies YAML files for the expected TrickOps format. This is now done in TrickWorkflowYamlVerifier and provides much more robust error checking than the previous approach
    - .yaml_requirements.yml now provides the required types, ranges, and default values, as applicable, for expected entries in YAML files
    - valgrind: is now a sub-option to run: entries, not its own section. Users should now list their runs normally and define their flags in that run's valgrind: subsection
    - parallel_safety is now a per-sim parameter, not global. Users should move their global config to the sim layer
    - self.config_errors is now a list of errors. Users should check for an empty list instead of True/False
  * Robustify the get_koviz_report_jobs unit test to work whether or not koviz exists on PATH
  * Adjust trickops.py to use the new phase and range features
    - Make it more configurable on the command line via argparse
    - Move SIM_mc_generation tests into test_sims.yml [skip ci]

* Code review and cleanup from PR #1389

  Documentation:
  * Adjust documentation to fit the suggested symlinked approach; also clean up duplicate images and old documentation
  * Move the verification section out of markdown and into a PDF, since it heavily leverages formatting not available in markdown
  * Clarify a couple of points in the Darwin Trick install guide
  * Update the wiki to clarify that data recording of strings is not supported

  MCG Code:
  * Replace MonteCarloVariableRandomNormal::is_near_equal with the new Trick::dbl_is_near from the Trick team

  MCG Testing:
  * Reduce the set of SIM_mc_generation comparisons. After discussion with the Trick team, we are choosing to remove all comparisons to verif_data/ which contain randomly generated numbers, since these tests cannot pass across all supported Trick platforms
  * Fix the wrong rule on excluding -Werror for Darwin builds of SIM_mc_generation
  * Remove data recording of strings in SIM_mc_generation

  TrickOps:
  * Replace build_command with build_args per discussion with the Trick team. Since we only support arguments to trick-CP, replace the build_command yaml entry with build_args
  * Disable var server connection by default in SingleRun if TrickWorkflow.quiet is True
  * Guard against multiple Job starts
  * Remove the SimulationJob inheritance layer, since old monte-carlo wasn't and never will be supported by TrickOps
  * Ignore the IOError raised from variable_server that looks like "The remote endpoint has closed the connection". This appears to occur when SingleRun jobs attempt to connect to the var server for a sim that terminates very early [skip ci]

* Adjust phasing of old/new MCG initialize functions
  * Clarify the failure message in generate_dispersions if new and old MC are both used
  * Adjust the phasing order of the MCG initialize method to run before legacy MC initialization. Without this, a monte-carlo dry run completes with success before the check in generate_dispersions() can run

* Add -Wno-stringop-truncation to S_override.mk for SIM_mc_generation, since gcc 8+ warns about SWIG-generated content in top.cpp

* Introduce MonteCarloGenerationHelper python class (a hypothetical usage sketch follows this message)

  This new class provides an easy-to-use interface for MCG sim-module users:
  1. Run generation
  2. Getting an sbatch array job suitable for SLURM
  3. Getting a list of SingleRun() instances for generated runs, to be executed locally if desired

---------

Co-authored-by: Dan Jordan <daniel.d.jordan@nasa.gov>
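As a rough illustration of the YAML changes described above (per-run phase:, [min-max] run ranges, valgrind: as a run sub-option, per-sim parallel_safety, and build_args), a hypothetical sim entry might look like the sketch below. Sim and run names, paths, and any key or value not named in this commit message are assumptions; trickops/README.md is the authoritative reference.

SIM_example:                                  # hypothetical sim name and path
    path: test/SIM_example
    build_args: "--someflag"                  # replaces the old build_command entry; value is a placeholder
    parallel_safety: loose                    # now a per-sim setting rather than global (value shown is an assumption)
    runs:
        RUN_checkpoint_dump/input.py:
            phase: -1                         # runs before default-phase (0) jobs; must be within [-1000, 1000]
            returns: 0
        RUN_[0000-0999]/monte_input.py:       # [min-max] range expands to a set of zero-padded runs; compare: entries may use the same notation
            returns: 0
        RUN_memcheck/input.py:
            returns: 0
            valgrind:                         # valgrind is now a sub-option of the run entry, not its own section
                flags: --track-origins=yes    # flag value is illustrative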
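The MonteCarloGenerationHelper item also lends itself to a short usage sketch of its three listed capabilities. This is a hypothetical illustration only: the module path, constructor arguments, and method names are assumptions, not a confirmed API; consult share/trick/trickops in the repository for the actual interface.

# Hypothetical sketch only: class, constructor, and method names below are assumptions
# inferred from the three capabilities listed in the commit message, not a confirmed API.
import os, sys

trick_home = os.environ["TRICK_HOME"]                      # assumes TRICK_HOME is set
sys.path.append(os.path.join(trick_home, "share/trick/trickops"))

from MonteCarloGenerationHelper import MonteCarloGenerationHelper  # assumed module/class name

# 1. Run generation for an MCG-enabled sim and input file (paths are illustrative)
mcg = MonteCarloGenerationHelper(sim_path="path/to/SIM_mc", input_path="RUN_mc/input.py")
generation_job = mcg.get_generation_job()                  # assumed method name

# 2. SLURM: an sbatch array job covering the generated runs (assumed method/argument names)
sbatch_job = mcg.get_sbatch_job(monte_dir="MONTE_RUN_mc")

# 3. Local execution: SingleRun-style jobs for the generated runs, runnable via a TrickWorkflow
local_run_jobs = mcg.get_generated_run_jobs(monte_dir="MONTE_RUN_mc")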
100 lines
5.3 KiB
Python
import sys
import os
import argparse

thisdir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
sys.path.append(os.path.join(thisdir, "share/trick/trickops"))

from TrickWorkflow import *
from WorkflowCommon import Job

max_retries = 5
class SimTestWorkflow(TrickWorkflow):
    def __init__(self, quiet, trick_top_level, cpus):
        self.cpus = cpus
        # Create the trick_test directory if it doesn't already exist
        if not os.path.exists(trick_top_level + "/trick_test"):
            os.makedirs(trick_top_level + "/trick_test")

        # Base class initialize; this creates internal management structures
        TrickWorkflow.__init__(self, project_top_level=trick_top_level, log_dir=(trick_top_level + '/trickops_logs/'),
            trick_dir=trick_top_level, config_file=(trick_top_level + "/test_sims.yml"), cpus=self.cpus, quiet=quiet)
    def run(self):
        build_jobs = self.get_jobs(kind='build')
        # Two sims have runs that require ordering via phases:
        # - SIM_stls dumps a checkpoint that is then read in and checked by a subsequent run
        # - SIM_checkpoint_data_recording dumps checkpoints that are read by subsequent runs
        first_run_jobs = self.get_jobs(kind='run', phase=-1)     # Get all jobs with early phase -1
        remaining_run_jobs = self.get_jobs(kind='run', phase=0)  # Get all jobs with default phase 0
        analysis_jobs = self.get_jobs(kind='analyze')

        # Some tests fail intermittently for reasons not related to the tests themselves, mostly network weirdness.
        # Allow retries so that we can still cover some network-adjacent code.
        retry_allowed_sims = self.get_sims(labels='retries_allowed')
        # Flatten the runs of all retry-allowed sims and collect their run jobs
        retry_allowed_runs = [run for sim in retry_allowed_sims for run in sim.get_runs()]
        retry_allowed_jobs = [run.get_run_job() for run in retry_allowed_runs]
        for job in retry_allowed_jobs:
            # Note there's an assumption/dependency here that 'retries_allowed' runs
            # are only in the remaining_run_jobs list. - Jordan 2/2023
            remaining_run_jobs.remove(job)

        builds_status = self.execute_jobs(build_jobs, max_concurrent=self.cpus, header='Executing all sim builds.')
        first_phase_run_status = self.execute_jobs(first_run_jobs, max_concurrent=self.cpus, header="Executing first phase runs.")
        runs_status = self.execute_jobs(remaining_run_jobs, max_concurrent=self.cpus, header='Executing remaining runs.')
        # Run the retry_allowed jobs
        self.execute_jobs(retry_allowed_jobs, max_concurrent=self.cpus, header='Executing retry-allowed runs.')

        # If anything failed, try it again up to max_retries times
        all_retried_status = 0
        final_retry_jobs = []
        for sim in retry_allowed_sims:
            failing_runs = [run for run in sim.get_runs() if run.get_run_job().get_status() == Job.Status.FAILED]
            for run in failing_runs:
                status, final_job = self.retry_job(sim, run, max_retries)
                final_retry_jobs += [final_job]
                all_retried_status = all_retried_status or status

        comparison_result = self.compare()
        analysis_status = self.execute_jobs(analysis_jobs, max_concurrent=self.cpus, header='Executing all analysis.')

        self.report()          # Print verbose report
        self.status_summary()  # Print a succinct summary

        # Dump the logs of any failing jobs
        jobs = build_jobs + first_run_jobs + remaining_run_jobs + final_retry_jobs
        for job in jobs:
            if job.get_status() == Job.Status.FAILED:
                print("Failing job: ", job.name)
                print("*" * 120)
                with open(job.log_file, "r") as f:
                    print(f.read())
                print("*" * 120, "\n")

        # Nonzero (failing) overall status if any build, run, retry, comparison, or analysis
        # job failed, or if the YAML config had errors
        return (builds_status or runs_status or first_phase_run_status or all_retried_status
                or len(self.config_errors) > 0 or comparison_result or analysis_status)
    # Retries a job up to max_retries times and adds the retry runs to the sim
    # Returns a tuple of (job_status, final retry job)
    def retry_job(self, sim, run, max_retries):
        tries = 0
        job_failing = 1
        retry_run = None
        retry_job = None
        while tries < max_retries and job_failing:
            tries += 1
            retry_run = TrickWorkflow.Run(sim_dir=run.sim_dir, input_file=run.input_file, binary=run.binary,
                returns=run.returns, log_dir=run.log_dir)
            retry_job = retry_run.get_run_job()
            retry_job.name = retry_job.name + "_retry_" + str(tries)
            job_failing = self.execute_jobs([retry_job], max_concurrent=1, header="Retrying failed job")
            sim.add_run(retry_run)

        return (job_failing, retry_job)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Build, run, and compare all test sims for Trick',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--trick_top_level", type=str, help="Path to TRICK_HOME", default=thisdir)
    parser.add_argument("--quiet", action="store_true", help="Suppress progress bars.")
    parser.add_argument("--cpus", type=int, default=(os.cpu_count() if os.cpu_count() is not None else 8),
        help="Number of cpus to use for testing. For builds this number is used for MAKEFLAGS *and* the number of "
             "concurrent builds (cpus^2). For sim runs this controls the maximum number of simultaneous runs.")
    myargs = parser.parse_args()
    sys.exit(SimTestWorkflow(quiet=myargs.quiet, trick_top_level=myargs.trick_top_level, cpus=myargs.cpus).run())