From 3de81b55f80146216f8dc6dbffc98e7520ccfbdc Mon Sep 17 00:00:00 2001 From: Joe Ranweiler Date: Thu, 1 Oct 2020 21:27:53 -0700 Subject: [PATCH] Transition to `Done` state on setup error, trigger node reimaging (#24) --- src/agent/onefuzz-supervisor/src/agent.rs | 17 +++++- .../onefuzz-supervisor/src/coordinator.rs | 10 +++- src/agent/onefuzz-supervisor/src/debug.rs | 5 +- src/agent/onefuzz-supervisor/src/scheduler.rs | 58 ++++++++++++++++--- .../__app__/agent_events/__init__.py | 29 +++++++++- src/pytypes/onefuzztypes/models.py | 49 +++++++++++----- 6 files changed, 140 insertions(+), 28 deletions(-) diff --git a/src/agent/onefuzz-supervisor/src/agent.rs b/src/agent/onefuzz-supervisor/src/agent.rs index 6ded0f6ad..eab9aa94a 100644 --- a/src/agent/onefuzz-supervisor/src/agent.rs +++ b/src/agent/onefuzz-supervisor/src/agent.rs @@ -174,6 +174,7 @@ impl Agent { let scheduler = match state.finish(self.setup_runner.as_mut()).await? { SetupDone::Ready(s) => s.into(), SetupDone::PendingReboot(s) => s.into(), + SetupDone::Done(s) => s.into(), }; Ok(scheduler) @@ -220,7 +221,21 @@ impl Agent { async fn done(&mut self, state: State) -> Result { verbose!("agent done"); - let event = StateUpdateEvent::Done.into(); + let event = match state.cause() { + DoneCause::SetupError { + error, + script_output, + } => StateUpdateEvent::Done { + error: Some(error), + script_output, + }, + DoneCause::Stopped | DoneCause::WorkersDone => StateUpdateEvent::Done { + error: None, + script_output: None, + }, + }; + + let event = event.into(); self.coordinator.emit_event(event).await?; // `Done` is a final state. diff --git a/src/agent/onefuzz-supervisor/src/coordinator.rs b/src/agent/onefuzz-supervisor/src/coordinator.rs index 9f494ab86..5d4e16cee 100644 --- a/src/agent/onefuzz-supervisor/src/coordinator.rs +++ b/src/agent/onefuzz-supervisor/src/coordinator.rs @@ -9,6 +9,7 @@ use uuid::Uuid; use crate::auth::AccessToken; use crate::config::Registration; +use crate::process::Output; use crate::work::{TaskId, WorkSet}; use crate::worker::WorkerEvent; @@ -82,11 +83,16 @@ impl From for NodeEvent { pub enum StateUpdateEvent { Init, Free, - SettingUp { tasks: Vec }, + SettingUp { + tasks: Vec, + }, Rebooting, Ready, Busy, - Done, + Done { + error: Option, + script_output: Option, + }, } impl From for NodeEvent { diff --git a/src/agent/onefuzz-supervisor/src/debug.rs b/src/agent/onefuzz-supervisor/src/debug.rs index 879fb1019..a83a3cbeb 100644 --- a/src/agent/onefuzz-supervisor/src/debug.rs +++ b/src/agent/onefuzz-supervisor/src/debug.rs @@ -60,7 +60,10 @@ fn debug_node_event_state_update(state: NodeState) -> Result<()> { NodeState::Rebooting => StateUpdateEvent::Rebooting, NodeState::Ready => StateUpdateEvent::Ready, NodeState::Busy => StateUpdateEvent::Busy, - NodeState::Done => StateUpdateEvent::Done, + NodeState::Done => StateUpdateEvent::Done { + error: None, + script_output: None, + }, }; let event = event.into(); print_json(into_envelope(event)) diff --git a/src/agent/onefuzz-supervisor/src/scheduler.rs b/src/agent/onefuzz-supervisor/src/scheduler.rs index 2980fb2fd..9f6302c21 100644 --- a/src/agent/onefuzz-supervisor/src/scheduler.rs +++ b/src/agent/onefuzz-supervisor/src/scheduler.rs @@ -6,6 +6,7 @@ use std::fmt; use anyhow::Result; use crate::coordinator::NodeCommand; +use crate::process::Output; use crate::reboot::RebootContext; use crate::setup::ISetupRunner; use crate::work::*; @@ -47,7 +48,10 @@ impl Scheduler { } } NodeCommand::Stop {} => { - let state = State { ctx: Done {} }; + let cause = DoneCause::Stopped; + let state = State { + ctx: Done { cause }, + }; *self = state.into(); } } @@ -81,7 +85,19 @@ pub struct Busy { workers: Vec>, } -pub struct Done; +pub struct Done { + cause: DoneCause, +} + +#[derive(Clone, Debug)] +pub enum DoneCause { + SetupError { + error: String, + script_output: Option, + }, + Stopped, + WorkersDone, +} pub trait Context {} @@ -129,17 +145,36 @@ impl State { pub enum SetupDone { Ready(State), PendingReboot(State), + Done(State), } impl State { pub async fn finish(self, runner: &mut dyn ISetupRunner) -> Result { let work_set = self.ctx.work_set; - let output = runner.run(&work_set).await?; + let output = runner.run(&work_set).await; - if let Some(output) = output { - if !output.exit_status.success { - anyhow::bail!("setup script failed: {:?}", output); + match output { + Ok(Some(output)) => { + if !output.exit_status.success { + let cause = DoneCause::SetupError { + error: "error running target setup script".to_owned(), + script_output: Some(output), + }; + let ctx = Done { cause }; + return Ok(SetupDone::Done(ctx.into())); + } + } + Ok(None) => { + // No script was executed. + } + Err(err) => { + let cause = DoneCause::SetupError { + error: err.to_string(), + script_output: None, + }; + let ctx = Done { cause }; + return Ok(SetupDone::Done(ctx.into())); } } @@ -194,7 +229,10 @@ impl State { } let updated = if self.all_workers_done() { - Updated::Done(Done.into()) + let done = Done { + cause: DoneCause::WorkersDone, + }; + Updated::Done(done.into()) } else { Updated::Busy(self) }; @@ -238,7 +276,11 @@ impl From for Scheduler { } } -impl State {} +impl State { + pub fn cause(&self) -> DoneCause { + self.ctx.cause.clone() + } +} impl From> for Scheduler { fn from(ctx: Option) -> Self { diff --git a/src/api-service/__app__/agent_events/__init__.py b/src/api-service/__app__/agent_events/__init__.py index 7a4b95674..0bac18258 100644 --- a/src/api-service/__app__/agent_events/__init__.py +++ b/src/api-service/__app__/agent_events/__init__.py @@ -4,14 +4,17 @@ # Licensed under the MIT License. import logging +from typing import Optional, cast from uuid import UUID import azure.functions as func from onefuzztypes.enums import ErrorCode, NodeState, NodeTaskState, TaskState from onefuzztypes.models import ( Error, + NodeDoneEventData, NodeEvent, NodeEventEnvelope, + NodeSettingUpEventData, NodeStateUpdate, WorkerEvent, ) @@ -54,10 +57,17 @@ def on_state_update( node.save() if state == NodeState.setting_up: + # Model-validated. + # # This field will be required in the future. # For now, it is optional for back compat. - if state_update.data: - for task_id in state_update.data.tasks: + setting_up_data = cast( + Optional[NodeSettingUpEventData], + state_update.data, + ) + + if setting_up_data: + for task_id in setting_up_data.tasks: task = get_task_checked(task_id) # The task state may be `running` if it has `vm_count` > 1, and @@ -82,6 +92,20 @@ def on_state_update( state=NodeTaskState.setting_up, ) node_task.save() + elif state == NodeState.done: + # Model-validated. + # + # This field will be required in the future. + # For now, it is optional for back compat. + done_data = cast(Optional[NodeDoneEventData], state_update.data) + + if done_data: + if done_data.error: + logging.error( + "node `done` with error: machine_id = %s, data = %s", + machine_id, + done_data, + ) else: logging.info("ignoring state updates from the node: %s: %s", machine_id, state) @@ -140,6 +164,7 @@ def on_worker_event(machine_id: UUID, event: WorkerEvent) -> func.HttpResponse: task.save() node.save() + task_event = TaskEvent(task_id=task_id, machine_id=machine_id, event_data=event) task_event.save() return ok(BoolResult(result=True)) diff --git a/src/pytypes/onefuzztypes/models.py b/src/pytypes/onefuzztypes/models.py index 7781da698..55192dc18 100644 --- a/src/pytypes/onefuzztypes/models.py +++ b/src/pytypes/onefuzztypes/models.py @@ -484,6 +484,12 @@ class ExitStatus(BaseModel): success: bool +class ProcessOutput(BaseModel): + exit_status: ExitStatus + stderr: str + stdout: str + + class WorkerRunningEvent(BaseModel): task_id: UUID @@ -500,28 +506,43 @@ class WorkerEvent(EnumModel): running: Optional[WorkerRunningEvent] -class SettingUpEventData(BaseModel): +class NodeSettingUpEventData(BaseModel): tasks: List[UUID] +class NodeDoneEventData(BaseModel): + error: Optional[str] + script_output: Optional[ProcessOutput] + + +NodeStateData = Union[NodeSettingUpEventData, NodeDoneEventData] + + class NodeStateUpdate(BaseModel): state: NodeState - data: Optional[SettingUpEventData] + data: Optional[NodeStateData] + + @root_validator(pre=False, skip_on_failure=True) + def check_data(cls, values: Any) -> Any: + data = values.get("data") - @validator("data") - def check_data( - cls, - data: Optional[SettingUpEventData], - values: Any, - ) -> Optional[SettingUpEventData]: if data: - state = values.get("state") - if state and state != NodeState.setting_up: - raise ValueError( - "data for node state update event does not match state = %s" % state - ) + state = values["state"] - return data + if state == NodeState.setting_up: + if isinstance(data, NodeSettingUpEventData): + return values + + if state == NodeState.done: + if isinstance(data, NodeDoneEventData): + return values + + raise ValueError( + "data for node state update event does not match state = %s" % state + ) + else: + # For now, `data` is always optional. + return values class NodeEvent(EnumModel):