Transition to Done state on setup error, trigger node reimaging (#24)

This commit is contained in:
Joe Ranweiler 2020-10-01 21:27:53 -07:00 committed by GitHub
parent f72543dde2
commit 3de81b55f8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 140 additions and 28 deletions

View File

@ -174,6 +174,7 @@ impl Agent {
let scheduler = match state.finish(self.setup_runner.as_mut()).await? {
SetupDone::Ready(s) => s.into(),
SetupDone::PendingReboot(s) => s.into(),
SetupDone::Done(s) => s.into(),
};
Ok(scheduler)
@ -220,7 +221,21 @@ impl Agent {
async fn done(&mut self, state: State<Done>) -> Result<Scheduler> {
verbose!("agent done");
let event = StateUpdateEvent::Done.into();
let event = match state.cause() {
DoneCause::SetupError {
error,
script_output,
} => StateUpdateEvent::Done {
error: Some(error),
script_output,
},
DoneCause::Stopped | DoneCause::WorkersDone => StateUpdateEvent::Done {
error: None,
script_output: None,
},
};
let event = event.into();
self.coordinator.emit_event(event).await?;
// `Done` is a final state.

View File

@ -9,6 +9,7 @@ use uuid::Uuid;
use crate::auth::AccessToken;
use crate::config::Registration;
use crate::process::Output;
use crate::work::{TaskId, WorkSet};
use crate::worker::WorkerEvent;
@ -82,11 +83,16 @@ impl From<WorkerEvent> for NodeEvent {
pub enum StateUpdateEvent {
Init,
Free,
SettingUp { tasks: Vec<TaskId> },
SettingUp {
tasks: Vec<TaskId>,
},
Rebooting,
Ready,
Busy,
Done,
Done {
error: Option<String>,
script_output: Option<Output>,
},
}
impl From<StateUpdateEvent> for NodeEvent {

View File

@ -60,7 +60,10 @@ fn debug_node_event_state_update(state: NodeState) -> Result<()> {
NodeState::Rebooting => StateUpdateEvent::Rebooting,
NodeState::Ready => StateUpdateEvent::Ready,
NodeState::Busy => StateUpdateEvent::Busy,
NodeState::Done => StateUpdateEvent::Done,
NodeState::Done => StateUpdateEvent::Done {
error: None,
script_output: None,
},
};
let event = event.into();
print_json(into_envelope(event))

View File

@ -6,6 +6,7 @@ use std::fmt;
use anyhow::Result;
use crate::coordinator::NodeCommand;
use crate::process::Output;
use crate::reboot::RebootContext;
use crate::setup::ISetupRunner;
use crate::work::*;
@ -47,7 +48,10 @@ impl Scheduler {
}
}
NodeCommand::Stop {} => {
let state = State { ctx: Done {} };
let cause = DoneCause::Stopped;
let state = State {
ctx: Done { cause },
};
*self = state.into();
}
}
@ -81,7 +85,19 @@ pub struct Busy {
workers: Vec<Option<Worker>>,
}
pub struct Done;
pub struct Done {
cause: DoneCause,
}
#[derive(Clone, Debug)]
pub enum DoneCause {
SetupError {
error: String,
script_output: Option<Output>,
},
Stopped,
WorkersDone,
}
pub trait Context {}
@ -129,17 +145,36 @@ impl State<Free> {
pub enum SetupDone {
Ready(State<Ready>),
PendingReboot(State<PendingReboot>),
Done(State<Done>),
}
impl State<SettingUp> {
pub async fn finish(self, runner: &mut dyn ISetupRunner) -> Result<SetupDone> {
let work_set = self.ctx.work_set;
let output = runner.run(&work_set).await?;
let output = runner.run(&work_set).await;
if let Some(output) = output {
if !output.exit_status.success {
anyhow::bail!("setup script failed: {:?}", output);
match output {
Ok(Some(output)) => {
if !output.exit_status.success {
let cause = DoneCause::SetupError {
error: "error running target setup script".to_owned(),
script_output: Some(output),
};
let ctx = Done { cause };
return Ok(SetupDone::Done(ctx.into()));
}
}
Ok(None) => {
// No script was executed.
}
Err(err) => {
let cause = DoneCause::SetupError {
error: err.to_string(),
script_output: None,
};
let ctx = Done { cause };
return Ok(SetupDone::Done(ctx.into()));
}
}
@ -194,7 +229,10 @@ impl State<Busy> {
}
let updated = if self.all_workers_done() {
Updated::Done(Done.into())
let done = Done {
cause: DoneCause::WorkersDone,
};
Updated::Done(done.into())
} else {
Updated::Busy(self)
};
@ -238,7 +276,11 @@ impl From<Updated> for Scheduler {
}
}
impl State<Done> {}
impl State<Done> {
pub fn cause(&self) -> DoneCause {
self.ctx.cause.clone()
}
}
impl From<Option<RebootContext>> for Scheduler {
fn from(ctx: Option<RebootContext>) -> Self {

View File

@ -4,14 +4,17 @@
# Licensed under the MIT License.
import logging
from typing import Optional, cast
from uuid import UUID
import azure.functions as func
from onefuzztypes.enums import ErrorCode, NodeState, NodeTaskState, TaskState
from onefuzztypes.models import (
Error,
NodeDoneEventData,
NodeEvent,
NodeEventEnvelope,
NodeSettingUpEventData,
NodeStateUpdate,
WorkerEvent,
)
@ -54,10 +57,17 @@ def on_state_update(
node.save()
if state == NodeState.setting_up:
# Model-validated.
#
# This field will be required in the future.
# For now, it is optional for back compat.
if state_update.data:
for task_id in state_update.data.tasks:
setting_up_data = cast(
Optional[NodeSettingUpEventData],
state_update.data,
)
if setting_up_data:
for task_id in setting_up_data.tasks:
task = get_task_checked(task_id)
# The task state may be `running` if it has `vm_count` > 1, and
@ -82,6 +92,20 @@ def on_state_update(
state=NodeTaskState.setting_up,
)
node_task.save()
elif state == NodeState.done:
# Model-validated.
#
# This field will be required in the future.
# For now, it is optional for back compat.
done_data = cast(Optional[NodeDoneEventData], state_update.data)
if done_data:
if done_data.error:
logging.error(
"node `done` with error: machine_id = %s, data = %s",
machine_id,
done_data,
)
else:
logging.info("ignoring state updates from the node: %s: %s", machine_id, state)
@ -140,6 +164,7 @@ def on_worker_event(machine_id: UUID, event: WorkerEvent) -> func.HttpResponse:
task.save()
node.save()
task_event = TaskEvent(task_id=task_id, machine_id=machine_id, event_data=event)
task_event.save()
return ok(BoolResult(result=True))

View File

@ -484,6 +484,12 @@ class ExitStatus(BaseModel):
success: bool
class ProcessOutput(BaseModel):
exit_status: ExitStatus
stderr: str
stdout: str
class WorkerRunningEvent(BaseModel):
task_id: UUID
@ -500,28 +506,43 @@ class WorkerEvent(EnumModel):
running: Optional[WorkerRunningEvent]
class SettingUpEventData(BaseModel):
class NodeSettingUpEventData(BaseModel):
tasks: List[UUID]
class NodeDoneEventData(BaseModel):
error: Optional[str]
script_output: Optional[ProcessOutput]
NodeStateData = Union[NodeSettingUpEventData, NodeDoneEventData]
class NodeStateUpdate(BaseModel):
state: NodeState
data: Optional[SettingUpEventData]
data: Optional[NodeStateData]
@root_validator(pre=False, skip_on_failure=True)
def check_data(cls, values: Any) -> Any:
data = values.get("data")
@validator("data")
def check_data(
cls,
data: Optional[SettingUpEventData],
values: Any,
) -> Optional[SettingUpEventData]:
if data:
state = values.get("state")
if state and state != NodeState.setting_up:
raise ValueError(
"data for node state update event does not match state = %s" % state
)
state = values["state"]
return data
if state == NodeState.setting_up:
if isinstance(data, NodeSettingUpEventData):
return values
if state == NodeState.done:
if isinstance(data, NodeDoneEventData):
return values
raise ValueError(
"data for node state update event does not match state = %s" % state
)
else:
# For now, `data` is always optional.
return values
class NodeEvent(EnumModel):