mirror of
https://github.com/microsoft/onefuzz.git
synced 2025-06-09 16:51:35 +00:00
Transition to Done
state on setup error, trigger node reimaging (#24)
This commit is contained in:
parent
f72543dde2
commit
3de81b55f8
@ -174,6 +174,7 @@ impl Agent {
|
||||
let scheduler = match state.finish(self.setup_runner.as_mut()).await? {
|
||||
SetupDone::Ready(s) => s.into(),
|
||||
SetupDone::PendingReboot(s) => s.into(),
|
||||
SetupDone::Done(s) => s.into(),
|
||||
};
|
||||
|
||||
Ok(scheduler)
|
||||
@ -220,7 +221,21 @@ impl Agent {
|
||||
async fn done(&mut self, state: State<Done>) -> Result<Scheduler> {
|
||||
verbose!("agent done");
|
||||
|
||||
let event = StateUpdateEvent::Done.into();
|
||||
let event = match state.cause() {
|
||||
DoneCause::SetupError {
|
||||
error,
|
||||
script_output,
|
||||
} => StateUpdateEvent::Done {
|
||||
error: Some(error),
|
||||
script_output,
|
||||
},
|
||||
DoneCause::Stopped | DoneCause::WorkersDone => StateUpdateEvent::Done {
|
||||
error: None,
|
||||
script_output: None,
|
||||
},
|
||||
};
|
||||
|
||||
let event = event.into();
|
||||
self.coordinator.emit_event(event).await?;
|
||||
|
||||
// `Done` is a final state.
|
||||
|
@ -9,6 +9,7 @@ use uuid::Uuid;
|
||||
|
||||
use crate::auth::AccessToken;
|
||||
use crate::config::Registration;
|
||||
use crate::process::Output;
|
||||
use crate::work::{TaskId, WorkSet};
|
||||
use crate::worker::WorkerEvent;
|
||||
|
||||
@ -82,11 +83,16 @@ impl From<WorkerEvent> for NodeEvent {
|
||||
pub enum StateUpdateEvent {
|
||||
Init,
|
||||
Free,
|
||||
SettingUp { tasks: Vec<TaskId> },
|
||||
SettingUp {
|
||||
tasks: Vec<TaskId>,
|
||||
},
|
||||
Rebooting,
|
||||
Ready,
|
||||
Busy,
|
||||
Done,
|
||||
Done {
|
||||
error: Option<String>,
|
||||
script_output: Option<Output>,
|
||||
},
|
||||
}
|
||||
|
||||
impl From<StateUpdateEvent> for NodeEvent {
|
||||
|
@ -60,7 +60,10 @@ fn debug_node_event_state_update(state: NodeState) -> Result<()> {
|
||||
NodeState::Rebooting => StateUpdateEvent::Rebooting,
|
||||
NodeState::Ready => StateUpdateEvent::Ready,
|
||||
NodeState::Busy => StateUpdateEvent::Busy,
|
||||
NodeState::Done => StateUpdateEvent::Done,
|
||||
NodeState::Done => StateUpdateEvent::Done {
|
||||
error: None,
|
||||
script_output: None,
|
||||
},
|
||||
};
|
||||
let event = event.into();
|
||||
print_json(into_envelope(event))
|
||||
|
@ -6,6 +6,7 @@ use std::fmt;
|
||||
use anyhow::Result;
|
||||
|
||||
use crate::coordinator::NodeCommand;
|
||||
use crate::process::Output;
|
||||
use crate::reboot::RebootContext;
|
||||
use crate::setup::ISetupRunner;
|
||||
use crate::work::*;
|
||||
@ -47,7 +48,10 @@ impl Scheduler {
|
||||
}
|
||||
}
|
||||
NodeCommand::Stop {} => {
|
||||
let state = State { ctx: Done {} };
|
||||
let cause = DoneCause::Stopped;
|
||||
let state = State {
|
||||
ctx: Done { cause },
|
||||
};
|
||||
*self = state.into();
|
||||
}
|
||||
}
|
||||
@ -81,7 +85,19 @@ pub struct Busy {
|
||||
workers: Vec<Option<Worker>>,
|
||||
}
|
||||
|
||||
pub struct Done;
|
||||
pub struct Done {
|
||||
cause: DoneCause,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum DoneCause {
|
||||
SetupError {
|
||||
error: String,
|
||||
script_output: Option<Output>,
|
||||
},
|
||||
Stopped,
|
||||
WorkersDone,
|
||||
}
|
||||
|
||||
pub trait Context {}
|
||||
|
||||
@ -129,17 +145,36 @@ impl State<Free> {
|
||||
pub enum SetupDone {
|
||||
Ready(State<Ready>),
|
||||
PendingReboot(State<PendingReboot>),
|
||||
Done(State<Done>),
|
||||
}
|
||||
|
||||
impl State<SettingUp> {
|
||||
pub async fn finish(self, runner: &mut dyn ISetupRunner) -> Result<SetupDone> {
|
||||
let work_set = self.ctx.work_set;
|
||||
|
||||
let output = runner.run(&work_set).await?;
|
||||
let output = runner.run(&work_set).await;
|
||||
|
||||
if let Some(output) = output {
|
||||
if !output.exit_status.success {
|
||||
anyhow::bail!("setup script failed: {:?}", output);
|
||||
match output {
|
||||
Ok(Some(output)) => {
|
||||
if !output.exit_status.success {
|
||||
let cause = DoneCause::SetupError {
|
||||
error: "error running target setup script".to_owned(),
|
||||
script_output: Some(output),
|
||||
};
|
||||
let ctx = Done { cause };
|
||||
return Ok(SetupDone::Done(ctx.into()));
|
||||
}
|
||||
}
|
||||
Ok(None) => {
|
||||
// No script was executed.
|
||||
}
|
||||
Err(err) => {
|
||||
let cause = DoneCause::SetupError {
|
||||
error: err.to_string(),
|
||||
script_output: None,
|
||||
};
|
||||
let ctx = Done { cause };
|
||||
return Ok(SetupDone::Done(ctx.into()));
|
||||
}
|
||||
}
|
||||
|
||||
@ -194,7 +229,10 @@ impl State<Busy> {
|
||||
}
|
||||
|
||||
let updated = if self.all_workers_done() {
|
||||
Updated::Done(Done.into())
|
||||
let done = Done {
|
||||
cause: DoneCause::WorkersDone,
|
||||
};
|
||||
Updated::Done(done.into())
|
||||
} else {
|
||||
Updated::Busy(self)
|
||||
};
|
||||
@ -238,7 +276,11 @@ impl From<Updated> for Scheduler {
|
||||
}
|
||||
}
|
||||
|
||||
impl State<Done> {}
|
||||
impl State<Done> {
|
||||
pub fn cause(&self) -> DoneCause {
|
||||
self.ctx.cause.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Option<RebootContext>> for Scheduler {
|
||||
fn from(ctx: Option<RebootContext>) -> Self {
|
||||
|
@ -4,14 +4,17 @@
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import logging
|
||||
from typing import Optional, cast
|
||||
from uuid import UUID
|
||||
|
||||
import azure.functions as func
|
||||
from onefuzztypes.enums import ErrorCode, NodeState, NodeTaskState, TaskState
|
||||
from onefuzztypes.models import (
|
||||
Error,
|
||||
NodeDoneEventData,
|
||||
NodeEvent,
|
||||
NodeEventEnvelope,
|
||||
NodeSettingUpEventData,
|
||||
NodeStateUpdate,
|
||||
WorkerEvent,
|
||||
)
|
||||
@ -54,10 +57,17 @@ def on_state_update(
|
||||
node.save()
|
||||
|
||||
if state == NodeState.setting_up:
|
||||
# Model-validated.
|
||||
#
|
||||
# This field will be required in the future.
|
||||
# For now, it is optional for back compat.
|
||||
if state_update.data:
|
||||
for task_id in state_update.data.tasks:
|
||||
setting_up_data = cast(
|
||||
Optional[NodeSettingUpEventData],
|
||||
state_update.data,
|
||||
)
|
||||
|
||||
if setting_up_data:
|
||||
for task_id in setting_up_data.tasks:
|
||||
task = get_task_checked(task_id)
|
||||
|
||||
# The task state may be `running` if it has `vm_count` > 1, and
|
||||
@ -82,6 +92,20 @@ def on_state_update(
|
||||
state=NodeTaskState.setting_up,
|
||||
)
|
||||
node_task.save()
|
||||
elif state == NodeState.done:
|
||||
# Model-validated.
|
||||
#
|
||||
# This field will be required in the future.
|
||||
# For now, it is optional for back compat.
|
||||
done_data = cast(Optional[NodeDoneEventData], state_update.data)
|
||||
|
||||
if done_data:
|
||||
if done_data.error:
|
||||
logging.error(
|
||||
"node `done` with error: machine_id = %s, data = %s",
|
||||
machine_id,
|
||||
done_data,
|
||||
)
|
||||
else:
|
||||
logging.info("ignoring state updates from the node: %s: %s", machine_id, state)
|
||||
|
||||
@ -140,6 +164,7 @@ def on_worker_event(machine_id: UUID, event: WorkerEvent) -> func.HttpResponse:
|
||||
|
||||
task.save()
|
||||
node.save()
|
||||
|
||||
task_event = TaskEvent(task_id=task_id, machine_id=machine_id, event_data=event)
|
||||
task_event.save()
|
||||
return ok(BoolResult(result=True))
|
||||
|
@ -484,6 +484,12 @@ class ExitStatus(BaseModel):
|
||||
success: bool
|
||||
|
||||
|
||||
class ProcessOutput(BaseModel):
|
||||
exit_status: ExitStatus
|
||||
stderr: str
|
||||
stdout: str
|
||||
|
||||
|
||||
class WorkerRunningEvent(BaseModel):
|
||||
task_id: UUID
|
||||
|
||||
@ -500,28 +506,43 @@ class WorkerEvent(EnumModel):
|
||||
running: Optional[WorkerRunningEvent]
|
||||
|
||||
|
||||
class SettingUpEventData(BaseModel):
|
||||
class NodeSettingUpEventData(BaseModel):
|
||||
tasks: List[UUID]
|
||||
|
||||
|
||||
class NodeDoneEventData(BaseModel):
|
||||
error: Optional[str]
|
||||
script_output: Optional[ProcessOutput]
|
||||
|
||||
|
||||
NodeStateData = Union[NodeSettingUpEventData, NodeDoneEventData]
|
||||
|
||||
|
||||
class NodeStateUpdate(BaseModel):
|
||||
state: NodeState
|
||||
data: Optional[SettingUpEventData]
|
||||
data: Optional[NodeStateData]
|
||||
|
||||
@root_validator(pre=False, skip_on_failure=True)
|
||||
def check_data(cls, values: Any) -> Any:
|
||||
data = values.get("data")
|
||||
|
||||
@validator("data")
|
||||
def check_data(
|
||||
cls,
|
||||
data: Optional[SettingUpEventData],
|
||||
values: Any,
|
||||
) -> Optional[SettingUpEventData]:
|
||||
if data:
|
||||
state = values.get("state")
|
||||
if state and state != NodeState.setting_up:
|
||||
raise ValueError(
|
||||
"data for node state update event does not match state = %s" % state
|
||||
)
|
||||
state = values["state"]
|
||||
|
||||
return data
|
||||
if state == NodeState.setting_up:
|
||||
if isinstance(data, NodeSettingUpEventData):
|
||||
return values
|
||||
|
||||
if state == NodeState.done:
|
||||
if isinstance(data, NodeDoneEventData):
|
||||
return values
|
||||
|
||||
raise ValueError(
|
||||
"data for node state update event does not match state = %s" % state
|
||||
)
|
||||
else:
|
||||
# For now, `data` is always optional.
|
||||
return values
|
||||
|
||||
|
||||
class NodeEvent(EnumModel):
|
||||
|
Loading…
x
Reference in New Issue
Block a user