Report the setup failure in the task when available (#781)

This commit is contained in:
Cheick Keita
2021-04-09 05:57:56 -07:00
committed by GitHub
parent 75f0315eb9
commit 80b3533f83
2 changed files with 21 additions and 14 deletions

View File

@ -125,24 +125,30 @@ def on_state_update(
node_task.save() node_task.save()
elif state == NodeState.done: elif state == NodeState.done:
# if tasks are running on the node when it reports as Done
# those are stopped early
node.mark_tasks_stopped_early()
node.to_reimage(done=True)
# Model-validated. # Model-validated.
# #
# This field will be required in the future. # This field will be required in the future.
# For now, it is optional for back compat. # For now, it is optional for back compat.
done_data = cast(Optional[NodeDoneEventData], state_update.data) done_data = cast(Optional[NodeDoneEventData], state_update.data)
error = None
if done_data: if done_data:
# TODO: do something with this done data
if done_data.error: if done_data.error:
error_text = done_data.json(exclude_none=True)
error = Error(
code=ErrorCode.TASK_FAILED,
errors=[error_text],
)
logging.error( logging.error(
"node 'done' with error: machine_id:%s, data:%s", "node 'done' with error: machine_id:%s, data:%s",
machine_id, machine_id,
done_data, error_text,
) )
# if tasks are running on the node when it reports as Done
# those are stopped early
node.mark_tasks_stopped_early(error=error)
node.to_reimage(done=True)
return None return None

View File

@ -220,18 +220,19 @@ class Node(BASE_NODE, ORMMixin):
self.stop() self.stop()
return True return True
def mark_tasks_stopped_early(self) -> None: def mark_tasks_stopped_early(self, error: Optional[Error] = None) -> None:
from ..tasks.main import Task from ..tasks.main import Task
if error is None:
error = Error(
code=ErrorCode.TASK_FAILED,
errors=["node reimaged during task execution"],
)
for entry in NodeTasks.get_by_machine_id(self.machine_id): for entry in NodeTasks.get_by_machine_id(self.machine_id):
task = Task.get_by_task_id(entry.task_id) task = Task.get_by_task_id(entry.task_id)
if isinstance(task, Task): if isinstance(task, Task):
task.mark_failed( task.mark_failed(error)
Error(
code=ErrorCode.TASK_FAILED,
errors=["node reimaged during task execution"],
)
)
def could_shrink_scaleset(self) -> bool: def could_shrink_scaleset(self) -> bool:
from .scalesets import ScalesetShrinkQueue from .scalesets import ScalesetShrinkQueue