only record failures generated prior to stopping (#83)

This commit is contained in:
bmc-msft
2020-10-02 01:31:51 -04:00
committed by GitHub
parent a529f073a8
commit a196716e12
4 changed files with 29 additions and 16 deletions

View File

@ -145,7 +145,8 @@ def on_worker_event(machine_id: UUID, event: WorkerEvent) -> func.HttpResponse:
if not exit_status.success:
logging.error("task failed: status = %s", exit_status)
task.error = Error(
task.mark_failed(
Error(
code=ErrorCode.TASK_FAILED,
errors=[
"task failed. exit_status = %s" % exit_status,
@ -153,6 +154,7 @@ def on_worker_event(machine_id: UUID, event: WorkerEvent) -> func.HttpResponse:
event.done.stderr,
],
)
)
task.state = TaskState.stopping
if node.state not in NodeState.ready_for_reset():

View File

@ -24,7 +24,7 @@ from azure.devops.v6_0.work_item_tracking.work_item_tracking_client import (
WorkItemTrackingClient,
)
from memoization import cached
from onefuzztypes.enums import ErrorCode, TaskState
from onefuzztypes.enums import ErrorCode
from onefuzztypes.models import ADOTemplate, Error, Report
from ..tasks.main import Task
@ -211,9 +211,9 @@ def fail_task(report: Report, error: Exception) -> None:
task = Task.get(report.job_id, report.task_id)
if task:
task.error = Error(code=ErrorCode.NOTIFICATION_FAILURE, errors=[str(error)])
task.state = TaskState.stopping
task.save()
task.mark_failed(
Error(code=ErrorCode.NOTIFICATION_FAILURE, errors=[str(error)])
)
def notify_ado(

View File

@ -740,12 +740,12 @@ class Scaleset(BASE_SCALESET, ORMMixin):
if task.state in [TaskState.stopping, TaskState.stopped]:
continue
task.error = Error(
task.mark_failed(
Error(
code=ErrorCode.TASK_FAILED,
errors=["node reimaged during task execution"],
)
task.state = TaskState.stopping
task.save()
)
entry.delete()
if self.state == ScalesetState.shutdown:

View File

@ -154,6 +154,17 @@ class Task(BASE_TASK, ORMMixin):
task = tasks[0]
return task
def mark_failed(self, error: Error) -> None:
    """Record ``error`` on this task and move it to ``TaskState.stopping``.

    If the task's state is already ``stopping`` or ``stopped``, the error is
    not recorded: a debug message is logged and the task is left untouched.
    Otherwise ``self.error`` and ``self.state`` are updated and
    ``self.save()`` is called.
    """
    already_winding_down = self.state in (TaskState.stopped, TaskState.stopping)
    if not already_winding_down:
        self.error = error
        self.state = TaskState.stopping
        self.save()
        return

    # Failures reported once the task is stopping/stopped are dropped so
    # they do not overwrite whatever was recorded earlier.
    logging.debug(
        "ignoring post-task stop failures for %s:%s", self.job_id, self.task_id
    )
def get_pool(self) -> Optional[Pool]:
if self.config.pool:
pool = Pool.get_by_name(self.config.pool.pool_name)