From 8b74d08d3daee2e75b549913c6ae3af147306ac5 Mon Sep 17 00:00:00 2001 From: bmc-msft <41130664+bmc-msft@users.noreply.github.com> Date: Wed, 26 May 2021 13:06:44 -0400 Subject: [PATCH] fix deleting nodes with expired heartbeats (#930) --- .../__app__/onefuzzlib/workers/nodes.py | 7 ++++--- .../__app__/onefuzzlib/workers/scalesets.py | 14 +++++++++++--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/nodes.py b/src/api-service/__app__/onefuzzlib/workers/nodes.py index 172d2bcc5..24fdcd4a5 100644 --- a/src/api-service/__app__/onefuzzlib/workers/nodes.py +++ b/src/api-service/__app__/onefuzzlib/workers/nodes.py @@ -287,8 +287,8 @@ class Node(BASE_NODE, ORMMixin): return False if self.could_shrink_scaleset(): - self.set_halt() logging.info("node scheduled to shrink. machine_id:%s", self.machine_id) + self.set_halt() return False if self.scaleset_id: @@ -384,8 +384,9 @@ class Node(BASE_NODE, ORMMixin): def set_halt(self) -> None: """Tell the node to stop everything.""" - self.set_shutdown() - self.stop() + logging.info("setting halt: %s", self.machine_id) + self.delete_requested = True + self.stop(done=True) self.set_state(NodeState.halt) @classmethod diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index df7d4c908..d42e385ee 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -374,9 +374,17 @@ class Scaleset(BASE_SCALESET, ORMMixin): to_reimage.append(node) dead_nodes = Node.get_dead_nodes(self.scaleset_id, NODE_EXPIRATION_TIME) - for node in dead_nodes: - node.set_halt() - to_reimage.append(node) + if dead_nodes: + logging.info( + SCALESET_LOG_PREFIX + + "reimaging nodes with expired heartbeats. " + + "scaleset_id:%s nodes:%s", + self.scaleset_id, + ",".join(str(x.machine_id) for x in dead_nodes), + ) + for node in dead_nodes: + if node not in to_reimage: + to_reimage.append(node) # Perform operations until they fail due to scaleset getting locked try: