diff --git a/src/api-service/__app__/onefuzzlib/workers/nodes.py b/src/api-service/__app__/onefuzzlib/workers/nodes.py index 172d2bcc5..24fdcd4a5 100644 --- a/src/api-service/__app__/onefuzzlib/workers/nodes.py +++ b/src/api-service/__app__/onefuzzlib/workers/nodes.py @@ -287,8 +287,8 @@ class Node(BASE_NODE, ORMMixin): return False if self.could_shrink_scaleset(): - self.set_halt() logging.info("node scheduled to shrink. machine_id:%s", self.machine_id) + self.set_halt() return False if self.scaleset_id: @@ -384,8 +384,9 @@ class Node(BASE_NODE, ORMMixin): def set_halt(self) -> None: """Tell the node to stop everything.""" - self.set_shutdown() - self.stop() + logging.info("setting halt: %s", self.machine_id) + self.delete_requested = True + self.stop(done=True) self.set_state(NodeState.halt) @classmethod diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index df7d4c908..d42e385ee 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -374,9 +374,17 @@ class Scaleset(BASE_SCALESET, ORMMixin): to_reimage.append(node) dead_nodes = Node.get_dead_nodes(self.scaleset_id, NODE_EXPIRATION_TIME) - for node in dead_nodes: - node.set_halt() - to_reimage.append(node) + if dead_nodes: + logging.info( + SCALESET_LOG_PREFIX + + "reimaging nodes with expired heartbeats. " + + "scaleset_id:%s nodes:%s", + self.scaleset_id, + ",".join(str(x.machine_id) for x in dead_nodes), + ) + for node in dead_nodes: + if node not in to_reimage: + to_reimage.append(node) # Perform operations until they fail due to scaleset getting locked try: