fix deleting nodes with expired heartbeats (#930)

This commit is contained in:
bmc-msft
2021-05-26 13:06:44 -04:00
committed by GitHub
parent 269a529f93
commit 8b74d08d3d
2 changed files with 15 additions and 6 deletions

View File

@@ -287,8 +287,8 @@ class Node(BASE_NODE, ORMMixin):
return False return False
if self.could_shrink_scaleset(): if self.could_shrink_scaleset():
self.set_halt()
logging.info("node scheduled to shrink. machine_id:%s", self.machine_id) logging.info("node scheduled to shrink. machine_id:%s", self.machine_id)
self.set_halt()
return False return False
if self.scaleset_id: if self.scaleset_id:
@@ -384,8 +384,9 @@ class Node(BASE_NODE, ORMMixin):
def set_halt(self) -> None: def set_halt(self) -> None:
"""Tell the node to stop everything.""" """Tell the node to stop everything."""
self.set_shutdown() logging.info("setting halt: %s", self.machine_id)
self.stop() self.delete_requested = True
self.stop(done=True)
self.set_state(NodeState.halt) self.set_state(NodeState.halt)
@classmethod @classmethod

View File

@@ -374,9 +374,17 @@ class Scaleset(BASE_SCALESET, ORMMixin):
to_reimage.append(node) to_reimage.append(node)
dead_nodes = Node.get_dead_nodes(self.scaleset_id, NODE_EXPIRATION_TIME) dead_nodes = Node.get_dead_nodes(self.scaleset_id, NODE_EXPIRATION_TIME)
for node in dead_nodes: if dead_nodes:
node.set_halt() logging.info(
to_reimage.append(node) SCALESET_LOG_PREFIX
+ "reimaging nodes with expired heartbeats. "
+ "scaleset_id:%s nodes:%s",
self.scaleset_id,
",".join(str(x.machine_id) for x in dead_nodes),
)
for node in dead_nodes:
if node not in to_reimage:
to_reimage.append(node)
# Perform operations until they fail due to scaleset getting locked # Perform operations until they fail due to scaleset getting locked
try: try: