mirror of
https://github.com/microsoft/onefuzz.git
synced 2025-06-16 11:58:09 +00:00
fix deleting nodes with expired heartbeats (#930)
This commit is contained in:
@ -287,8 +287,8 @@ class Node(BASE_NODE, ORMMixin):
|
||||
return False
|
||||
|
||||
if self.could_shrink_scaleset():
|
||||
self.set_halt()
|
||||
logging.info("node scheduled to shrink. machine_id:%s", self.machine_id)
|
||||
self.set_halt()
|
||||
return False
|
||||
|
||||
if self.scaleset_id:
|
||||
@ -384,8 +384,9 @@ class Node(BASE_NODE, ORMMixin):
|
||||
|
||||
def set_halt(self) -> None:
|
||||
"""Tell the node to stop everything."""
|
||||
self.set_shutdown()
|
||||
self.stop()
|
||||
logging.info("setting halt: %s", self.machine_id)
|
||||
self.delete_requested = True
|
||||
self.stop(done=True)
|
||||
self.set_state(NodeState.halt)
|
||||
|
||||
@classmethod
|
||||
|
@ -374,9 +374,17 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
||||
to_reimage.append(node)
|
||||
|
||||
dead_nodes = Node.get_dead_nodes(self.scaleset_id, NODE_EXPIRATION_TIME)
|
||||
for node in dead_nodes:
|
||||
node.set_halt()
|
||||
to_reimage.append(node)
|
||||
if dead_nodes:
|
||||
logging.info(
|
||||
SCALESET_LOG_PREFIX
|
||||
+ "reimaging nodes with expired heartbeats. "
|
||||
+ "scaleset_id:%s nodes:%s",
|
||||
self.scaleset_id,
|
||||
",".join(str(x.machine_id) for x in dead_nodes),
|
||||
)
|
||||
for node in dead_nodes:
|
||||
if node not in to_reimage:
|
||||
to_reimage.append(node)
|
||||
|
||||
# Perform operations until they fail due to scaleset getting locked
|
||||
try:
|
||||
|
Reference in New Issue
Block a user