From 2be1edd9dc8ea68d5895e74f8b3be493f1e46ed2 Mon Sep 17 00:00:00 2001 From: bmc-msft <41130664+bmc-msft@users.noreply.github.com> Date: Wed, 9 Jun 2021 14:58:56 -0400 Subject: [PATCH] handle reimaging failures by resetting reimage_queued (#970) In a previous commit, reimage_queued was added to prevent reimaging a node while it is reimaging. However, this means that nodes whose reimaging fails due to Azure issues never finish reimaging. This will reset this flag, allowing the node to reimage in the following cleanup cycle. --- .../__app__/onefuzzlib/workers/scalesets.py | 38 ++++++++++--------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/src/api-service/__app__/onefuzzlib/workers/scalesets.py b/src/api-service/__app__/onefuzzlib/workers/scalesets.py index d42e385ee..dee88bfb4 100644 --- a/src/api-service/__app__/onefuzzlib/workers/scalesets.py +++ b/src/api-service/__app__/onefuzzlib/workers/scalesets.py @@ -298,6 +298,9 @@ class Scaleset(BASE_SCALESET, ORMMixin): # result = 'did I modify the scaleset in azure' def cleanup_nodes(self) -> bool: + logging.info( + SCALESET_LOG_PREFIX + "cleaning up nodes. scaleset_id:%s", self.scaleset_id + ) if self.state == ScalesetState.halt: logging.info( SCALESET_LOG_PREFIX + "halting scaleset scaleset_id:%s", @@ -369,7 +372,12 @@ class Scaleset(BASE_SCALESET, ORMMixin): if ScalesetShrinkQueue(self.scaleset_id).should_shrink(): node.set_halt() to_delete.append(node) - elif not node.reimage_queued: + elif node.reimage_queued: + # reset the reimage_queued flag, in case it's not done + # reimaging during the next cleanup_nodes cycle + node.reimage_queued = False + node.save() + else: # only add nodes that are not already set to reschedule to_reimage.append(node) @@ -388,23 +396,8 @@ class Scaleset(BASE_SCALESET, ORMMixin): # Perform operations until they fail due to scaleset getting locked try: - if to_delete: - logging.info( - SCALESET_LOG_PREFIX + "deleting nodes. 
scaleset_id:%s count:%d", - self.scaleset_id, - len(to_delete), - ) - self.delete_nodes(to_delete) - for node in to_delete: - node.set_halt() - - if to_reimage: - logging.info( - SCALESET_LOG_PREFIX + "reimaging nodes: scaleset_id:%s count:%d", - self.scaleset_id, - len(to_reimage), - ) - self.reimage_nodes(to_reimage) + self.delete_nodes(to_delete) + self.reimage_nodes(to_reimage) except UnableToUpdate: logging.info( SCALESET_LOG_PREFIX @@ -501,6 +494,9 @@ class Scaleset(BASE_SCALESET, ORMMixin): ) return + for node in nodes: + node.set_halt() + if self.state == ScalesetState.halt: logging.info( SCALESET_LOG_PREFIX @@ -566,6 +562,12 @@ class Scaleset(BASE_SCALESET, ORMMixin): else: machine_ids.append(node.machine_id) + if not machine_ids: + logging.info( + SCALESET_LOG_PREFIX + "no nodes to reimage: %s", self.scaleset_id + ) + return + result = reimage_vmss_nodes(self.scaleset_id, machine_ids) if isinstance(result, Error): raise Exception(