clarify scaleset logging (#568)

Author: bmc-msft
Date: 2021-02-19 14:36:16 -05:00
Committed by: GitHub
parent 4992b494f1
commit 8ce4638b8a

@@ -51,6 +51,7 @@ from .nodes import Node
NODE_EXPIRATION_TIME: datetime.timedelta = datetime.timedelta(hours=1)
NODE_REIMAGE_TIME: datetime.timedelta = datetime.timedelta(days=7)
SCALESET_LOG_PREFIX = "scalesets: "
# Future work:
#
@@ -150,7 +151,7 @@ class Scaleset(BASE_SCALESET, ORMMixin):
def init(self) -> None:
from .pools import Pool
logging.info("scaleset init: %s", self.scaleset_id)
logging.info(SCALESET_LOG_PREFIX + "init. scaleset_id:%s", self.scaleset_id)
ScalesetShrinkQueue(self.scaleset_id).create()
@@ -163,7 +164,9 @@ class Scaleset(BASE_SCALESET, ORMMixin):
if pool.state == PoolState.init:
logging.info(
"scaleset waiting for pool: %s - %s", self.pool_name, self.scaleset_id
SCALESET_LOG_PREFIX + "waiting for pool. pool_name:%s scaleset_id:%s",
self.pool_name,
self.scaleset_id,
)
elif pool.state == PoolState.running:
image_os = get_os(self.region, self.image)
@@ -190,12 +193,16 @@ class Scaleset(BASE_SCALESET, ORMMixin):
# TODO: How do we pass in SSH configs for Windows? Previously
# This was done as part of the generated per-task setup script.
logging.info("scaleset setup: %s", self.scaleset_id)
logging.info(SCALESET_LOG_PREFIX + "setup. scalset_id:%s", self.scaleset_id)
network = Network(self.region)
network_id = network.get_id()
if not network_id:
logging.info("creating network: %s", self.region)
logging.info(
SCALESET_LOG_PREFIX + "creating network. region:%s scaleset_id:%s",
self.region,
self.scaleset_id,
)
result = network.create()
if isinstance(result, Error):
self.set_failed(result)
@@ -217,7 +224,10 @@ class Scaleset(BASE_SCALESET, ORMMixin):
self.set_failed(pool)
return
logging.info("creating scaleset: %s", self.scaleset_id)
logging.info(
SCALESET_LOG_PREFIX + "creating scaleset. scaleset_id:%s",
self.scaleset_id,
)
extensions = fuzz_extensions(pool, self)
result = create_vmss(
self.region,
@@ -236,12 +246,21 @@ class Scaleset(BASE_SCALESET, ORMMixin):
self.set_failed(result)
return
else:
logging.info("creating scaleset: %s", self.scaleset_id)
logging.info(
SCALESET_LOG_PREFIX + "creating scaleset scaleset_id:%s",
self.scaleset_id,
)
elif vmss.provisioning_state == "Creating":
logging.info("Waiting on scaleset creation: %s", self.scaleset_id)
logging.info(
SCALESET_LOG_PREFIX + "Waiting on scaleset creation scalset_id:%s",
self.scaleset_id,
)
self.try_set_identity(vmss)
else:
logging.info("scaleset running: %s", self.scaleset_id)
logging.info(
SCALESET_LOG_PREFIX + "scaleset running scaleset_id:%s",
self.scaleset_id,
)
identity_result = self.try_set_identity(vmss)
if identity_result:
self.set_failed(identity_result)
@@ -279,7 +298,10 @@ class Scaleset(BASE_SCALESET, ORMMixin):
# result = 'did I modify the scaleset in azure'
def cleanup_nodes(self) -> bool:
if self.state == ScalesetState.halt:
logging.info("halting scaleset: %s", self.scaleset_id)
logging.info(
SCALESET_LOG_PREFIX + "halting scaleset scaleset_id:%s",
self.scaleset_id,
)
self.halt()
return True
@@ -297,7 +319,10 @@ class Scaleset(BASE_SCALESET, ORMMixin):
for node in nodes:
if node.machine_id not in azure_nodes:
logging.info(
"no longer in scaleset: %s:%s", self.scaleset_id, node.machine_id
SCALESET_LOG_PREFIX
+ "no longer in scaleset. scaleset_id:%s machine_id:%s",
self.scaleset_id,
node.machine_id,
)
node.delete()
@@ -315,7 +340,8 @@ class Scaleset(BASE_SCALESET, ORMMixin):
continue
logging.info(
"scaleset - adding missing azure node: %s:%s",
SCALESET_LOG_PREFIX
+ "adding missing azure node. scaleset_id:%s machine_id:%s",
self.scaleset_id,
machine_id,
)
@@ -355,16 +381,27 @@ class Scaleset(BASE_SCALESET, ORMMixin):
try:
if to_delete:
logging.info(
"deleting nodes: %s - count: %d", self.scaleset_id, len(to_delete)
SCALESET_LOG_PREFIX + "deleting nodes. scaleset_id:%s count:%d",
self.scaleset_id,
len(to_delete),
)
self.delete_nodes(to_delete)
for node in to_delete:
node.set_halt()
if to_reimage:
logging.info(
SCALESET_LOG_PREFIX + "reimaging nodes: scaleset_id:%s count:%d",
self.scaleset_id,
len(to_reimage),
)
self.reimage_nodes(to_reimage)
except UnableToUpdate:
logging.info("scaleset update already in progress: %s", self.scaleset_id)
logging.info(
SCALESET_LOG_PREFIX
+ "scaleset update already in progress: scaleset_id:%s",
self.scaleset_id,
)
return bool(to_reimage) or bool(to_delete)
@@ -373,14 +410,15 @@ class Scaleset(BASE_SCALESET, ORMMixin):
# This ensures that our idea of scaleset size agrees with Azure
node_count = len(Node.search_states(scaleset_id=self.scaleset_id))
if node_count == self.size:
logging.info("resize finished: %s", self.scaleset_id)
logging.info(SCALESET_LOG_PREFIX + "resize finished: %s", self.scaleset_id)
self.state = ScalesetState.running
self.save()
return
else:
logging.info(
"resize is finished, waiting for nodes to check in: "
"%s (%d of %d nodes checked in)",
SCALESET_LOG_PREFIX
+ "resize is finished, waiting for nodes to check in: "
"scaleset_id:%s (%d of %d nodes checked in)",
self.scaleset_id,
node_count,
self.size,
@@ -391,7 +429,11 @@ class Scaleset(BASE_SCALESET, ORMMixin):
try:
resize_vmss(self.scaleset_id, self.size)
except UnableToUpdate:
logging.info("scaleset is mid-operation already")
logging.info(
SCALESET_LOG_PREFIX
+ "scaleset is mid-operation already scaleset_id:%s",
self.scaleset_id,
)
return
def _resize_shrink(self, to_remove: int) -> None:
@@ -404,7 +446,11 @@ class Scaleset(BASE_SCALESET, ORMMixin):
if self.state != ScalesetState.resize:
return
logging.info("scaleset resize: %s - %s", self.scaleset_id, self.size)
logging.info(
SCALESET_LOG_PREFIX + "scaleset resize: scaleset_id:%s size:%d",
self.scaleset_id,
self.size,
)
# reset the node delete queue
ScalesetShrinkQueue(self.scaleset_id).clear()
@@ -415,7 +461,10 @@ class Scaleset(BASE_SCALESET, ORMMixin):
# Treat Azure knowledge of the size of the scaleset as "ground truth"
size = get_vmss_size(self.scaleset_id)
if size is None:
logging.info("scaleset is unavailable: %s", self.scaleset_id)
logging.info(
SCALESET_LOG_PREFIX + "scaleset is unavailable. scaleset_id:%s",
self.scaleset_id,
)
return
if size == self.size:
@@ -427,45 +476,71 @@ class Scaleset(BASE_SCALESET, ORMMixin):
def delete_nodes(self, nodes: List[Node]) -> None:
if not nodes:
logging.debug("no nodes to delete")
logging.info(
SCALESET_LOG_PREFIX + "no nodes to delete. scaleset_id:%s",
self.scaleset_id,
)
return
if self.state == ScalesetState.halt:
logging.debug("scaleset delete will delete node: %s", self.scaleset_id)
logging.info(
SCALESET_LOG_PREFIX
+ "scaleset halting, ignoring node deletion: scaleset_id:%s",
self.scaleset_id,
)
return
machine_ids = []
for node in nodes:
if node.debug_keep_node:
logging.warning(
"delete manually overridden %s:%s",
SCALESET_LOG_PREFIX + "not deleting manually overridden node. "
"scaleset_id:%s machine_id:%s",
self.scaleset_id,
node.machine_id,
)
else:
machine_ids.append(node.machine_id)
logging.info("deleting %s:%s", self.scaleset_id, machine_ids)
logging.info(
SCALESET_LOG_PREFIX + "deleting scaleset_id:%s machine_id:%s",
self.scaleset_id,
machine_ids,
)
delete_vmss_nodes(self.scaleset_id, machine_ids)
def reimage_nodes(self, nodes: List[Node]) -> None:
if not nodes:
logging.debug("no nodes to reimage")
logging.info(
SCALESET_LOG_PREFIX + "no nodes to reimage: scaleset_id:%s",
self.scaleset_id,
)
return
if self.state == ScalesetState.shutdown:
logging.info(
SCALESET_LOG_PREFIX
+ "scaleset shutting down, deleting rather than reimaging nodes. "
+ "scaleset_id:%s",
self.scaleset_id,
)
self.delete_nodes(nodes)
return
if self.state == ScalesetState.halt:
logging.debug("scaleset delete will delete node: %s", self.scaleset_id)
logging.info(
SCALESET_LOG_PREFIX
+ "scaleset halting, ignoring node reimage: scaleset_id:%s",
self.scaleset_id,
)
return
machine_ids = []
for node in nodes:
if node.debug_keep_node:
logging.warning(
"reimage manually overridden %s:%s",
SCALESET_LOG_PREFIX + "not reimaging manually overridden node. "
"scaleset_id:%s machine_id:%s",
self.scaleset_id,
node.machine_id,
)
@@ -495,7 +570,11 @@ class Scaleset(BASE_SCALESET, ORMMixin):
def shutdown(self) -> None:
size = get_vmss_size(self.scaleset_id)
logging.info("scaleset shutdown: %s (current size: %s)", self.scaleset_id, size)
logging.info(
SCALESET_LOG_PREFIX + "scaleset shutdown: scaleset_id:%s size:%d",
self.scaleset_id,
size,
)
nodes = Node.search_states(scaleset_id=self.scaleset_id)
for node in nodes:
node.set_shutdown()
@@ -506,12 +585,22 @@ class Scaleset(BASE_SCALESET, ORMMixin):
ScalesetShrinkQueue(self.scaleset_id).delete()
for node in Node.search_states(scaleset_id=self.scaleset_id):
logging.info("deleting node %s:%s", self.scaleset_id, node.machine_id)
logging.info(
SCALESET_LOG_PREFIX + "deleting node scaleset_id:%s machine_id:%s",
self.scaleset_id,
node.machine_id,
)
node.delete()
logging.info("scaleset delete starting: %s", self.scaleset_id)
logging.info(
SCALESET_LOG_PREFIX + "scaleset delete starting: scaleset_id:%s",
self.scaleset_id,
)
if delete_vmss(self.scaleset_id):
logging.info("scaleset deleted: %s", self.scaleset_id)
logging.info(
SCALESET_LOG_PREFIX + "scaleset deleted: scaleset_id:%s",
self.scaleset_id,
)
self.delete()
else:
self.save()
@@ -569,23 +658,32 @@ class Scaleset(BASE_SCALESET, ORMMixin):
if self.state == ScalesetState.halt:
logging.info(
"not updating configs, scaleset is set to be deleted: %s",
SCALESET_LOG_PREFIX
+ "not updating configs, scaleset is set to be deleted. "
"scaleset_id:%s",
self.scaleset_id,
)
return
if not self.needs_config_update:
logging.debug("config update not needed: %s", self.scaleset_id)
logging.debug(
SCALESET_LOG_PREFIX + "config update not needed. scaleset_id:%s",
self.scaleset_id,
)
return
logging.info("updating scaleset configs: %s", self.scaleset_id)
logging.info(
SCALESET_LOG_PREFIX + "updating scaleset configs. scaleset_id:%s",
self.scaleset_id,
)
pool = Pool.get_by_name(self.pool_name)
if isinstance(pool, Error):
logging.error(
"unable to find pool during config update: %s - %s",
self.scaleset_id,
SCALESET_LOG_PREFIX
+ "unable to find pool during config update. pool:%s scaleset_id:%s",
pool,
self.scaleset_id,
)
self.set_failed(pool)
return
@@ -596,8 +694,10 @@ class Scaleset(BASE_SCALESET, ORMMixin):
self.needs_config_update = False
self.save()
except UnableToUpdate:
logging.debug(
"unable to update configs, update already in progress: %s",
logging.info(
SCALESET_LOG_PREFIX
+ "unable to update configs, update already in progress. "
"scaleset_id:%s",
self.scaleset_id,
)
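
For readers skimming this diff: the change standardizes scaleset log lines on a shared SCALESET_LOG_PREFIX plus space-separated key:value fields (scaleset_id, machine_id, pool_name, count), so the lines can be grepped and parsed consistently. Below is a minimal, self-contained sketch of that convention using only the standard library. The log_scaleset helper is hypothetical and not part of the commit, which calls logging.info directly with the prefix and explicit %s placeholders.

import logging
import uuid

SCALESET_LOG_PREFIX = "scalesets: "


def log_scaleset(message: str, **fields: object) -> None:
    # Hypothetical helper: render fields as space-separated key:value pairs,
    # e.g. "scalesets: init. scaleset_id:<uuid> pool_name:demo".
    suffix = " ".join("%s:%s" % (key, value) for key, value in fields.items())
    line = SCALESET_LOG_PREFIX + message
    if suffix:
        line = line + " " + suffix
    logging.info(line)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    scaleset_id = uuid.uuid4()
    log_scaleset("init.", scaleset_id=scaleset_id)
    log_scaleset("waiting for pool.", pool_name="demo", scaleset_id=scaleset_id)

Unlike this sketch, the commit keeps logging's lazy %-formatting, e.g. logging.info(SCALESET_LOG_PREFIX + "init. scaleset_id:%s", self.scaleset_id), so interpolation only happens when the record is actually emitted.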