mirror of
https://github.com/microsoft/onefuzz.git
synced 2025-06-17 04:18:07 +00:00
clarify scaleset logging (#568)
This commit is contained in:
@ -51,6 +51,7 @@ from .nodes import Node
|
|||||||
|
|
||||||
NODE_EXPIRATION_TIME: datetime.timedelta = datetime.timedelta(hours=1)
|
NODE_EXPIRATION_TIME: datetime.timedelta = datetime.timedelta(hours=1)
|
||||||
NODE_REIMAGE_TIME: datetime.timedelta = datetime.timedelta(days=7)
|
NODE_REIMAGE_TIME: datetime.timedelta = datetime.timedelta(days=7)
|
||||||
|
SCALESET_LOG_PREFIX = "scalesets: "
|
||||||
|
|
||||||
# Future work:
|
# Future work:
|
||||||
#
|
#
|
||||||
@ -150,7 +151,7 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
def init(self) -> None:
|
def init(self) -> None:
|
||||||
from .pools import Pool
|
from .pools import Pool
|
||||||
|
|
||||||
logging.info("scaleset init: %s", self.scaleset_id)
|
logging.info(SCALESET_LOG_PREFIX + "init. scaleset_id:%s", self.scaleset_id)
|
||||||
|
|
||||||
ScalesetShrinkQueue(self.scaleset_id).create()
|
ScalesetShrinkQueue(self.scaleset_id).create()
|
||||||
|
|
||||||
@ -163,7 +164,9 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
|
|
||||||
if pool.state == PoolState.init:
|
if pool.state == PoolState.init:
|
||||||
logging.info(
|
logging.info(
|
||||||
"scaleset waiting for pool: %s - %s", self.pool_name, self.scaleset_id
|
SCALESET_LOG_PREFIX + "waiting for pool. pool_name:%s scaleset_id:%s",
|
||||||
|
self.pool_name,
|
||||||
|
self.scaleset_id,
|
||||||
)
|
)
|
||||||
elif pool.state == PoolState.running:
|
elif pool.state == PoolState.running:
|
||||||
image_os = get_os(self.region, self.image)
|
image_os = get_os(self.region, self.image)
|
||||||
@ -190,12 +193,16 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
|
|
||||||
# TODO: How do we pass in SSH configs for Windows? Previously
|
# TODO: How do we pass in SSH configs for Windows? Previously
|
||||||
# This was done as part of the generated per-task setup script.
|
# This was done as part of the generated per-task setup script.
|
||||||
logging.info("scaleset setup: %s", self.scaleset_id)
|
logging.info(SCALESET_LOG_PREFIX + "setup. scalset_id:%s", self.scaleset_id)
|
||||||
|
|
||||||
network = Network(self.region)
|
network = Network(self.region)
|
||||||
network_id = network.get_id()
|
network_id = network.get_id()
|
||||||
if not network_id:
|
if not network_id:
|
||||||
logging.info("creating network: %s", self.region)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "creating network. region:%s scaleset_id:%s",
|
||||||
|
self.region,
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
result = network.create()
|
result = network.create()
|
||||||
if isinstance(result, Error):
|
if isinstance(result, Error):
|
||||||
self.set_failed(result)
|
self.set_failed(result)
|
||||||
@ -217,7 +224,10 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
self.set_failed(pool)
|
self.set_failed(pool)
|
||||||
return
|
return
|
||||||
|
|
||||||
logging.info("creating scaleset: %s", self.scaleset_id)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "creating scaleset. scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
extensions = fuzz_extensions(pool, self)
|
extensions = fuzz_extensions(pool, self)
|
||||||
result = create_vmss(
|
result = create_vmss(
|
||||||
self.region,
|
self.region,
|
||||||
@ -236,12 +246,21 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
self.set_failed(result)
|
self.set_failed(result)
|
||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
logging.info("creating scaleset: %s", self.scaleset_id)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "creating scaleset scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
elif vmss.provisioning_state == "Creating":
|
elif vmss.provisioning_state == "Creating":
|
||||||
logging.info("Waiting on scaleset creation: %s", self.scaleset_id)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "Waiting on scaleset creation scalset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
self.try_set_identity(vmss)
|
self.try_set_identity(vmss)
|
||||||
else:
|
else:
|
||||||
logging.info("scaleset running: %s", self.scaleset_id)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "scaleset running scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
identity_result = self.try_set_identity(vmss)
|
identity_result = self.try_set_identity(vmss)
|
||||||
if identity_result:
|
if identity_result:
|
||||||
self.set_failed(identity_result)
|
self.set_failed(identity_result)
|
||||||
@ -279,7 +298,10 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
# result = 'did I modify the scaleset in azure'
|
# result = 'did I modify the scaleset in azure'
|
||||||
def cleanup_nodes(self) -> bool:
|
def cleanup_nodes(self) -> bool:
|
||||||
if self.state == ScalesetState.halt:
|
if self.state == ScalesetState.halt:
|
||||||
logging.info("halting scaleset: %s", self.scaleset_id)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "halting scaleset scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
self.halt()
|
self.halt()
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -297,7 +319,10 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
for node in nodes:
|
for node in nodes:
|
||||||
if node.machine_id not in azure_nodes:
|
if node.machine_id not in azure_nodes:
|
||||||
logging.info(
|
logging.info(
|
||||||
"no longer in scaleset: %s:%s", self.scaleset_id, node.machine_id
|
SCALESET_LOG_PREFIX
|
||||||
|
+ "no longer in scaleset. scaleset_id:%s machine_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
node.machine_id,
|
||||||
)
|
)
|
||||||
node.delete()
|
node.delete()
|
||||||
|
|
||||||
@ -315,7 +340,8 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
logging.info(
|
logging.info(
|
||||||
"scaleset - adding missing azure node: %s:%s",
|
SCALESET_LOG_PREFIX
|
||||||
|
+ "adding missing azure node. scaleset_id:%s machine_id:%s",
|
||||||
self.scaleset_id,
|
self.scaleset_id,
|
||||||
machine_id,
|
machine_id,
|
||||||
)
|
)
|
||||||
@ -355,16 +381,27 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
try:
|
try:
|
||||||
if to_delete:
|
if to_delete:
|
||||||
logging.info(
|
logging.info(
|
||||||
"deleting nodes: %s - count: %d", self.scaleset_id, len(to_delete)
|
SCALESET_LOG_PREFIX + "deleting nodes. scaleset_id:%s count:%d",
|
||||||
|
self.scaleset_id,
|
||||||
|
len(to_delete),
|
||||||
)
|
)
|
||||||
self.delete_nodes(to_delete)
|
self.delete_nodes(to_delete)
|
||||||
for node in to_delete:
|
for node in to_delete:
|
||||||
node.set_halt()
|
node.set_halt()
|
||||||
|
|
||||||
if to_reimage:
|
if to_reimage:
|
||||||
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "reimaging nodes: scaleset_id:%s count:%d",
|
||||||
|
self.scaleset_id,
|
||||||
|
len(to_reimage),
|
||||||
|
)
|
||||||
self.reimage_nodes(to_reimage)
|
self.reimage_nodes(to_reimage)
|
||||||
except UnableToUpdate:
|
except UnableToUpdate:
|
||||||
logging.info("scaleset update already in progress: %s", self.scaleset_id)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX
|
||||||
|
+ "scaleset update already in progress: scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
|
|
||||||
return bool(to_reimage) or bool(to_delete)
|
return bool(to_reimage) or bool(to_delete)
|
||||||
|
|
||||||
@ -373,14 +410,15 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
# This ensures that our idea of scaleset size agrees with Azure
|
# This ensures that our idea of scaleset size agrees with Azure
|
||||||
node_count = len(Node.search_states(scaleset_id=self.scaleset_id))
|
node_count = len(Node.search_states(scaleset_id=self.scaleset_id))
|
||||||
if node_count == self.size:
|
if node_count == self.size:
|
||||||
logging.info("resize finished: %s", self.scaleset_id)
|
logging.info(SCALESET_LOG_PREFIX + "resize finished: %s", self.scaleset_id)
|
||||||
self.state = ScalesetState.running
|
self.state = ScalesetState.running
|
||||||
self.save()
|
self.save()
|
||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
logging.info(
|
logging.info(
|
||||||
"resize is finished, waiting for nodes to check in: "
|
SCALESET_LOG_PREFIX
|
||||||
"%s (%d of %d nodes checked in)",
|
+ "resize is finished, waiting for nodes to check in: "
|
||||||
|
"scaleset_id:%s (%d of %d nodes checked in)",
|
||||||
self.scaleset_id,
|
self.scaleset_id,
|
||||||
node_count,
|
node_count,
|
||||||
self.size,
|
self.size,
|
||||||
@ -391,7 +429,11 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
try:
|
try:
|
||||||
resize_vmss(self.scaleset_id, self.size)
|
resize_vmss(self.scaleset_id, self.size)
|
||||||
except UnableToUpdate:
|
except UnableToUpdate:
|
||||||
logging.info("scaleset is mid-operation already")
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX
|
||||||
|
+ "scaleset is mid-operation already scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
def _resize_shrink(self, to_remove: int) -> None:
|
def _resize_shrink(self, to_remove: int) -> None:
|
||||||
@ -404,7 +446,11 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
if self.state != ScalesetState.resize:
|
if self.state != ScalesetState.resize:
|
||||||
return
|
return
|
||||||
|
|
||||||
logging.info("scaleset resize: %s - %s", self.scaleset_id, self.size)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "scaleset resize: scaleset_id:%s size:%d",
|
||||||
|
self.scaleset_id,
|
||||||
|
self.size,
|
||||||
|
)
|
||||||
|
|
||||||
# reset the node delete queue
|
# reset the node delete queue
|
||||||
ScalesetShrinkQueue(self.scaleset_id).clear()
|
ScalesetShrinkQueue(self.scaleset_id).clear()
|
||||||
@ -415,7 +461,10 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
# Treat Azure knowledge of the size of the scaleset as "ground truth"
|
# Treat Azure knowledge of the size of the scaleset as "ground truth"
|
||||||
size = get_vmss_size(self.scaleset_id)
|
size = get_vmss_size(self.scaleset_id)
|
||||||
if size is None:
|
if size is None:
|
||||||
logging.info("scaleset is unavailable: %s", self.scaleset_id)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "scaleset is unavailable. scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
if size == self.size:
|
if size == self.size:
|
||||||
@ -427,45 +476,71 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
|
|
||||||
def delete_nodes(self, nodes: List[Node]) -> None:
|
def delete_nodes(self, nodes: List[Node]) -> None:
|
||||||
if not nodes:
|
if not nodes:
|
||||||
logging.debug("no nodes to delete")
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "no nodes to delete. scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
if self.state == ScalesetState.halt:
|
if self.state == ScalesetState.halt:
|
||||||
logging.debug("scaleset delete will delete node: %s", self.scaleset_id)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX
|
||||||
|
+ "scaleset halting, ignoring node deletion: scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
machine_ids = []
|
machine_ids = []
|
||||||
for node in nodes:
|
for node in nodes:
|
||||||
if node.debug_keep_node:
|
if node.debug_keep_node:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
"delete manually overridden %s:%s",
|
SCALESET_LOG_PREFIX + "not deleting manually overridden node. "
|
||||||
|
"scaleset_id:%s machine_id:%s",
|
||||||
self.scaleset_id,
|
self.scaleset_id,
|
||||||
node.machine_id,
|
node.machine_id,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
machine_ids.append(node.machine_id)
|
machine_ids.append(node.machine_id)
|
||||||
|
|
||||||
logging.info("deleting %s:%s", self.scaleset_id, machine_ids)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "deleting scaleset_id:%s machine_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
machine_ids,
|
||||||
|
)
|
||||||
delete_vmss_nodes(self.scaleset_id, machine_ids)
|
delete_vmss_nodes(self.scaleset_id, machine_ids)
|
||||||
|
|
||||||
def reimage_nodes(self, nodes: List[Node]) -> None:
|
def reimage_nodes(self, nodes: List[Node]) -> None:
|
||||||
if not nodes:
|
if not nodes:
|
||||||
logging.debug("no nodes to reimage")
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "no nodes to reimage: scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
if self.state == ScalesetState.shutdown:
|
if self.state == ScalesetState.shutdown:
|
||||||
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX
|
||||||
|
+ "scaleset shutting down, deleting rather than reimaging nodes. "
|
||||||
|
+ "scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
self.delete_nodes(nodes)
|
self.delete_nodes(nodes)
|
||||||
return
|
return
|
||||||
|
|
||||||
if self.state == ScalesetState.halt:
|
if self.state == ScalesetState.halt:
|
||||||
logging.debug("scaleset delete will delete node: %s", self.scaleset_id)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX
|
||||||
|
+ "scaleset halting, ignoring node reimage: scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
machine_ids = []
|
machine_ids = []
|
||||||
for node in nodes:
|
for node in nodes:
|
||||||
if node.debug_keep_node:
|
if node.debug_keep_node:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
"reimage manually overridden %s:%s",
|
SCALESET_LOG_PREFIX + "not reimaging manually overridden node. "
|
||||||
|
"scaleset_id:%s machine_id:%s",
|
||||||
self.scaleset_id,
|
self.scaleset_id,
|
||||||
node.machine_id,
|
node.machine_id,
|
||||||
)
|
)
|
||||||
@ -495,7 +570,11 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
|
|
||||||
def shutdown(self) -> None:
|
def shutdown(self) -> None:
|
||||||
size = get_vmss_size(self.scaleset_id)
|
size = get_vmss_size(self.scaleset_id)
|
||||||
logging.info("scaleset shutdown: %s (current size: %s)", self.scaleset_id, size)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "scaleset shutdown: scaleset_id:%s size:%d",
|
||||||
|
self.scaleset_id,
|
||||||
|
size,
|
||||||
|
)
|
||||||
nodes = Node.search_states(scaleset_id=self.scaleset_id)
|
nodes = Node.search_states(scaleset_id=self.scaleset_id)
|
||||||
for node in nodes:
|
for node in nodes:
|
||||||
node.set_shutdown()
|
node.set_shutdown()
|
||||||
@ -506,12 +585,22 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
ScalesetShrinkQueue(self.scaleset_id).delete()
|
ScalesetShrinkQueue(self.scaleset_id).delete()
|
||||||
|
|
||||||
for node in Node.search_states(scaleset_id=self.scaleset_id):
|
for node in Node.search_states(scaleset_id=self.scaleset_id):
|
||||||
logging.info("deleting node %s:%s", self.scaleset_id, node.machine_id)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "deleting node scaleset_id:%s machine_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
node.machine_id,
|
||||||
|
)
|
||||||
node.delete()
|
node.delete()
|
||||||
|
|
||||||
logging.info("scaleset delete starting: %s", self.scaleset_id)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "scaleset delete starting: scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
if delete_vmss(self.scaleset_id):
|
if delete_vmss(self.scaleset_id):
|
||||||
logging.info("scaleset deleted: %s", self.scaleset_id)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "scaleset deleted: scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
self.delete()
|
self.delete()
|
||||||
else:
|
else:
|
||||||
self.save()
|
self.save()
|
||||||
@ -569,23 +658,32 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
|
|
||||||
if self.state == ScalesetState.halt:
|
if self.state == ScalesetState.halt:
|
||||||
logging.info(
|
logging.info(
|
||||||
"not updating configs, scaleset is set to be deleted: %s",
|
SCALESET_LOG_PREFIX
|
||||||
|
+ "not updating configs, scaleset is set to be deleted. "
|
||||||
|
"scaleset_id:%s",
|
||||||
self.scaleset_id,
|
self.scaleset_id,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
if not self.needs_config_update:
|
if not self.needs_config_update:
|
||||||
logging.debug("config update not needed: %s", self.scaleset_id)
|
logging.debug(
|
||||||
|
SCALESET_LOG_PREFIX + "config update not needed. scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
logging.info("updating scaleset configs: %s", self.scaleset_id)
|
logging.info(
|
||||||
|
SCALESET_LOG_PREFIX + "updating scaleset configs. scaleset_id:%s",
|
||||||
|
self.scaleset_id,
|
||||||
|
)
|
||||||
|
|
||||||
pool = Pool.get_by_name(self.pool_name)
|
pool = Pool.get_by_name(self.pool_name)
|
||||||
if isinstance(pool, Error):
|
if isinstance(pool, Error):
|
||||||
logging.error(
|
logging.error(
|
||||||
"unable to find pool during config update: %s - %s",
|
SCALESET_LOG_PREFIX
|
||||||
self.scaleset_id,
|
+ "unable to find pool during config update. pool:%s scaleset_id:%s",
|
||||||
pool,
|
pool,
|
||||||
|
self.scaleset_id,
|
||||||
)
|
)
|
||||||
self.set_failed(pool)
|
self.set_failed(pool)
|
||||||
return
|
return
|
||||||
@ -596,8 +694,10 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
|||||||
self.needs_config_update = False
|
self.needs_config_update = False
|
||||||
self.save()
|
self.save()
|
||||||
except UnableToUpdate:
|
except UnableToUpdate:
|
||||||
logging.debug(
|
logging.info(
|
||||||
"unable to update configs, update already in progress: %s",
|
SCALESET_LOG_PREFIX
|
||||||
|
+ "unable to update configs, update already in progress. "
|
||||||
|
"scaleset_id:%s",
|
||||||
self.scaleset_id,
|
self.scaleset_id,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user