mirror of
https://github.com/microsoft/onefuzz.git
synced 2025-06-17 12:28:07 +00:00
Delete nodes when they're done (#1763)
* Delete nodes when they're done
* Missed a file
* Load node disposal strategy from env var
* Lint
* Fix subtle bug
* Deleting doesn't work, will 'decomission' nodes once they complete work
* Missed a file
* Remove logging line
This commit is contained in:
@ -156,6 +156,7 @@ def create_auto_scale_profile(
|
||||
# When there's at least 1 message in the pool queue
|
||||
operator=ComparisonOperationType.GREATER_THAN_OR_EQUAL,
|
||||
threshold=1,
|
||||
divide_per_instance=False,
|
||||
),
|
||||
scale_action=ScaleAction(
|
||||
direction=ScaleDirection.INCREASE,
|
||||
@ -170,16 +171,17 @@ def create_auto_scale_profile(
|
||||
metric_trigger=MetricTrigger(
|
||||
metric_name="ApproximateMessageCount",
|
||||
metric_resource_uri=queue_uri,
|
||||
# Check every 20 minutes
|
||||
time_grain=timedelta(minutes=20),
|
||||
# Check every 10 minutes
|
||||
time_grain=timedelta(minutes=10),
|
||||
# The average number of messages in the pool queue
|
||||
time_aggregation=TimeAggregationType.AVERAGE,
|
||||
statistic=MetricStatisticType.SUM,
|
||||
# Over the past 20 minutes
|
||||
time_window=timedelta(minutes=20),
|
||||
# Over the past 10 minutes
|
||||
time_window=timedelta(minutes=10),
|
||||
# When there are no messages in the pool queue
|
||||
operator=ComparisonOperationType.EQUALS,
|
||||
threshold=0,
|
||||
divide_per_instance=False,
|
||||
),
|
||||
scale_action=ScaleAction(
|
||||
direction=ScaleDirection.DECREASE,
|
||||
@ -194,7 +196,7 @@ def create_auto_scale_profile(
|
||||
|
||||
def default_auto_scale_profile(queue_uri: str, scaleset_size: int) -> AutoscaleProfile:
|
||||
return create_auto_scale_profile(
|
||||
queue_uri, 1, scaleset_size, scaleset_size, 1, 10, 1, 15
|
||||
queue_uri, 1, scaleset_size, scaleset_size, 1, 10, 1, 5
|
||||
)
|
||||
|
||||
|
||||
|
@ -5,6 +5,7 @@
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
from uuid import UUID
|
||||
|
||||
@ -437,8 +438,13 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
||||
|
||||
# Perform operations until they fail due to scaleset getting locked
|
||||
try:
|
||||
self.reimage_nodes(to_reimage, NodeDisaposalStrategy.scale_in)
|
||||
self.delete_nodes(to_delete, NodeDisaposalStrategy.scale_in)
|
||||
strategy_str = os.getenv("ONEFUZZ_NODE_DISPOSAL_STRATEGY", "scale_in")
|
||||
if strategy_str == "decomission":
|
||||
strategy = NodeDisaposalStrategy.decomission
|
||||
else:
|
||||
strategy = NodeDisaposalStrategy.scale_in
|
||||
self.reimage_nodes(to_reimage, strategy)
|
||||
self.delete_nodes(to_delete, strategy)
|
||||
except UnableToUpdate:
|
||||
logging.info(
|
||||
SCALESET_LOG_PREFIX
|
||||
@ -598,17 +604,23 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
||||
else:
|
||||
machine_ids.add(node.machine_id)
|
||||
|
||||
logging.info(
|
||||
SCALESET_LOG_PREFIX + "deleting nodes scaleset_id:%s machine_id:%s",
|
||||
self.scaleset_id,
|
||||
machine_ids,
|
||||
)
|
||||
delete_vmss_nodes(self.scaleset_id, machine_ids)
|
||||
for node in nodes:
|
||||
if node.machine_id in machine_ids:
|
||||
node.delete()
|
||||
if disposal_strategy == NodeDisaposalStrategy.scale_in:
|
||||
if disposal_strategy == NodeDisaposalStrategy.decomission:
|
||||
logging.info(SCALESET_LOG_PREFIX + "decomissioning nodes")
|
||||
for node in nodes:
|
||||
if node.machine_id in machine_ids:
|
||||
node.release_scale_in_protection()
|
||||
else:
|
||||
logging.info(
|
||||
SCALESET_LOG_PREFIX + "deleting nodes scaleset_id:%s machine_id:%s",
|
||||
self.scaleset_id,
|
||||
machine_ids,
|
||||
)
|
||||
delete_vmss_nodes(self.scaleset_id, machine_ids)
|
||||
for node in nodes:
|
||||
if node.machine_id in machine_ids:
|
||||
node.delete()
|
||||
if disposal_strategy == NodeDisaposalStrategy.scale_in:
|
||||
node.release_scale_in_protection()
|
||||
|
||||
def reimage_nodes(
|
||||
self, nodes: List[Node], disposal_strategy: NodeDisaposalStrategy
|
||||
@ -659,18 +671,24 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
||||
)
|
||||
return
|
||||
|
||||
result = reimage_vmss_nodes(self.scaleset_id, machine_ids)
|
||||
if isinstance(result, Error):
|
||||
raise Exception(
|
||||
"unable to reimage nodes: %s:%s - %s"
|
||||
% (self.scaleset_id, machine_ids, result)
|
||||
)
|
||||
|
||||
for node in nodes:
|
||||
if node.machine_id in machine_ids:
|
||||
node.delete()
|
||||
if disposal_strategy == NodeDisaposalStrategy.scale_in:
|
||||
if disposal_strategy == NodeDisaposalStrategy.decomission:
|
||||
logging.info(SCALESET_LOG_PREFIX + "decomissioning nodes")
|
||||
for node in nodes:
|
||||
if node.machine_id in machine_ids:
|
||||
node.release_scale_in_protection()
|
||||
else:
|
||||
result = reimage_vmss_nodes(self.scaleset_id, machine_ids)
|
||||
if isinstance(result, Error):
|
||||
raise Exception(
|
||||
"unable to reimage nodes: %s:%s - %s"
|
||||
% (self.scaleset_id, machine_ids, result)
|
||||
)
|
||||
|
||||
for node in nodes:
|
||||
if node.machine_id in machine_ids:
|
||||
node.delete()
|
||||
if disposal_strategy == NodeDisaposalStrategy.scale_in:
|
||||
node.release_scale_in_protection()
|
||||
|
||||
def set_shutdown(self, now: bool) -> None:
|
||||
if now:
|
||||
|
Reference in New Issue
Block a user