Delete nodes when they're done (#1763)

* Delete nodes when they're done

* Missed a file

* Load node disposal strategy from env var

* Lint

* Fix subtle bug

* Deleting doesn't work, will 'decomission' nodes once they complete work

* Missed a file

* Remove logging line
Author: Teo Voinea
Date: 2022-04-12 13:32:15 -04:00
Committed by: GitHub
Parent: faaa5d2d78
Commit: 87eb606b35
3 changed files with 49 additions and 28 deletions
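Before the diffs, a sketch of the behavioral core of this change: the node disposal strategy is now chosen from an environment variable instead of being hard-coded to scale_in. The following is a minimal, self-contained rendering of the selection logic that appears in the scaleset diff below; the standalone enum and the helper function name are for illustration only, and the enum spelling deliberately matches the existing codebase identifiers.

import os
from enum import Enum


class NodeDisaposalStrategy(Enum):
    # Spelling matches the codebase; see the enums diff at the end of this commit.
    scale_in = "scale_in"
    decomission = "decomission"


def disposal_strategy_from_env() -> NodeDisaposalStrategy:
    # Anything other than the exact string "decomission" (including an unset
    # variable) falls back to the previous behavior, scale_in.
    strategy_str = os.getenv("ONEFUZZ_NODE_DISPOSAL_STRATEGY", "scale_in")
    if strategy_str == "decomission":
        return NodeDisaposalStrategy.decomission
    return NodeDisaposalStrategy.scale_in

Note that the comparison is exact, so the conventionally spelled value "decommission" would silently select scale_in.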


@@ -156,6 +156,7 @@ def create_auto_scale_profile(
                     # When there's more than 1 message in the pool queue
                     operator=ComparisonOperationType.GREATER_THAN_OR_EQUAL,
                     threshold=1,
+                    divide_per_instance=False,
                 ),
                 scale_action=ScaleAction(
                     direction=ScaleDirection.INCREASE,
@@ -170,16 +171,17 @@ def create_auto_scale_profile(
                 metric_trigger=MetricTrigger(
                     metric_name="ApproximateMessageCount",
                     metric_resource_uri=queue_uri,
-                    # Check every 20 minutes
-                    time_grain=timedelta(minutes=20),
+                    # Check every 10 minutes
+                    time_grain=timedelta(minutes=10),
                     # The average amount of messages there are in the pool queue
                     time_aggregation=TimeAggregationType.AVERAGE,
                     statistic=MetricStatisticType.SUM,
-                    # Over the past 20 minutes
-                    time_window=timedelta(minutes=20),
+                    # Over the past 10 minutes
+                    time_window=timedelta(minutes=10),
                     # When there's no messages in the pool queue
                     operator=ComparisonOperationType.EQUALS,
                     threshold=0,
+                    divide_per_instance=False,
                 ),
                 scale_action=ScaleAction(
                     direction=ScaleDirection.DECREASE,
@@ -194,7 +196,7 @@ def create_auto_scale_profile(
 def default_auto_scale_profile(queue_uri: str, scaleset_size: int) -> AutoscaleProfile:
     return create_auto_scale_profile(
-        queue_uri, 1, scaleset_size, scaleset_size, 1, 10, 1, 15
+        queue_uri, 1, scaleset_size, scaleset_size, 1, 10, 1, 5
     )

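To make the autoscale change concrete: the scale-in trigger now samples the pool queue's ApproximateMessageCount every 10 minutes over a 10-minute window (down from 20/20), gains divide_per_instance=False, and the last argument to create_auto_scale_profile in the default profile drops from 15 to 5. Below is a sketch of the resulting scale-in rule, assuming the azure.mgmt.monitor model classes this module already references; the ScaleAction fields other than direction (type, value, cooldown) are not visible in the hunk and are illustrative guesses.

from datetime import timedelta

from azure.mgmt.monitor.models import (
    ComparisonOperationType,
    MetricStatisticType,
    MetricTrigger,
    ScaleAction,
    ScaleDirection,
    ScaleRule,
    ScaleType,
    TimeAggregationType,
)


def scale_in_rule(queue_uri: str) -> ScaleRule:
    # Shrink the scaleset when the pool queue has been empty, on average,
    # over the last 10 minutes.
    return ScaleRule(
        metric_trigger=MetricTrigger(
            metric_name="ApproximateMessageCount",
            metric_resource_uri=queue_uri,
            time_grain=timedelta(minutes=10),
            statistic=MetricStatisticType.SUM,
            time_window=timedelta(minutes=10),
            time_aggregation=TimeAggregationType.AVERAGE,
            operator=ComparisonOperationType.EQUALS,
            threshold=0,
            divide_per_instance=False,
        ),
        scale_action=ScaleAction(
            direction=ScaleDirection.DECREASE,
            # The fields below are assumptions, not taken from the diff.
            type=ScaleType.CHANGE_COUNT,
            value="1",
            cooldown=timedelta(minutes=5),
        ),
    )

With the grain and window halved from 20 to 10 minutes, an idle scaleset is detected and scaled in roughly twice as quickly as before.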

@@ -5,6 +5,7 @@
 import datetime
 import logging
+import os
 from typing import Any, Dict, List, Optional, Tuple, Union
 from uuid import UUID
@@ -437,8 +438,13 @@ class Scaleset(BASE_SCALESET, ORMMixin):
         # Perform operations until they fail due to scaleset getting locked
         try:
-            self.reimage_nodes(to_reimage, NodeDisaposalStrategy.scale_in)
-            self.delete_nodes(to_delete, NodeDisaposalStrategy.scale_in)
+            strategy_str = os.getenv("ONEFUZZ_NODE_DISPOSAL_STRATEGY", "scale_in")
+            if strategy_str == "decomission":
+                strategy = NodeDisaposalStrategy.decomission
+            else:
+                strategy = NodeDisaposalStrategy.scale_in
+            self.reimage_nodes(to_reimage, strategy)
+            self.delete_nodes(to_delete, strategy)
         except UnableToUpdate:
             logging.info(
                 SCALESET_LOG_PREFIX
@@ -598,6 +604,12 @@ class Scaleset(BASE_SCALESET, ORMMixin):
             else:
                 machine_ids.add(node.machine_id)
+        if disposal_strategy == NodeDisaposalStrategy.decomission:
+            logging.info(SCALESET_LOG_PREFIX + "decomissioning nodes")
+            for node in nodes:
+                if node.machine_id in machine_ids:
+                    node.release_scale_in_protection()
+        else:
             logging.info(
                 SCALESET_LOG_PREFIX + "deleting nodes scaleset_id:%s machine_id:%s",
                 self.scaleset_id,
@@ -659,6 +671,12 @@ class Scaleset(BASE_SCALESET, ORMMixin):
             )
             return
+        if disposal_strategy == NodeDisaposalStrategy.decomission:
+            logging.info(SCALESET_LOG_PREFIX + "decomissioning nodes")
+            for node in nodes:
+                if node.machine_id in machine_ids:
+                    node.release_scale_in_protection()
+        else:
             result = reimage_vmss_nodes(self.scaleset_id, machine_ids)
             if isinstance(result, Error):
                 raise Exception(

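Both delete_nodes and reimage_nodes gain the same decomission branch: rather than deleting or reimaging a finished node directly, they release its scale-in protection and leave the actual instance removal to the queue-driven scale-in rule above. A condensed sketch of that shared branch follows; the helper name and the NodeLike protocol are illustrative, release_scale_in_protection is the method referenced in the diff (its implementation is not part of this commit), and the log prefix value is a placeholder.

import logging
from typing import List, Protocol, Set
from uuid import UUID

# Placeholder; the real prefix is defined in the scaleset module.
SCALESET_LOG_PREFIX = "scalesets: "


class NodeLike(Protocol):
    machine_id: UUID

    def release_scale_in_protection(self) -> None: ...


def decomission_nodes(nodes: List[NodeLike], machine_ids: Set[UUID]) -> None:
    # Instead of calling the VMSS delete/reimage APIs, drop scale-in protection
    # so the autoscaler can reclaim the instance once the queue-based rule
    # decides to shrink the scaleset.
    logging.info(SCALESET_LOG_PREFIX + "decomissioning nodes")
    for node in nodes:
        if node.machine_id in machine_ids:
            node.release_scale_in_protection()

Per the commit message ("Deleting doesn't work"), direct deletion is exactly what this path avoids: decomissioned nodes are simply unprotected and left for the autoscaler to remove.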

@@ -417,3 +417,4 @@ class UserFieldType(Enum):
 class NodeDisaposalStrategy(Enum):
     scale_in = "scale_in"
+    decomission = "decomission"