mirror of
https://github.com/microsoft/onefuzz.git
synced 2025-06-18 12:48:07 +00:00
Abstract out node disposal (#1686)
* Abstract node disposal strategy * Cleanup + lint * Handle scalesets possibly being in the resize state * Setting the size is still exposed via CLI, we don't want to break that functionality * PR comments
This commit is contained in:
@ -118,28 +118,54 @@ def create_auto_scale_profile(min: int, max: int, queue_uri: str) -> AutoscalePr
|
||||
return AutoscaleProfile(
|
||||
name=str(uuid.uuid4()),
|
||||
capacity=ScaleCapacity(minimum=min, maximum=max, default=max),
|
||||
# Auto scale tuning guidance:
|
||||
# https://docs.microsoft.com/en-us/azure/architecture/best-practices/auto-scaling
|
||||
rules=[
|
||||
ScaleRule(
|
||||
metric_trigger=MetricTrigger(
|
||||
metric_name="ApproximateMessageCount",
|
||||
metric_resource_uri=queue_uri,
|
||||
# Check every minute
|
||||
time_grain=timedelta(minutes=1),
|
||||
# Check every 15 minutes
|
||||
time_grain=timedelta(minutes=15),
|
||||
# The average amount of messages there are in the pool queue
|
||||
time_aggregation=TimeAggregationType.AVERAGE,
|
||||
statistic=MetricStatisticType.COUNT,
|
||||
# Over the past 10 minutes
|
||||
time_window=timedelta(minutes=10),
|
||||
# Over the past 15 minutes
|
||||
time_window=timedelta(minutes=15),
|
||||
# When there's more than 1 message in the pool queue
|
||||
operator=ComparisonOperationType.GREATER_THAN,
|
||||
operator=ComparisonOperationType.GREATER_THAN_OR_EQUAL,
|
||||
threshold=1,
|
||||
),
|
||||
scale_action=ScaleAction(
|
||||
direction=ScaleDirection.INCREASE,
|
||||
type=ScaleType.CHANGE_COUNT,
|
||||
value=1,
|
||||
cooldown=timedelta(minutes=5),
|
||||
value=2,
|
||||
cooldown=timedelta(minutes=10),
|
||||
),
|
||||
)
|
||||
),
|
||||
# Scale in
|
||||
ScaleRule(
|
||||
# Scale in if no work in the past 20 mins
|
||||
metric_trigger=MetricTrigger(
|
||||
metric_name="ApproximateMessageCount",
|
||||
metric_resource_uri=queue_uri,
|
||||
# Check every 20 minutes
|
||||
time_grain=timedelta(minutes=20),
|
||||
# The average amount of messages there are in the pool queue
|
||||
time_aggregation=TimeAggregationType.AVERAGE,
|
||||
statistic=MetricStatisticType.SUM,
|
||||
# Over the past 20 minutes
|
||||
time_window=timedelta(minutes=20),
|
||||
# When there's no messages in the pool queue
|
||||
operator=ComparisonOperationType.EQUALS,
|
||||
threshold=0,
|
||||
),
|
||||
scale_action=ScaleAction(
|
||||
direction=ScaleDirection.DECREASE,
|
||||
type=ScaleType.CHANGE_COUNT,
|
||||
value=1,
|
||||
cooldown=timedelta(minutes=15),
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
|
@ -8,7 +8,13 @@ import logging
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
from uuid import UUID
|
||||
|
||||
from onefuzztypes.enums import ErrorCode, NodeState, PoolState, ScalesetState
|
||||
from onefuzztypes.enums import (
|
||||
ErrorCode,
|
||||
NodeDisaposalStrategy,
|
||||
NodeState,
|
||||
PoolState,
|
||||
ScalesetState,
|
||||
)
|
||||
from onefuzztypes.events import (
|
||||
EventScalesetCreated,
|
||||
EventScalesetDeleted,
|
||||
@ -420,8 +426,8 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
||||
|
||||
# Perform operations until they fail due to scaleset getting locked
|
||||
try:
|
||||
self.delete_nodes(to_delete)
|
||||
self.reimage_nodes(to_reimage)
|
||||
self.reimage_nodes(to_reimage, NodeDisaposalStrategy.scale_in)
|
||||
self.delete_nodes(to_delete, NodeDisaposalStrategy.scale_in)
|
||||
except UnableToUpdate:
|
||||
logging.info(
|
||||
SCALESET_LOG_PREFIX
|
||||
@ -491,6 +497,8 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
||||
return
|
||||
|
||||
if size != self.size:
|
||||
# Azure auto-scaled us or nodes were manually added/removed
|
||||
# New node state will be synced in cleanup_nodes
|
||||
logging.info(
|
||||
SCALESET_LOG_PREFIX + "unexpected scaleset size, resizing. "
|
||||
"scaleset_id:%s expected:%d actual:%d",
|
||||
@ -498,7 +506,8 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
||||
self.size,
|
||||
size,
|
||||
)
|
||||
self.set_state(ScalesetState.resize)
|
||||
self.size = size
|
||||
self.save()
|
||||
|
||||
def set_size(self, size: int) -> None:
|
||||
# ensure we always stay within max_size boundaries
|
||||
@ -545,7 +554,9 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
||||
else:
|
||||
self._resize_shrink(size - self.size)
|
||||
|
||||
def delete_nodes(self, nodes: List[Node]) -> None:
|
||||
def delete_nodes(
|
||||
self, nodes: List[Node], disposal_strategy: NodeDisaposalStrategy
|
||||
) -> None:
|
||||
if not nodes:
|
||||
logging.info(
|
||||
SCALESET_LOG_PREFIX + "no nodes to delete. scaleset_id:%s",
|
||||
@ -585,8 +596,12 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
||||
for node in nodes:
|
||||
if node.machine_id in machine_ids:
|
||||
node.delete()
|
||||
if disposal_strategy == NodeDisaposalStrategy.scale_in:
|
||||
node.release_scale_in_protection()
|
||||
|
||||
def reimage_nodes(self, nodes: List[Node]) -> None:
|
||||
def reimage_nodes(
|
||||
self, nodes: List[Node], disposal_strategy: NodeDisaposalStrategy
|
||||
) -> None:
|
||||
if not nodes:
|
||||
logging.info(
|
||||
SCALESET_LOG_PREFIX + "no nodes to reimage: scaleset_id:%s",
|
||||
@ -601,7 +616,7 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
||||
+ "scaleset_id:%s",
|
||||
self.scaleset_id,
|
||||
)
|
||||
self.delete_nodes(nodes)
|
||||
self.delete_nodes(nodes, disposal_strategy)
|
||||
return
|
||||
|
||||
if self.state == ScalesetState.halt:
|
||||
@ -643,6 +658,8 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
||||
for node in nodes:
|
||||
if node.machine_id in machine_ids:
|
||||
node.delete()
|
||||
if disposal_strategy == NodeDisaposalStrategy.scale_in:
|
||||
node.release_scale_in_protection()
|
||||
|
||||
def set_shutdown(self, now: bool) -> None:
|
||||
if now:
|
||||
@ -852,8 +869,6 @@ class Scaleset(BASE_SCALESET, ORMMixin):
|
||||
logging.error(capacity_failed)
|
||||
return capacity_failed
|
||||
|
||||
auto_scale_profile = create_auto_scale_profile(
|
||||
capacity, capacity, pool_queue_uri
|
||||
)
|
||||
auto_scale_profile = create_auto_scale_profile(1, capacity, pool_queue_uri)
|
||||
logging.info("Added auto scale resource to scaleset: %s" % self.scaleset_id)
|
||||
return add_auto_scale_to_vmss(self.scaleset_id, auto_scale_profile)
|
||||
|
Reference in New Issue
Block a user