mirror of
https://github.com/microsoft/onefuzz.git
synced 2025-06-18 20:58:06 +00:00
Enable scale in protection on VMSS instances (#1647)
* draft attempt at adding scaling protection
* Service can now control scaling protection policy on VM instances
* Improve logging a bit
* draft attempt at adding scaling protection
* Service can now control scaling protection policy on VM instances
* Improve logging a bit
* Error message was missing info
* Linter
* Don't schedule work if we can't protect the node
* Last of the linter changes
This commit is contained in:
@ -39,6 +39,9 @@ def post(req: func.HttpRequest) -> func.HttpResponse:
|
|||||||
if work_stopped:
|
if work_stopped:
|
||||||
allowed = False
|
allowed = False
|
||||||
|
|
||||||
|
if allowed:
|
||||||
|
allowed = not isinstance(node.acquire_scale_in_protection(), Error)
|
||||||
|
|
||||||
return ok(CanSchedule(allowed=allowed, work_stopped=work_stopped))
|
return ok(CanSchedule(allowed=allowed, work_stopped=work_stopped))
|
||||||
|
|
||||||
|
|
||||||
|
@ -18,6 +18,7 @@ from azure.mgmt.compute.models import (
|
|||||||
ResourceSkuRestrictionsType,
|
ResourceSkuRestrictionsType,
|
||||||
VirtualMachineScaleSetVMInstanceIDs,
|
VirtualMachineScaleSetVMInstanceIDs,
|
||||||
VirtualMachineScaleSetVMInstanceRequiredIDs,
|
VirtualMachineScaleSetVMInstanceRequiredIDs,
|
||||||
|
VirtualMachineScaleSetVMProtectionPolicy,
|
||||||
)
|
)
|
||||||
from memoization import cached
|
from memoization import cached
|
||||||
from msrestazure.azure_exceptions import CloudError
|
from msrestazure.azure_exceptions import CloudError
|
||||||
@ -148,6 +149,54 @@ def get_instance_id(name: UUID, vm_id: UUID) -> Union[str, Error]:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@retry_on_auth_failure()
def update_scale_in_protection(
    name: UUID, vm_id: UUID, protect_from_scale_in: bool
) -> Optional[Error]:
    """Set the scale-in protection flag on a single scaleset VM instance.

    Args:
        name: Identifier of the scaleset that owns the VM.
        vm_id: Machine id of the node; resolved to a scaleset instance id
            via ``get_instance_id``.
        protect_from_scale_in: ``True`` to protect the instance from
            scale-in, ``False`` to remove the protection.

    Returns:
        ``None`` once the update request has been submitted, otherwise an
        ``Error``: the one returned by ``get_instance_id``, ``UNABLE_TO_FIND``
        when the instance cannot be fetched, or ``UNABLE_TO_UPDATE`` when the
        update request is rejected.
    """
    instance_id = get_instance_id(name, vm_id)
    if isinstance(instance_id, Error):
        return instance_id

    compute_client = get_compute_client()
    resource_group = get_base_resource_group()

    try:
        instance_vm = compute_client.virtual_machine_scale_set_vms.get(
            resource_group, name, instance_id
        )
    except (ResourceNotFoundError, CloudError):
        return Error(
            code=ErrorCode.UNABLE_TO_FIND,
            errors=["unable to find vm instance: %s:%s" % (name, instance_id)],
        )

    # Keep any existing policy object so its other settings are preserved;
    # only build a fresh policy when the instance has none.
    if instance_vm.protection_policy is None:
        instance_vm.protection_policy = VirtualMachineScaleSetVMProtectionPolicy(
            protect_from_scale_in=protect_from_scale_in
        )
    else:
        instance_vm.protection_policy.protect_from_scale_in = protect_from_scale_in

    try:
        # NOTE(review): the value returned by begin_update is discarded, so
        # only failures raised while submitting the request are caught here.
        # Presumably this is a long-running operation that completes later;
        # confirm whether callers need to wait for it.
        compute_client.virtual_machine_scale_set_vms.begin_update(
            resource_group, name, instance_id, instance_vm
        )
    except (ResourceNotFoundError, CloudError):
        return Error(
            code=ErrorCode.UNABLE_TO_UPDATE,
            errors=[
                # Report the scaleset as well as the instance and node so the
                # message identifies the VM the same way the lookup error does.
                "unable to set protection policy on: %s:%s (node %s)"
                % (name, instance_id, vm_id)
            ],
        )

    logging.info(
        "Successfully set scale in protection on node %s to %s",
        vm_id,
        protect_from_scale_in,
    )
    return None
|
||||||
|
|
||||||
|
|
||||||
class UnableToUpdate(Exception):
|
class UnableToUpdate(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -28,7 +28,7 @@ from onefuzztypes.primitives import PoolName
|
|||||||
from pydantic import Field
|
from pydantic import Field
|
||||||
|
|
||||||
from ..__version__ import __version__
|
from ..__version__ import __version__
|
||||||
from ..azure.vmss import get_instance_id
|
from ..azure.vmss import get_instance_id, update_scale_in_protection
|
||||||
from ..events import send_event
|
from ..events import send_event
|
||||||
from ..orm import MappingIntStrAny, ORMMixin, QueryFilter
|
from ..orm import MappingIntStrAny, ORMMixin, QueryFilter
|
||||||
from ..versions import is_minimum_version
|
from ..versions import is_minimum_version
|
||||||
@ -390,6 +390,7 @@ class Node(BASE_NODE, ORMMixin):
|
|||||||
# if we're going to reimage, make sure the node doesn't pick up new work
|
# if we're going to reimage, make sure the node doesn't pick up new work
|
||||||
# too.
|
# too.
|
||||||
self.send_stop_if_free()
|
self.send_stop_if_free()
|
||||||
|
self.release_scale_in_protection()
|
||||||
|
|
||||||
self.save()
|
self.save()
|
||||||
|
|
||||||
@ -502,6 +503,26 @@ class Node(BASE_NODE, ORMMixin):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def acquire_scale_in_protection(self) -> Optional[Error]:
    """Turn on scale-in protection for this node's scaleset instance.

    Does nothing and returns ``None`` unless ``scaleset_node_exists()`` is
    true and the node has a ``scaleset_id``. Otherwise returns whatever
    ``update_scale_in_protection`` returns (``None`` or an ``Error``).
    """
    if not self.scaleset_node_exists():
        return None
    if not self.scaleset_id:
        return None

    logging.info("Setting scale-in protection on node %s", self.machine_id)
    return update_scale_in_protection(
        self.scaleset_id, self.machine_id, protect_from_scale_in=True
    )
|
||||||
|
|
||||||
|
def release_scale_in_protection(self) -> Optional[Error]:
    """Turn off scale-in protection for this node's scaleset instance.

    Does nothing and returns ``None`` when ``debug_keep_node`` is set, when
    ``scaleset_node_exists()`` is false, or when the node has no
    ``scaleset_id``. Otherwise returns whatever
    ``update_scale_in_protection`` returns (``None`` or an ``Error``).
    """
    # Nodes flagged with debug_keep_node keep their protection.
    if self.debug_keep_node:
        return None
    if not self.scaleset_node_exists():
        return None
    if not self.scaleset_id:
        return None

    logging.info("Removing scale-in protection on node %s", self.machine_id)
    return update_scale_in_protection(
        self.scaleset_id, self.machine_id, protect_from_scale_in=False
    )
|
||||||
|
|
||||||
|
|
||||||
class NodeTasks(BASE_NODE_TASK, ORMMixin):
|
class NodeTasks(BASE_NODE_TASK, ORMMixin):
|
||||||
@classmethod
|
@classmethod
|
||||||
|
Reference in New Issue
Block a user