Enable scale-in protection on VMSS instances (#1647)

* draft attempt at adding scaling protection

* Service can now control scaling protection policy on VM instances

* Improve logging a bit

* draft attempt at adding scaling protection

* Service can now control scaling protection policy on VM instances

* Improve logging a bit

* Error message was missing info

* Linter

* Don't schedule work if we can't protect the node

* Last of the linter changes
This commit is contained in:
Teo Voinea
2022-02-14 09:56:55 -05:00
committed by GitHub
parent 77dcd57b46
commit 5d8516bd70
3 changed files with 74 additions and 1 deletions

View File

@ -39,6 +39,9 @@ def post(req: func.HttpRequest) -> func.HttpResponse:
if work_stopped: if work_stopped:
allowed = False allowed = False
if allowed:
allowed = not isinstance(node.acquire_scale_in_protection(), Error)
return ok(CanSchedule(allowed=allowed, work_stopped=work_stopped)) return ok(CanSchedule(allowed=allowed, work_stopped=work_stopped))

View File

@ -18,6 +18,7 @@ from azure.mgmt.compute.models import (
ResourceSkuRestrictionsType, ResourceSkuRestrictionsType,
VirtualMachineScaleSetVMInstanceIDs, VirtualMachineScaleSetVMInstanceIDs,
VirtualMachineScaleSetVMInstanceRequiredIDs, VirtualMachineScaleSetVMInstanceRequiredIDs,
VirtualMachineScaleSetVMProtectionPolicy,
) )
from memoization import cached from memoization import cached
from msrestazure.azure_exceptions import CloudError from msrestazure.azure_exceptions import CloudError
@ -148,6 +149,54 @@ def get_instance_id(name: UUID, vm_id: UUID) -> Union[str, Error]:
) )
@retry_on_auth_failure()
def update_scale_in_protection(
    name: UUID, vm_id: UUID, protect_from_scale_in: bool
) -> Optional[Error]:
    """Set the scale-in protection flag on a single scaleset VM instance.

    Args:
        name: Scaleset identifier; passed to the compute client as the
            scaleset name.
        vm_id: Identifier of the VM; resolved to a scaleset instance id
            through ``get_instance_id``.
        protect_from_scale_in: Value to store in the instance's protection
            policy.

    Returns:
        ``None`` once the update request has been handed to the compute
        client. Otherwise an ``Error``: the one produced by
        ``get_instance_id``, ``UNABLE_TO_FIND`` when the instance cannot be
        fetched, or ``UNABLE_TO_UPDATE`` when the update call raises.
    """
    instance_id = get_instance_id(name, vm_id)
    if isinstance(instance_id, Error):
        return instance_id

    compute_client = get_compute_client()
    resource_group = get_base_resource_group()

    try:
        instance_vm = compute_client.virtual_machine_scale_set_vms.get(
            resource_group, name, instance_id
        )
    except (ResourceNotFoundError, CloudError) as err:
        return Error(
            code=ErrorCode.UNABLE_TO_FIND,
            errors=[
                "unable to find vm instance: %s:%s" % (name, instance_id),
                # keep the underlying cause so callers can tell a missing
                # instance apart from other service failures
                str(err),
            ],
        )

    # Reuse the existing policy object when there is one so that any other
    # fields it carries are sent back unchanged; only build a new policy
    # when the instance has none.
    policy = instance_vm.protection_policy
    if policy is None:
        policy = VirtualMachineScaleSetVMProtectionPolicy(
            protect_from_scale_in=protect_from_scale_in
        )
    else:
        policy.protect_from_scale_in = protect_from_scale_in
    instance_vm.protection_policy = policy

    try:
        # NOTE(review): the value returned by begin_update is discarded, so
        # only failures raised while submitting the request are reported
        # here. Presumably it is a long-running-operation poller; confirm
        # whether the result should be awaited.
        compute_client.virtual_machine_scale_set_vms.begin_update(
            resource_group, name, instance_id, instance_vm
        )
    except (ResourceNotFoundError, CloudError) as err:
        return Error(
            code=ErrorCode.UNABLE_TO_UPDATE,
            errors=[
                "unable to set protection policy on: %s:%s" % (vm_id, instance_id),
                str(err),
            ],
        )

    # Pass arguments to the logger instead of pre-formatting the message.
    logging.info(
        "Successfully set scale in protection on node %s to %s",
        vm_id,
        protect_from_scale_in,
    )
    return None
class UnableToUpdate(Exception): class UnableToUpdate(Exception):
pass pass

View File

@ -28,7 +28,7 @@ from onefuzztypes.primitives import PoolName
from pydantic import Field from pydantic import Field
from ..__version__ import __version__ from ..__version__ import __version__
from ..azure.vmss import get_instance_id from ..azure.vmss import get_instance_id, update_scale_in_protection
from ..events import send_event from ..events import send_event
from ..orm import MappingIntStrAny, ORMMixin, QueryFilter from ..orm import MappingIntStrAny, ORMMixin, QueryFilter
from ..versions import is_minimum_version from ..versions import is_minimum_version
@ -390,6 +390,7 @@ class Node(BASE_NODE, ORMMixin):
# if we're going to reimage, make sure the node doesn't pick up new work # if we're going to reimage, make sure the node doesn't pick up new work
# too. # too.
self.send_stop_if_free() self.send_stop_if_free()
self.release_scale_in_protection()
self.save() self.save()
@ -502,6 +503,26 @@ class Node(BASE_NODE, ORMMixin):
) )
) )
def acquire_scale_in_protection(self) -> Optional[Error]:
    """Request scale-in protection for the VM backing this node.

    Returns ``None`` without doing anything when ``scaleset_node_exists()``
    is falsy or ``scaleset_id`` is unset; otherwise returns the result of
    ``update_scale_in_protection`` called with ``protect_from_scale_in=True``.
    """
    # Nothing to protect unless the node has a scaleset VM behind it.
    if not (self.scaleset_node_exists() and self.scaleset_id):
        return None

    logging.info("Setting scale-in protection on node %s", self.machine_id)
    outcome = update_scale_in_protection(
        self.scaleset_id, self.machine_id, protect_from_scale_in=True
    )
    return outcome
def release_scale_in_protection(self) -> Optional[Error]:
    """Drop scale-in protection from the VM backing this node.

    Returns ``None`` without doing anything when ``debug_keep_node`` is
    set, when ``scaleset_node_exists()`` is falsy, or when ``scaleset_id``
    is unset; otherwise returns the result of ``update_scale_in_protection``
    called with ``protect_from_scale_in=False``.
    """
    # Nodes flagged with debug_keep_node keep their protection.
    if self.debug_keep_node:
        return None

    # Only scaleset-backed nodes have a protection policy to change.
    if not self.scaleset_node_exists() or not self.scaleset_id:
        return None

    logging.info("Removing scale-in protection on node %s", self.machine_id)
    outcome = update_scale_in_protection(
        self.scaleset_id, self.machine_id, protect_from_scale_in=False
    )
    return outcome
class NodeTasks(BASE_NODE_TASK, ORMMixin): class NodeTasks(BASE_NODE_TASK, ORMMixin):
@classmethod @classmethod