Create autoscale resources for scaleset (#1661)

* Initial progress on adding an auto scale resource

* auto scale API is ready

* When creating a scaleset, add an autoscale resource to it as well

* Auto scale is correctly linked with scaleset

* 🧹

* Lint

* Cleaned up
This commit is contained in:
Teo Voinea
2022-02-28 12:28:31 -05:00
committed by GitHub
parent c6f65c0f0e
commit 16166e1c14
8 changed files with 240 additions and 25 deletions

View File

@ -0,0 +1,145 @@
#!/usr/bin/env python
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import logging
import uuid
from datetime import timedelta
from typing import Any, Dict, Optional, Union
from uuid import UUID
from azure.core.exceptions import ResourceNotFoundError
from azure.mgmt.monitor.models import (
AutoscaleProfile,
AutoscaleSettingResource,
ComparisonOperationType,
MetricStatisticType,
MetricTrigger,
ScaleAction,
ScaleCapacity,
ScaleDirection,
ScaleRule,
ScaleType,
TimeAggregationType,
)
from msrestazure.azure_exceptions import CloudError
from onefuzztypes.enums import ErrorCode
from onefuzztypes.models import Error
from onefuzztypes.primitives import Region
from .creds import (
get_base_region,
get_base_resource_group,
get_subscription,
retry_on_auth_failure,
)
from .monitor import get_monitor_client
@retry_on_auth_failure()
def add_auto_scale_to_vmss(
    vmss: UUID, auto_scale_profile: AutoscaleProfile
) -> Optional[Error]:
    """Attach an autoscale resource to a scaleset unless one already targets it.

    The autoscale settings of the base resource group are listed, and a
    setting counts as belonging to the scaleset when its target resource URI
    ends with the scaleset id.

    Args:
        vmss: Id of the scaleset to attach the autoscale resource to.
        auto_scale_profile: Profile used if a new autoscale resource is created.

    Returns:
        ``None`` when an autoscale resource already exists or was created
        successfully; an ``Error`` when listing the existing settings or
        creating the new resource failed.
    """
    logging.info("Checking scaleset %s for existing auto scale resources" % vmss)
    monitor = get_monitor_client()
    group = get_base_resource_group()
    scaleset_suffix = str(vmss)

    try:
        settings = monitor.autoscale_settings.list_by_resource_group(group)
        # Stop at the first setting whose target is this scaleset.
        existing_id = next(
            (
                setting.id
                for setting in settings
                if str(setting.target_resource_uri).endswith(scaleset_suffix)
            ),
            None,
        )
    except (ResourceNotFoundError, CloudError):
        return Error(
            code=ErrorCode.INVALID_CONFIGURATION,
            errors=[
                "Failed to check if scaleset %s already has an autoscale resource"
                % vmss
            ],
        )

    if existing_id is not None:
        logging.warning("Scaleset %s already has auto scale resource" % vmss)
        return None

    outcome = create_auto_scale_resource_for(
        vmss, get_base_region(), auto_scale_profile
    )
    # Callers only care about failure; the created resource itself is dropped.
    return outcome if isinstance(outcome, Error) else None
def create_auto_scale_resource_for(
    resource_id: UUID, location: Region, profile: AutoscaleProfile
) -> Union[AutoscaleSettingResource, Error]:
    """Create an autoscale setting that targets the given scaleset.

    The setting is created in the base resource group under a freshly
    generated UUID name, with ``profile`` as its only profile.

    Args:
        resource_id: Id of the scaleset the autoscale setting should target.
        location: Region recorded on the autoscale setting.
        profile: Autoscale profile to install on the setting.

    Returns:
        The created ``AutoscaleSettingResource``, or an ``Error`` with code
        ``UNABLE_TO_CREATE`` if the service call raised
        ``ResourceNotFoundError`` or ``CloudError``.
    """
    logging.info("Creating auto scale resource for: %s" % resource_id)
    monitor = get_monitor_client()
    group = get_base_resource_group()
    subscription_id = get_subscription()

    # Full ARM id of the scaleset the autoscale setting will act on.
    target_uri = (
        "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s"  # noqa: E501
        % (subscription_id, group, resource_id)
    )

    settings: Dict[str, Any] = {
        "location": location,
        "profiles": [profile],
        "target_resource_uri": target_uri,
    }
    setting_name = str(uuid.uuid4())

    try:
        created = monitor.autoscale_settings.create_or_update(
            group, setting_name, settings
        )
        logging.info(
            "Successfully created auto scale resource %s for %s"
            % (created.id, resource_id)
        )
        return created
    except (ResourceNotFoundError, CloudError):
        return Error(
            code=ErrorCode.UNABLE_TO_CREATE,
            errors=[
                "unable to create auto scale resource for resource: %s with profile: %s"
                % (resource_id, profile)
            ],
        )
def create_auto_scale_profile(min: int, max: int, queue_uri: str) -> AutoscaleProfile:
    """Build an autoscale profile with one scale-up rule driven by a queue.

    Args:
        min: Minimum capacity of the profile.
        max: Maximum capacity of the profile; also used as the default capacity.
        queue_uri: Resource URI of the queue whose ``ApproximateMessageCount``
            metric triggers the rule.

    Returns:
        An ``AutoscaleProfile`` with a freshly generated UUID name and a
        single rule that increases the instance count by one.
    """
    profile_name = str(uuid.uuid4())
    capacity = ScaleCapacity(minimum=min, maximum=max, default=max)

    queue_backlog_trigger = MetricTrigger(
        metric_name="ApproximateMessageCount",
        metric_resource_uri=queue_uri,
        # Check every minute
        time_grain=timedelta(minutes=1),
        # The average amount of messages there are in the pool queue
        time_aggregation=TimeAggregationType.AVERAGE,
        statistic=MetricStatisticType.COUNT,
        # Over the past 10 minutes
        time_window=timedelta(minutes=10),
        # When there's more than 1 message in the pool queue
        operator=ComparisonOperationType.GREATER_THAN,
        threshold=1,
    )

    # Add a single instance, with a 5 minute cooldown between scale actions.
    add_one_instance = ScaleAction(
        direction=ScaleDirection.INCREASE,
        type=ScaleType.CHANGE_COUNT,
        value=1,
        cooldown=timedelta(minutes=5),
    )

    scale_up_rule = ScaleRule(
        metric_trigger=queue_backlog_trigger,
        scale_action=add_one_instance,
    )

    return AutoscaleProfile(
        name=profile_name,
        capacity=capacity,
        rules=[scale_up_rule],
    )

View File

@ -0,0 +1,29 @@
#!/usr/bin/env python
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import os
from typing import Dict
from azure.mgmt.loganalytics import LogAnalyticsManagementClient
from memoization import cached
from .creds import get_base_resource_group, get_identity, get_subscription
@cached
def get_monitor_client() -> LogAnalyticsManagementClient:
    """Return a Log Analytics management client.

    The client is built from the identity and subscription supplied by the
    creds helpers; the function is wrapped with ``@cached``.
    """
    credential = get_identity()
    subscription_id = get_subscription()
    return LogAnalyticsManagementClient(credential, subscription_id)
@cached(ttl=60)
def get_monitor_settings() -> Dict[str, str]:
    """Look up the id and primary shared key of the monitoring workspace.

    The workspace name is read from the ``ONEFUZZ_MONITOR`` environment
    variable and resolved inside the base resource group.

    Returns:
        A dict with ``"id"`` (the workspace customer id) and ``"key"`` (the
        workspace primary shared key).

    Raises:
        KeyError: If ``ONEFUZZ_MONITOR`` is not set in the environment.
    """
    group = get_base_resource_group()
    workspace = os.environ["ONEFUZZ_MONITOR"]
    analytics = get_monitor_client()

    workspace_info = analytics.workspaces.get(group, workspace)
    customer_id = workspace_info.customer_id

    keys = analytics.shared_keys.get_shared_keys(group, workspace)
    return {"id": customer_id, "key": keys.primary_shared_key}

View File

@ -1,29 +1,9 @@
#!/usr/bin/env python from azure.mgmt.monitor import MonitorManagementClient
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import os
from typing import Dict
from azure.mgmt.loganalytics import LogAnalyticsManagementClient
from memoization import cached from memoization import cached
from .creds import get_base_resource_group, get_identity, get_subscription from .creds import get_identity, get_subscription
@cached @cached
def get_monitor_client() -> LogAnalyticsManagementClient: def get_monitor_client() -> MonitorManagementClient:
return LogAnalyticsManagementClient(get_identity(), get_subscription()) return MonitorManagementClient(get_identity(), get_subscription())
@cached(ttl=60)
def get_monitor_settings() -> Dict[str, str]:
resource_group = get_base_resource_group()
workspace_name = os.environ["ONEFUZZ_MONITOR"]
client = get_monitor_client()
customer_id = client.workspaces.get(resource_group, workspace_name).customer_id
shared_key = client.shared_keys.get_shared_keys(
resource_group, workspace_name
).primary_shared_key
return {"id": customer_id, "key": shared_key}

View File

@ -195,3 +195,9 @@ def queue_object(
return True return True
except ResourceNotFoundError: except ResourceNotFoundError:
return False return False
def get_resource_id(queue_name: QueueNameType, storage_type: StorageType) -> str:
    """Return the resource URI of a queue.

    The URI is the primary account of ``storage_type`` followed by the
    ``/services/queue/queues/<queue_name>`` suffix.
    """
    return "%s/services/queue/queues/%s" % (
        get_primary_account(storage_type),
        queue_name,
    )

View File

@ -25,7 +25,7 @@ from .azure.containers import (
save_blob, save_blob,
) )
from .azure.creds import get_instance_id, get_instance_url from .azure.creds import get_instance_id, get_instance_url
from .azure.monitor import get_monitor_settings from .azure.log_analytics import get_monitor_settings
from .azure.queue import get_queue_sas from .azure.queue import get_queue_sas
from .azure.storage import StorageType from .azure.storage import StorageType
from .config import InstanceConfig from .config import InstanceConfig

View File

@ -23,8 +23,11 @@ from onefuzztypes.primitives import PoolName, Region
from ..__version__ import __version__ from ..__version__ import __version__
from ..azure.auth import build_auth from ..azure.auth import build_auth
from ..azure.auto_scale import add_auto_scale_to_vmss, create_auto_scale_profile
from ..azure.image import get_os from ..azure.image import get_os
from ..azure.network import Network from ..azure.network import Network
from ..azure.queue import get_resource_id
from ..azure.storage import StorageType
from ..azure.vmss import ( from ..azure.vmss import (
UnableToUpdate, UnableToUpdate,
create_vmss, create_vmss,
@ -242,6 +245,7 @@ class Scaleset(BASE_SCALESET, ORMMixin):
self.set_failed(result) self.set_failed(result)
return return
else: else:
# TODO: Link up auto scale resource with diagnostics
logging.info( logging.info(
SCALESET_LOG_PREFIX + "creating scaleset scaleset_id:%s", SCALESET_LOG_PREFIX + "creating scaleset scaleset_id:%s",
self.scaleset_id, self.scaleset_id,
@ -257,6 +261,11 @@ class Scaleset(BASE_SCALESET, ORMMixin):
SCALESET_LOG_PREFIX + "scaleset running scaleset_id:%s", SCALESET_LOG_PREFIX + "scaleset running scaleset_id:%s",
self.scaleset_id, self.scaleset_id,
) )
auto_scaling = self.try_to_enable_auto_scaling()
if isinstance(auto_scaling, Error):
self.set_failed(auto_scaling)
return
identity_result = self.try_set_identity(vmss) identity_result = self.try_set_identity(vmss)
if identity_result: if identity_result:
self.set_failed(identity_result) self.set_failed(identity_result)
@ -823,3 +832,32 @@ class Scaleset(BASE_SCALESET, ORMMixin):
state=self.state, state=self.state,
) )
) )
def try_to_enable_auto_scaling(self) -> Optional[Error]:
    """Attach an autoscale resource, driven by the pool queue, to this scaleset.

    The autoscale profile uses the scaleset's current size as both its
    minimum and maximum capacity, and the pool queue as its metric source.

    Returns:
        ``None`` when the autoscale resource is in place (newly created or
        already present); an ``Error`` when the pool cannot be found, the
        scaleset size cannot be read, or attaching the resource fails.
    """
    # Function-scope import kept from the original; presumably it avoids a
    # circular import between the pool and scaleset modules (to be confirmed).
    from .pools import Pool

    logging.info("Trying to add auto scaling for scaleset %s" % self.scaleset_id)

    pool = Pool.get_by_name(self.pool_name)
    if isinstance(pool, Error):
        logging.error(
            "Failed to get pool by name: %s error: %s" % (self.pool_name, pool)
        )
        return pool

    pool_queue_id = pool.get_pool_queue()
    pool_queue_uri = get_resource_id(pool_queue_id, StorageType.corpus)

    capacity = get_vmss_size(self.scaleset_id)
    if capacity is None:
        capacity_failed = Error(
            code=ErrorCode.UNABLE_TO_FIND,
            errors=["Failed to get capacity for scaleset %s" % self.scaleset_id],
        )
        logging.error(capacity_failed)
        return capacity_failed

    auto_scale_profile = create_auto_scale_profile(
        capacity, capacity, pool_queue_uri
    )

    # Report the outcome only after the attempt: the original logged
    # "Added auto scale resource" before the call, so the message appeared
    # even when attaching failed.
    result = add_auto_scale_to_vmss(self.scaleset_id, auto_scale_profile)
    if isinstance(result, Error):
        logging.error(
            "Failed to add auto scale resource to scaleset %s: %s"
            % (self.scaleset_id, result)
        )
    else:
        # The helper returns None both when it creates the resource and when
        # one already exists, so the wording covers both cases.
        logging.info("Auto scaling enabled for scaleset: %s" % self.scaleset_id)
    return result

View File

@ -9,6 +9,7 @@ azure-keyvault-secrets~=4.3.0
azure-mgmt-compute==24.0.1 azure-mgmt-compute==24.0.1
azure-mgmt-core==1.3.0 azure-mgmt-core==1.3.0
azure-mgmt-loganalytics~=11.0.0 azure-mgmt-loganalytics~=11.0.0
azure-mgmt-monitor==3.0.0
azure-mgmt-network==19.0.0 azure-mgmt-network==19.0.0
azure-mgmt-storage~=18.0.0 azure-mgmt-storage~=18.0.0
azure-mgmt-resource~=18.1.0 azure-mgmt-resource~=18.1.0

View File

@ -67,6 +67,7 @@
"Storage Account Contributor": "17d1049b-9a84-46fb-8f53-869881c3d3ab", "Storage Account Contributor": "17d1049b-9a84-46fb-8f53-869881c3d3ab",
"Virtual Machine Contributor": "9980e02c-c2be-4d73-94e8-173b1dc7cf3c", "Virtual Machine Contributor": "9980e02c-c2be-4d73-94e8-173b1dc7cf3c",
"Storage Blob Data Reader": "2a2b9908-6ea1-4ae2-8e65-a410df84e7d1", "Storage Blob Data Reader": "2a2b9908-6ea1-4ae2-8e65-a410df84e7d1",
"OneFuzz Deployment": "d4f7c2d9-6c1e-4caa-a39b-cba6d76bc647",
"keyVaultName": "[concat('of-kv-', uniquestring(resourceGroup().id))]" "keyVaultName": "[concat('of-kv-', uniquestring(resourceGroup().id))]"
}, },
"functions": [ "functions": [
@ -819,6 +820,21 @@
"OWNER": "[parameters('owner')]" "OWNER": "[parameters('owner')]"
} }
}, },
{
"type": "Microsoft.Authorization/roleAssignments",
"apiVersion": "2017-09-01",
"name": "[guid(concat(resourceGroup().id, '-auto_scale'))]",
"properties": {
"roleDefinitionId": "[concat('/subscriptions/', subscription().subscriptionId, '/providers/Microsoft.Authorization/roleDefinitions/', variables('OneFuzz Deployment'))]",
"principalId": "[reference(resourceId('Microsoft.Web/sites', parameters('name')), '2018-02-01', 'Full').identity.principalId]"
},
"DependsOn": [
"[resourceId('Microsoft.Web/sites', parameters('name'))]"
],
"tags": {
"OWNER": "[parameters('owner')]"
}
},
{ {
"type": "Microsoft.Authorization/roleAssignments", "type": "Microsoft.Authorization/roleAssignments",
"apiVersion": "2018-07-01", "apiVersion": "2018-07-01",