@retry_on_auth_failure()
def add_auto_scale_to_vmss(
    vmss: UUID, auto_scale_profile: AutoscaleProfile
) -> Optional[Error]:
    """Attach an autoscale setting to a scaleset unless one already targets it.

    Every autoscale setting in the base resource group is inspected; a
    setting is considered to belong to the scaleset when its
    ``target_resource_uri`` ends with the scaleset id.

    Args:
        vmss: Id of the scaleset to configure.
        auto_scale_profile: Profile to install when no setting exists yet.

    Returns:
        ``None`` when a setting already exists or was created, otherwise an
        ``Error`` describing which step failed (including the text of the
        underlying Azure exception).
    """
    logging.info("Checking scaleset %s for existing auto scale resources", vmss)
    client = get_monitor_client()
    resource_group = get_base_resource_group()
    vmss_suffix = str(vmss)

    try:
        # The listing is iterated inside the ``try`` so that failures raised
        # while consuming the results are reported the same way as failures
        # raised by the call itself.
        already_configured = any(
            str(setting.target_resource_uri).endswith(vmss_suffix)
            for setting in client.autoscale_settings.list_by_resource_group(
                resource_group
            )
        )
    except (ResourceNotFoundError, CloudError) as err:
        logging.error(
            "Failed to list auto scale resources for scaleset %s: %s", vmss, err
        )
        return Error(
            code=ErrorCode.INVALID_CONFIGURATION,
            errors=[
                "Failed to check if scaleset %s already has an autoscale resource"
                % vmss,
                # Keep the underlying cause so the failure can be diagnosed.
                str(err),
            ],
        )

    if already_configured:
        logging.warning("Scaleset %s already has auto scale resource", vmss)
        return None

    resource_creation = create_auto_scale_resource_for(
        vmss, get_base_region(), auto_scale_profile
    )
    if isinstance(resource_creation, Error):
        return resource_creation
    return None


def create_auto_scale_resource_for(
    resource_id: UUID, location: Region, profile: AutoscaleProfile
) -> Union[AutoscaleSettingResource, Error]:
    """Create an autoscale setting that targets the given scaleset.

    The setting is created in the base resource group under a freshly
    generated UUID name and points at the scaleset URI built from the
    subscription, the base resource group and ``resource_id``.

    Args:
        resource_id: Id of the scaleset the setting should target.
        location: Region in which the setting is created.
        profile: The single autoscale profile to install.

    Returns:
        The created ``AutoscaleSettingResource``, or an ``Error`` (including
        the text of the underlying Azure exception) when creation fails.
    """
    logging.info("Creating auto scale resource for: %s", resource_id)
    client = get_monitor_client()
    resource_group = get_base_resource_group()
    subscription = get_subscription()

    scaleset_uri = (
        "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s"  # noqa: E501
        % (subscription, resource_group, resource_id)
    )

    params: Dict[str, Any] = {
        "location": location,
        "profiles": [profile],
        "target_resource_uri": scaleset_uri,
    }

    try:
        auto_scale_resource = client.autoscale_settings.create_or_update(
            resource_group, str(uuid.uuid4()), params
        )
    except (ResourceNotFoundError, CloudError) as err:
        logging.error(
            "Failed to create auto scale resource for %s: %s", resource_id, err
        )
        return Error(
            code=ErrorCode.UNABLE_TO_CREATE,
            errors=[
                "unable to create auto scale resource for resource: %s with profile: %s"
                % (resource_id, profile),
                # Keep the underlying cause so the failure can be diagnosed.
                str(err),
            ],
        )

    logging.info(
        "Successfully created auto scale resource %s for %s",
        auto_scale_resource.id,
        resource_id,
    )
    return auto_scale_resource


def create_auto_scale_profile(min: int, max: int, queue_uri: str) -> AutoscaleProfile:
    """Build an autoscale profile driven by the message count of a queue.

    The profile carries a single scale-out rule: the instance count is
    increased by one (with a five minute cooldown) when the
    ``ApproximateMessageCount`` metric of ``queue_uri`` is greater than 1
    over a ten minute window.

    Args:
        min: Minimum instance count of the profile.
        max: Maximum instance count; also used as the default capacity.
        queue_uri: Resource URI of the queue whose metric triggers scaling.

    Returns:
        An ``AutoscaleProfile`` with a freshly generated UUID name.
    """
    return AutoscaleProfile(
        name=str(uuid.uuid4()),
        capacity=ScaleCapacity(minimum=min, maximum=max, default=max),
        rules=[
            ScaleRule(
                metric_trigger=MetricTrigger(
                    metric_name="ApproximateMessageCount",
                    metric_resource_uri=queue_uri,
                    # Check every minute
                    time_grain=timedelta(minutes=1),
                    # The average number of messages in the pool queue
                    time_aggregation=TimeAggregationType.AVERAGE,
                    statistic=MetricStatisticType.COUNT,
                    # Over the past 10 minutes
                    time_window=timedelta(minutes=10),
                    # When there's more than 1 message in the pool queue
                    operator=ComparisonOperationType.GREATER_THAN,
                    threshold=1,
                ),
                scale_action=ScaleAction(
                    direction=ScaleDirection.INCREASE,
                    type=ScaleType.CHANGE_COUNT,
                    value=1,
                    cooldown=timedelta(minutes=5),
                ),
            )
        ],
    )
def get_resource_id(queue_name: QueueNameType, storage_type: StorageType) -> str:
    """Return the resource URI of a storage queue.

    The URI is whatever ``get_primary_account`` returns for ``storage_type``
    followed by ``/services/queue/queues/<queue_name>``.
    """
    return "%s/services/queue/queues/%s" % (
        get_primary_account(storage_type),
        queue_name,
    )
def try_to_enable_auto_scaling(self) -> Optional[Error]:
    """Install an autoscale setting for this scaleset, driven by its pool queue.

    The profile uses the scaleset's current size as both minimum and
    maximum, and the pool queue of ``self.pool_name`` as the metric source.

    Returns:
        ``None`` on success, otherwise the ``Error`` from the step that
        failed (pool lookup, capacity lookup or autoscale creation).
    """
    # Imported here rather than at module level, as in the original code.
    from .pools import Pool

    logging.info("Trying to add auto scaling for scaleset %s", self.scaleset_id)

    pool = Pool.get_by_name(self.pool_name)
    if isinstance(pool, Error):
        logging.error(
            "Failed to get pool by name: %s error: %s", self.pool_name, pool
        )
        return pool

    pool_queue_id = pool.get_pool_queue()
    pool_queue_uri = get_resource_id(pool_queue_id, StorageType.corpus)
    capacity = get_vmss_size(self.scaleset_id)
    if capacity is None:
        capacity_failed = Error(
            code=ErrorCode.UNABLE_TO_FIND,
            errors=["Failed to get capacity for scaleset %s" % self.scaleset_id],
        )
        logging.error(capacity_failed)
        return capacity_failed

    auto_scale_profile = create_auto_scale_profile(
        capacity, capacity, pool_queue_uri
    )
    result = add_auto_scale_to_vmss(self.scaleset_id, auto_scale_profile)
    if isinstance(result, Error):
        logging.error(
            "Failed to add auto scale resource to scaleset %s: %s",
            self.scaleset_id,
            result,
        )
        return result

    # Report success only once the call has actually succeeded.
    logging.info("Added auto scale resource to scaleset: %s", self.scaleset_id)
    return None
variables('OneFuzz Deployment'))]", + "principalId": "[reference(resourceId('Microsoft.Web/sites', parameters('name')), '2018-02-01', 'Full').identity.principalId]" + }, + "DependsOn": [ + "[resourceId('Microsoft.Web/sites', parameters('name'))]" + ], + "tags": { + "OWNER": "[parameters('owner')]" + } + }, { "type": "Microsoft.Authorization/roleAssignments", "apiVersion": "2018-07-01",