add missing scaleset nodes (#518)

This commit is contained in:
bmc-msft
2021-02-08 13:50:08 -05:00
committed by GitHub
parent 19655b50ae
commit 8c9f65c0be
2 changed files with 42 additions and 7 deletions

View File

@ -49,6 +49,7 @@ class Node(BASE_NODE, ORMMixin):
machine_id: UUID,
scaleset_id: Optional[UUID],
version: str,
new: bool = False,
) -> "Node":
node = cls(
pool_name=pool_name,
@ -56,14 +57,18 @@ class Node(BASE_NODE, ORMMixin):
scaleset_id=scaleset_id,
version=version,
)
node.save()
send_event(
EventNodeCreated(
machine_id=node.machine_id,
scaleset_id=node.scaleset_id,
pool_name=node.pool_name,
# `save` returns None if it's successfully saved. If `new` is set to
# True, `save` returns an Error if an object already exists. As such,
# only send an event if result is None
result = node.save(new=new)
if result is None:
send_event(
EventNodeCreated(
machine_id=node.machine_id,
scaleset_id=node.scaleset_id,
pool_name=node.pool_name,
)
)
)
return node
@classmethod

View File

@ -20,6 +20,7 @@ from onefuzztypes.models import ScalesetNodeState
from onefuzztypes.primitives import PoolName, Region
from pydantic import BaseModel, Field
from ..__version__ import __version__
from ..azure.auth import build_auth
from ..azure.image import get_os
from ..azure.network import Network
@ -300,6 +301,35 @@ class Scaleset(BASE_SCALESET, ORMMixin):
)
node.delete()
# Scalesets can have nodes that never check in (for example, because of
# broken OS setup scripts).
#
# This adds nodes that Azure knows about but that have not checked in, so
# that the `dead node` detection will eventually reimage them.
#
# NOTE: If node setup takes longer than NODE_EXPIRATION_TIME (1 hour),
# this will cause the nodes to continuously get reimaged.
node_machine_ids = [x.machine_id for x in nodes]
for machine_id in azure_nodes:
if machine_id in node_machine_ids:
continue
logging.info(
"scaleset - adding missing azure node: %s:%s",
self.scaleset_id,
machine_id,
)
# Note: using `new=True` ensures that if a node has already checked in,
# this won't overwrite it.
Node.create(
pool_name=self.pool_name,
machine_id=machine_id,
scaleset_id=self.scaleset_id,
version=__version__,
new=True,
)
existing_nodes = [x for x in nodes if x.machine_id in azure_nodes]
nodes_to_reset = [
x for x in existing_nodes if x.state in NodeState.ready_for_reset()