add missing scaleset nodes (#518)

This commit is contained in:
bmc-msft
2021-02-08 13:50:08 -05:00
committed by GitHub
parent 19655b50ae
commit 8c9f65c0be
2 changed files with 42 additions and 7 deletions

View File

@ -49,6 +49,7 @@ class Node(BASE_NODE, ORMMixin):
machine_id: UUID, machine_id: UUID,
scaleset_id: Optional[UUID], scaleset_id: Optional[UUID],
version: str, version: str,
new: bool = False,
) -> "Node": ) -> "Node":
node = cls( node = cls(
pool_name=pool_name, pool_name=pool_name,
@ -56,14 +57,18 @@ class Node(BASE_NODE, ORMMixin):
scaleset_id=scaleset_id, scaleset_id=scaleset_id,
version=version, version=version,
) )
node.save() # `save` returns None if it's successfully saved. If `new` is set to
send_event( # True, `save` returns an Error if an object already exists. As such,
EventNodeCreated( # only send an event if result is None
machine_id=node.machine_id, result = node.save(new=new)
scaleset_id=node.scaleset_id, if result is None:
pool_name=node.pool_name, send_event(
EventNodeCreated(
machine_id=node.machine_id,
scaleset_id=node.scaleset_id,
pool_name=node.pool_name,
)
) )
)
return node return node
@classmethod @classmethod

View File

@ -20,6 +20,7 @@ from onefuzztypes.models import ScalesetNodeState
from onefuzztypes.primitives import PoolName, Region from onefuzztypes.primitives import PoolName, Region
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from ..__version__ import __version__
from ..azure.auth import build_auth from ..azure.auth import build_auth
from ..azure.image import get_os from ..azure.image import get_os
from ..azure.network import Network from ..azure.network import Network
@ -300,6 +301,35 @@ class Scaleset(BASE_SCALESET, ORMMixin):
) )
node.delete() node.delete()
# Scalesets can have nodes that never check in (for example, because of
# broken OS setup scripts).
#
# This adds nodes that Azure knows about but that have not checked in, so
# that the `dead node` detection will eventually reimage them.
#
# NOTE: If node setup takes longer than NODE_EXPIRATION_TIME (1 hour),
# this will cause the nodes to continuously get reimaged.
node_machine_ids = [x.machine_id for x in nodes]
for machine_id in azure_nodes:
if machine_id in node_machine_ids:
continue
logging.info(
"scaleset - adding missing azure node: %s:%s",
self.scaleset_id,
machine_id,
)
# NOTE: using `new=True` ensures that if a node has already checked in,
# this won't overwrite it.
Node.create(
pool_name=self.pool_name,
machine_id=machine_id,
scaleset_id=self.scaleset_id,
version=__version__,
new=True,
)
existing_nodes = [x for x in nodes if x.machine_id in azure_nodes] existing_nodes = [x for x in nodes if x.machine_id in azure_nodes]
nodes_to_reset = [ nodes_to_reset = [
x for x in existing_nodes if x.state in NodeState.ready_for_reset() x for x in existing_nodes if x.state in NodeState.ready_for_reset()