Reimage dead nodes (#154)

Cheick Keita 2020-10-20 13:58:02 -07:00 committed by GitHub
parent e5f723e9c9
commit 041c6ae130

@@ -62,6 +62,8 @@ from .azure.vmss import (
 from .extension import fuzz_extensions
 from .orm import MappingIntStrAny, ORMMixin, QueryFilter
 
+NODE_EXPIRATION_TIME: datetime.timedelta = datetime.timedelta(hours=1)
+
 # Future work:
 #
 # Enabling autoscaling for the scalesets based on the pool work queues.
@@ -278,6 +280,18 @@ class Node(BASE_NODE, ORMMixin):
         self.set_shutdown()
         self.stop()
 
+    @classmethod
+    def get_dead_nodes(
+        cls, scaleset_id: UUID, expiration_period: datetime.timedelta
+    ) -> List["Node"]:
+        time_filter = "heartbeat lt datetime'%s'" % (
+            (datetime.datetime.utcnow() - expiration_period).isoformat()
+        )
+        return cls.search(
+            query={"scaleset_id": [scaleset_id]},
+            raw_unchecked_filter=time_filter,
+        )
+
 
 class NodeTasks(BASE_NODE_TASK, ORMMixin):
     @classmethod
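
For reference, a minimal standalone sketch of the filter string that get_dead_nodes passes as raw_unchecked_filter. It uses only the standard library; the timestamp shown in the comment is illustrative, and the OData-style "heartbeat lt datetime'...'" syntax is taken directly from the hunk above.

import datetime

# Build the same heartbeat cutoff expression as Node.get_dead_nodes,
# outside of the ORM, purely for illustration.
expiration_period = datetime.timedelta(hours=1)
cutoff = (datetime.datetime.utcnow() - expiration_period).isoformat()
time_filter = "heartbeat lt datetime'%s'" % cutoff
print(time_filter)
# e.g. heartbeat lt datetime'2020-10-20T19:58:02.123456'
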
@@ -743,6 +757,11 @@ class Scaleset(BASE_SCALESET, ORMMixin):
                 # only add nodes that are not already set to reschedule
                 to_reimage.append(node)
 
+        dead_nodes = Node.get_dead_nodes(self.scaleset_id, NODE_EXPIRATION_TIME)
+        for node in dead_nodes:
+            node.set_halt()
+            to_reimage.append(node)
+
         # Perform operations until they fail due to scaleset getting locked
         try:
             if to_delete:
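
A self-contained sketch of the cleanup flow the last hunk adds: any node whose heartbeat is older than NODE_EXPIRATION_TIME is halted and queued for reimage. FakeNode and dead_nodes are hypothetical stand-ins for the real ORM-backed Node and Node.get_dead_nodes, used here only so the example runs on its own.

import datetime
from dataclasses import dataclass
from typing import List

NODE_EXPIRATION_TIME = datetime.timedelta(hours=1)

@dataclass
class FakeNode:
    # Stand-in for the ORM-backed Node; only the fields this sketch needs.
    machine_id: str
    heartbeat: datetime.datetime
    halted: bool = False

    def set_halt(self) -> None:
        self.halted = True

def dead_nodes(nodes: List[FakeNode], expiration: datetime.timedelta) -> List[FakeNode]:
    # Equivalent of the heartbeat filter: keep nodes whose last heartbeat
    # falls before the cutoff.
    cutoff = datetime.datetime.utcnow() - expiration
    return [n for n in nodes if n.heartbeat < cutoff]

# Mirrors the loop added to the scaleset cleanup path.
now = datetime.datetime.utcnow()
nodes = [
    FakeNode("fresh", heartbeat=now),
    FakeNode("stale", heartbeat=now - datetime.timedelta(hours=2)),
]
to_reimage: List[FakeNode] = []
for node in dead_nodes(nodes, NODE_EXPIRATION_TIME):
    node.set_halt()
    to_reimage.append(node)

assert [n.machine_id for n in to_reimage] == ["stale"]
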