From 71d24d6e3366b28e1466e99322d58420b5ebbb67 Mon Sep 17 00:00:00 2001 From: Christina W Date: Tue, 20 Jun 2023 15:45:04 -0700 Subject: [PATCH] Parse container exit error message instead of status The previous implementation in #2170 of parsing the container status was too general, because it relied on the mistaken assumption that a container would have a status of `Stopped` if it was manually stopped. This turned out to be untrue, as manually stopped containers were also getting restarted by the Supervisor due to their inspect status of `exited`. With this, parsing the exit message became unavoidable as there are no other clear ways to discern a container that has been manually stopped and shouldn't be started from a container experiencing the Engine-host race condition issue (again, see #2170). Since we're just parsing the exit error message, we don't need to worry about different behaviors amongst restart policies, as any container with the error message on exit should be started. Change-type: patch Closes: #2178 Signed-off-by: Christina Ying Wang --- src/compose/app.ts | 34 +- src/compose/service.ts | 4 +- src/compose/utils.ts | 19 - .../compose/application-manager.spec.ts | 637 ++++-------------- test/unit/compose/utils.spec.ts | 29 - 5 files changed, 157 insertions(+), 566 deletions(-) diff --git a/src/compose/app.ts b/src/compose/app.ts index 31956936..cfb3b7b1 100644 --- a/src/compose/app.ts +++ b/src/compose/app.ts @@ -12,7 +12,6 @@ import { generateStep, CompositionStepAction, } from './composition-steps'; -import { isValidDateAndOlderThan } from './utils'; import * as targetStateCache from '../device-state/target-state-cache'; import { getNetworkGateway } from '../lib/docker-utils'; import * as constants from '../lib/constants'; @@ -26,8 +25,6 @@ import { ServiceComposeConfig, DeviceMetadata } from './types/service'; import { pathExistsOnRoot } from '../lib/host-utils'; import { isSupervisor } from '../lib/supervisor-metadata'; -const SECONDS_TO_WAIT_FOR_START = 30; - export interface AppConstructOpts { appId: number; appUuid?: string; @@ -643,25 +640,20 @@ export class App { // In the case where the Engine does not start the container despite the // restart policy (this can happen in cases of Engine race conditions with // host resources that are slower to be created but that a service relies on), - // we need to start the container after a delay. + // we need to start the container after a delay. The error message is parsed in this case. private requirementsMetForSpecialStart( current: Service, target: Service, ): boolean { - if ( - current.status !== 'exited' || - current.config.running !== false || - target.config.running !== true - ) { - return false; - } - const restartIsAlwaysOrUnlessStopped = [ - 'always', - 'unless-stopped', - ].includes(target.config.restart); - const restartIsOnFailureWithNonZeroExit = - target.config.restart === 'on-failure' && current.exitCode !== 0; - return restartIsAlwaysOrUnlessStopped || restartIsOnFailureWithNonZeroExit; + const hostRaceErrorRegex = new RegExp( + /userland proxy.*cannot assign requested address$/i, + ); + return ( + current.status === 'exited' && + current.config.running === false && + target.config.running === true && + hostRaceErrorRegex.test(current.exitErrorMessage ?? '') + ); } private generateContainerStep(current: Service, target: Service) { @@ -669,12 +661,6 @@ export class App { if (current.commit !== target.commit) { return generateStep('updateMetadata', { current, target }); } else if (target.config.running !== current.config.running) { - if ( - this.requirementsMetForSpecialStart(current, target) && - !isValidDateAndOlderThan(current.createdAt, SECONDS_TO_WAIT_FOR_START) - ) { - return generateStep('noop', {}); - } if (target.config.running) { return generateStep('start', { target }); } else { diff --git a/src/compose/service.ts b/src/compose/service.ts index 3906ccad..0e2207e2 100644 --- a/src/compose/service.ts +++ b/src/compose/service.ts @@ -61,7 +61,7 @@ export class Service { public serviceId: number; public imageName: string | null; public containerId: string | null; - public exitCode: number | null; + public exitErrorMessage: string | null; public dependsOn: string[] | null; @@ -504,7 +504,7 @@ export class Service { svc.createdAt = new Date(container.Created); svc.containerId = container.Id; - svc.exitCode = container.State.ExitCode; + svc.exitErrorMessage = container.State.Error; let hostname = container.Config.Hostname; if (hostname.length === 12 && _.startsWith(container.Id, hostname)) { diff --git a/src/compose/utils.ts b/src/compose/utils.ts index 786dae15..1b5d89f4 100644 --- a/src/compose/utils.ts +++ b/src/compose/utils.ts @@ -693,22 +693,3 @@ export function dockerMountToServiceMount( return mount as LongDefinition; } - -function isDate(d: unknown): d is Date { - return d instanceof Date; -} - -/** - * @param currentTime Date instance - * @param seconds time in seconds to check against currentTime - * @returns - */ -export function isValidDateAndOlderThan( - currentTime: unknown, - seconds: number, -): boolean { - if (!isDate(currentTime)) { - return false; - } - return new Date().getTime() - currentTime.getTime() > seconds * 1000; -} diff --git a/test/integration/compose/application-manager.spec.ts b/test/integration/compose/application-manager.spec.ts index 745521fd..3c7c38fe 100644 --- a/test/integration/compose/application-manager.spec.ts +++ b/test/integration/compose/application-manager.spec.ts @@ -16,6 +16,7 @@ import { createCurrentState, DEFAULT_NETWORK, } from '~/test-lib/state-helper'; +import { InstancedAppState } from '~/src/types'; // TODO: application manager inferNextSteps still queries some stuff from // the engine instead of receiving that information as parameter. Refactoring @@ -1421,36 +1422,135 @@ describe('compose/application-manager', () => { // In the case where a container requires a host resource such as a network interface that is not created by the time the Engine // comes up, the Engine will not attempt to restart the container which seems to be Docker's implemented behavior (if not the correct behavior). // An example of a host resource would be a port binding such as 192.168.88.1:3000:3000, where the IP is an interface delayed in creation by host. - // In this case, the Supervisor needs to wait a grace period for the Engine to start the container, and if this does not occur, the Supervisor - // deduces the existence of this race condition and generates another start step after a delay (SECONDS_TO_WAIT_FOR_START). + // In this case, the Supervisor parses the exit message of the container, and if it matches the error regex, start the container. describe('handling Engine restart policy inaction when host resource required by container is delayed in creation', () => { - const SECONDS_TO_WAIT_FOR_START = 30; - const newer = SECONDS_TO_WAIT_FOR_START - 2; - const older = SECONDS_TO_WAIT_FOR_START + 2; - - const getTimeNSecondsAgo = (date: Date, seconds: number) => { - const newDate = new Date(date); - newDate.setSeconds(newDate.getSeconds() - seconds); - return newDate; - }; - - it('should not infer any steps for a service with a status of "exited" if restart policy is "no"', async () => { - // Conditions: - // - restart: "no" - // - status: "exited" - const targetApps = createApps( - { - services: [ - await createService({ - image: 'image-1', + const getCurrentState = async (withHostError: boolean) => { + const exitErrorMessage = withHostError + ? 'driver failed programming external connectivity on endpoint one_1_1_deadbeef (deadca1f): Error starting userland proxy: listen tcp4 192.168.88.1:8081: bind: cannot assign requested address' + : 'My test error'; + return createCurrentState({ + services: [ + await createService( + { + image: 'test-image', serviceName: 'one', + running: false, + composition: { + restart: 'always', + }, + }, + { + state: { + status: 'exited', + exitErrorMessage, + }, + }, + ), + await createService( + { + image: 'test-image', + serviceName: 'two', + running: false, + composition: { + restart: 'unless-stopped', + }, + }, + { + state: { + status: 'exited', + exitErrorMessage, + }, + }, + ), + await createService( + { + image: 'test-image', + serviceName: 'three', + running: false, + composition: { + restart: 'on-failure', + }, + }, + { + state: { + status: 'exited', + exitErrorMessage, + }, + }, + ), + await createService( + { + image: 'test-image', + serviceName: 'four', + running: false, composition: { restart: 'no', }, + }, + { + state: { + status: 'exited', + exitErrorMessage, + }, + }, + ), + ], + networks: [DEFAULT_NETWORK], + images: [ + createImage({ + name: 'test-image', + serviceName: 'one', + }), + createImage({ + name: 'test-image', + serviceName: 'two', + }), + createImage({ + name: 'test-image', + serviceName: 'three', + }), + createImage({ + name: 'test-image', + serviceName: 'four', + }), + ], + }); + }; + + let targetApps: InstancedAppState; + + before(async () => { + targetApps = createApps( + { + services: [ + await createService({ + image: 'test-image', + serviceName: 'one', + running: true, + composition: { + restart: 'always', + }, }), await createService({ - image: 'image-2', + image: 'test-image', serviceName: 'two', + running: true, + composition: { + restart: 'unless-stopped', + }, + }), + await createService({ + image: 'test-image', + serviceName: 'three', + running: true, + composition: { + restart: 'on-failure', + }, + }), + await createService({ + image: 'test-image', + serviceName: 'four', + running: true, composition: { restart: 'no', }, @@ -1460,54 +1560,33 @@ describe('compose/application-manager', () => { }, true, ); + }); + it('should infer a start step for a service that exited with the "userland proxy" error for all restart policies', async () => { const { currentApps, availableImages, downloading, containerIdsByAppId } = - createCurrentState({ - services: [ - await createService( - { - image: 'image-1', - serviceName: 'one', - composition: { - restart: 'no', - }, - }, - { - state: { - status: 'exited', - createdAt: getTimeNSecondsAgo(new Date(), newer), - }, - }, - ), - await createService( - { - image: 'image-2', - serviceName: 'two', - composition: { - restart: 'no', - }, - }, - { - state: { - status: 'exited', - createdAt: getTimeNSecondsAgo(new Date(), older), - }, - }, - ), - ], - networks: [DEFAULT_NETWORK], - images: [ - createImage({ - name: 'image-1', - serviceName: 'one', - }), - createImage({ - name: 'image-2', - serviceName: 'two', - }), - ], + await getCurrentState(true); + + const [startStep1, startStep2, startStep3, startStep4, ...nextSteps] = + await applicationManager.inferNextSteps(currentApps, targetApps, { + downloading, + availableImages, + containerIdsByAppId, }); + [startStep1, startStep2, startStep3, startStep4].forEach((step) => { + expect(step).to.have.property('action').that.equals('start'); + expect(step) + .to.have.property('target') + .that.has.property('serviceName') + .that.is.oneOf(['one', 'two', 'three', 'four']); + }); + expect(nextSteps).to.have.lengthOf(0); + }); + + it('should not infer any steps for a service with a status of "exited" without the "userland proxy" error message', async () => { + const { currentApps, availableImages, downloading, containerIdsByAppId } = + await getCurrentState(false); + const [...steps] = await applicationManager.inferNextSteps( currentApps, targetApps, @@ -1520,431 +1599,5 @@ describe('compose/application-manager', () => { expect(steps).to.have.lengthOf(0); }); - - describe('restart policy "on-failure"', () => { - it('should not infer any steps for a service that exited with code 0', async () => { - // Conditions: - // - restart: "on-failure" - // - status: "exited" - // - exitCode: 0 - const targetApps = createApps( - { - services: [ - await createService({ - image: 'image-1', - serviceName: 'one', - composition: { - restart: 'on-failure', - }, - }), - await createService({ - image: 'image-2', - serviceName: 'two', - composition: { - restart: 'on-failure', - }, - }), - ], - networks: [DEFAULT_NETWORK], - }, - true, - ); - - const { - currentApps, - availableImages, - downloading, - containerIdsByAppId, - } = createCurrentState({ - services: [ - await createService( - { - image: 'image-1', - serviceName: 'one', - running: false, - composition: { - restart: 'on-failure', - }, - }, - { - state: { - status: 'exited', - exitCode: 0, - createdAt: getTimeNSecondsAgo(new Date(), newer), - }, - }, - ), - await createService( - { - image: 'image-2', - serviceName: 'two', - running: false, - composition: { - restart: 'on-failure', - }, - }, - { - state: { - status: 'exited', - exitCode: 0, - createdAt: getTimeNSecondsAgo(new Date(), older), - }, - }, - ), - ], - networks: [DEFAULT_NETWORK], - images: [ - createImage({ - name: 'image-1', - serviceName: 'one', - }), - createImage({ - name: 'image-2', - serviceName: 'two', - }), - ], - }); - - const [...steps] = await applicationManager.inferNextSteps( - currentApps, - targetApps, - { - downloading, - availableImages, - containerIdsByAppId, - }, - ); - - expect(steps).to.have.lengthOf(0); - }); - - it('should infer a noop step for a service that was created <= SECONDS_TO_WAIT_FOR_START ago and exited non-zero', async () => { - // Conditions: - // - restart: "on-failure" - // - status: "exited" - // - exitCode: non-zero - // - createdAt: <= SECONDS_TO_WAIT_FOR_START - const targetApps = createApps( - { - services: [ - await createService({ - image: 'image-1', - serviceName: 'one', - composition: { - restart: 'on-failure', - }, - }), - ], - networks: [DEFAULT_NETWORK], - }, - true, - ); - - const { - currentApps, - availableImages, - downloading, - containerIdsByAppId, - } = createCurrentState({ - services: [ - await createService( - { - image: 'image-1', - serviceName: 'one', - running: false, - composition: { - restart: 'on-failure', - }, - }, - { - state: { - status: 'exited', - exitCode: 1, - createdAt: getTimeNSecondsAgo(new Date(), newer), - }, - }, - ), - ], - networks: [DEFAULT_NETWORK], - images: [ - createImage({ - name: 'image-1', - serviceName: 'one', - }), - ], - }); - - const [noopStep, ...nextSteps] = - await applicationManager.inferNextSteps(currentApps, targetApps, { - downloading, - availableImages, - containerIdsByAppId, - }); - - expect(noopStep).to.have.property('action').that.equals('noop'); - expect(nextSteps).to.have.lengthOf(0); - }); - - it('should infer a start step for a service that was created > SECONDS_TO_WAIT_FOR_START ago and exited non-zero', async () => { - // Conditions: - // - restart: "on-failure" - // - status: "exited" - // - exitCode: non-zero\ - // - createdAt: > SECONDS_TO_WAIT_FOR_START - const targetApps = createApps( - { - services: [ - await createService({ - image: 'image-1', - serviceName: 'one', - composition: { - restart: 'on-failure', - }, - }), - ], - networks: [DEFAULT_NETWORK], - }, - true, - ); - - const { - currentApps, - availableImages, - downloading, - containerIdsByAppId, - } = createCurrentState({ - services: [ - await createService( - { - image: 'image-1', - serviceName: 'one', - running: false, - composition: { - restart: 'on-failure', - }, - }, - { - state: { - status: 'exited', - exitCode: 1, - createdAt: getTimeNSecondsAgo(new Date(), older), - }, - }, - ), - ], - networks: [DEFAULT_NETWORK], - images: [ - createImage({ - name: 'image-1', - serviceName: 'one', - }), - ], - }); - - const [startStep, ...nextSteps] = - await applicationManager.inferNextSteps(currentApps, targetApps, { - downloading, - availableImages, - containerIdsByAppId, - }); - - expect(startStep).to.have.property('action').that.equals('start'); - expect(nextSteps).to.have.lengthOf(0); - }); - }); - - describe('restart policy "always" or "unless-stopped"', () => { - it('should infer a noop step for a service that was created <= SECONDS_TO_WAIT_FOR_START ago with status of "exited"', async () => { - // Conditions: - // - restart: "always" || "unless-stopped" - // - status: "exited" - // - createdAt: <= SECONDS_TO_WAIT_FOR_START ago - const targetApps = createApps( - { - services: [ - await createService({ - image: 'image-1', - serviceName: 'one', - composition: { - restart: 'always', - }, - }), - await createService({ - image: 'image-2', - serviceName: 'two', - composition: { - restart: 'unless-stopped', - }, - }), - ], - networks: [DEFAULT_NETWORK], - }, - true, - ); - - const { - currentApps, - availableImages, - downloading, - containerIdsByAppId, - } = createCurrentState({ - services: [ - await createService( - { - image: 'image-1', - serviceName: 'one', - running: false, - composition: { - restart: 'always', - }, - }, - { - state: { - status: 'exited', - createdAt: getTimeNSecondsAgo(new Date(), newer), - }, - }, - ), - await createService( - { - image: 'image-2', - serviceName: 'two', - running: false, - composition: { - restart: 'unless-stopped', - }, - }, - { - state: { - status: 'exited', - createdAt: getTimeNSecondsAgo(new Date(), newer), - }, - }, - ), - ], - networks: [DEFAULT_NETWORK], - images: [ - createImage({ - name: 'image-1', - serviceName: 'one', - }), - createImage({ - name: 'image-2', - serviceName: 'two', - }), - ], - }); - - const [noopStep1, noopStep2, ...nextSteps] = - await applicationManager.inferNextSteps(currentApps, targetApps, { - downloading, - availableImages, - containerIdsByAppId, - }); - - expect(noopStep1).to.have.property('action').that.equals('noop'); - expect(noopStep2).to.have.property('action').that.equals('noop'); - - expect(nextSteps).to.have.lengthOf(0); - }); - - it('should infer a start step for a service that was created > SECONDS_TO_WAIT_FOR_START ago with status of "exited"', async () => { - // Conditions: - // - restart: "always" || "unless-stopped" - // - status: "exited" - // - createdAt: > SECONDS_TO_WAIT_FOR_START ago - const targetApps = createApps( - { - services: [ - await createService({ - image: 'image-1', - serviceName: 'one', - composition: { - restart: 'always', - }, - }), - await createService({ - image: 'image-2', - serviceName: 'two', - composition: { - restart: 'unless-stopped', - }, - }), - ], - networks: [DEFAULT_NETWORK], - }, - true, - ); - - const { - currentApps, - availableImages, - downloading, - containerIdsByAppId, - } = createCurrentState({ - services: [ - await createService( - { - image: 'image-1', - serviceName: 'one', - running: false, - composition: { - restart: 'always', - }, - }, - { - state: { - status: 'exited', - createdAt: getTimeNSecondsAgo(new Date(), older), - }, - }, - ), - await createService( - { - image: 'image-2', - serviceName: 'two', - running: false, - composition: { - restart: 'unless-stopped', - }, - }, - { - state: { - status: 'exited', - createdAt: getTimeNSecondsAgo(new Date(), older), - }, - }, - ), - ], - networks: [DEFAULT_NETWORK], - images: [ - createImage({ - name: 'image-1', - serviceName: 'one', - }), - createImage({ - name: 'image-2', - serviceName: 'two', - }), - ], - }); - - const [startStep1, startStep2, ...nextSteps] = - await applicationManager.inferNextSteps(currentApps, targetApps, { - downloading, - availableImages, - containerIdsByAppId, - }); - - [startStep1, startStep2].forEach((step) => { - expect(step).to.have.property('action').that.equals('start'); - expect(step) - .to.have.property('target') - .that.has.property('serviceName') - .that.is.oneOf(['one', 'two']); - }); - expect(nextSteps).to.have.lengthOf(0); - }); - }); }); }); diff --git a/test/unit/compose/utils.spec.ts b/test/unit/compose/utils.spec.ts index 2d78f530..3fc20b77 100644 --- a/test/unit/compose/utils.spec.ts +++ b/test/unit/compose/utils.spec.ts @@ -11,33 +11,4 @@ describe('compose/utils', () => { networks: ['test', 'test2'], }); }); - - it('should return whether a date is valid and older than an interval of seconds', () => { - const now = new Date(Date.now()); - expect(ComposeUtils.isValidDateAndOlderThan(now, 60)).to.equal(false); - const time59SecondsAgo = new Date(Date.now() - 59 * 1000); - expect(ComposeUtils.isValidDateAndOlderThan(time59SecondsAgo, 60)).to.equal( - false, - ); - const time61SecondsAgo = new Date(Date.now() - 61 * 1000); - expect(ComposeUtils.isValidDateAndOlderThan(time61SecondsAgo, 60)).to.equal( - true, - ); - - const notDates = [ - null, - undefined, - '', - 'test', - 123, - 0, - -1, - Infinity, - NaN, - {}, - ]; - notDates.forEach((n) => { - expect(ComposeUtils.isValidDateAndOlderThan(n, 0)).to.equal(false); - }); - }); });