Merge pull request #2179 from balena-os/parse-exit-message-for-engine-host-race

Parse container exit error message instead of status to fix Engine-host race
This commit is contained in:
flowzone-app[bot] 2023-06-23 18:45:27 +00:00 committed by GitHub
commit a0effca2c2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 157 additions and 566 deletions

View File

@ -12,7 +12,6 @@ import {
generateStep,
CompositionStepAction,
} from './composition-steps';
import { isValidDateAndOlderThan } from './utils';
import * as targetStateCache from '../device-state/target-state-cache';
import { getNetworkGateway } from '../lib/docker-utils';
import * as constants from '../lib/constants';
@ -26,8 +25,6 @@ import { ServiceComposeConfig, DeviceMetadata } from './types/service';
import { pathExistsOnRoot } from '../lib/host-utils';
import { isSupervisor } from '../lib/supervisor-metadata';
const SECONDS_TO_WAIT_FOR_START = 30;
export interface AppConstructOpts {
appId: number;
appUuid?: string;
@ -643,25 +640,20 @@ export class App {
// In the case where the Engine does not start the container despite the
// restart policy (this can happen in cases of Engine race conditions with
// host resources that are slower to be created but that a service relies on),
// we need to start the container after a delay.
// we need to start the container after a delay. The error message is parsed in this case.
private requirementsMetForSpecialStart(
current: Service,
target: Service,
): boolean {
if (
current.status !== 'exited' ||
current.config.running !== false ||
target.config.running !== true
) {
return false;
}
const restartIsAlwaysOrUnlessStopped = [
'always',
'unless-stopped',
].includes(target.config.restart);
const restartIsOnFailureWithNonZeroExit =
target.config.restart === 'on-failure' && current.exitCode !== 0;
return restartIsAlwaysOrUnlessStopped || restartIsOnFailureWithNonZeroExit;
const hostRaceErrorRegex = new RegExp(
/userland proxy.*cannot assign requested address$/i,
);
return (
current.status === 'exited' &&
current.config.running === false &&
target.config.running === true &&
hostRaceErrorRegex.test(current.exitErrorMessage ?? '')
);
}
private generateContainerStep(current: Service, target: Service) {
@ -669,12 +661,6 @@ export class App {
if (current.commit !== target.commit) {
return generateStep('updateMetadata', { current, target });
} else if (target.config.running !== current.config.running) {
if (
this.requirementsMetForSpecialStart(current, target) &&
!isValidDateAndOlderThan(current.createdAt, SECONDS_TO_WAIT_FOR_START)
) {
return generateStep('noop', {});
}
if (target.config.running) {
return generateStep('start', { target });
} else {

View File

@ -61,7 +61,7 @@ export class Service {
public serviceId: number;
public imageName: string | null;
public containerId: string | null;
public exitCode: number | null;
public exitErrorMessage: string | null;
public dependsOn: string[] | null;
@ -504,7 +504,7 @@ export class Service {
svc.createdAt = new Date(container.Created);
svc.containerId = container.Id;
svc.exitCode = container.State.ExitCode;
svc.exitErrorMessage = container.State.Error;
let hostname = container.Config.Hostname;
if (hostname.length === 12 && _.startsWith(container.Id, hostname)) {

View File

@ -693,22 +693,3 @@ export function dockerMountToServiceMount(
return mount as LongDefinition;
}
function isDate(d: unknown): d is Date {
return d instanceof Date;
}
/**
* @param currentTime Date instance
* @param seconds time in seconds to check against currentTime
* @returns
*/
export function isValidDateAndOlderThan(
currentTime: unknown,
seconds: number,
): boolean {
if (!isDate(currentTime)) {
return false;
}
return new Date().getTime() - currentTime.getTime() > seconds * 1000;
}

View File

@ -16,6 +16,7 @@ import {
createCurrentState,
DEFAULT_NETWORK,
} from '~/test-lib/state-helper';
import { InstancedAppState } from '~/src/types';
// TODO: application manager inferNextSteps still queries some stuff from
// the engine instead of receiving that information as parameter. Refactoring
@ -1421,36 +1422,135 @@ describe('compose/application-manager', () => {
// In the case where a container requires a host resource such as a network interface that is not created by the time the Engine
// comes up, the Engine will not attempt to restart the container which seems to be Docker's implemented behavior (if not the correct behavior).
// An example of a host resource would be a port binding such as 192.168.88.1:3000:3000, where the IP is an interface delayed in creation by host.
// In this case, the Supervisor needs to wait a grace period for the Engine to start the container, and if this does not occur, the Supervisor
// deduces the existence of this race condition and generates another start step after a delay (SECONDS_TO_WAIT_FOR_START).
// In this case, the Supervisor parses the exit message of the container, and if it matches the error regex, start the container.
describe('handling Engine restart policy inaction when host resource required by container is delayed in creation', () => {
const SECONDS_TO_WAIT_FOR_START = 30;
const newer = SECONDS_TO_WAIT_FOR_START - 2;
const older = SECONDS_TO_WAIT_FOR_START + 2;
const getTimeNSecondsAgo = (date: Date, seconds: number) => {
const newDate = new Date(date);
newDate.setSeconds(newDate.getSeconds() - seconds);
return newDate;
};
it('should not infer any steps for a service with a status of "exited" if restart policy is "no"', async () => {
// Conditions:
// - restart: "no"
// - status: "exited"
const targetApps = createApps(
{
services: [
await createService({
image: 'image-1',
const getCurrentState = async (withHostError: boolean) => {
const exitErrorMessage = withHostError
? 'driver failed programming external connectivity on endpoint one_1_1_deadbeef (deadca1f): Error starting userland proxy: listen tcp4 192.168.88.1:8081: bind: cannot assign requested address'
: 'My test error';
return createCurrentState({
services: [
await createService(
{
image: 'test-image',
serviceName: 'one',
running: false,
composition: {
restart: 'always',
},
},
{
state: {
status: 'exited',
exitErrorMessage,
},
},
),
await createService(
{
image: 'test-image',
serviceName: 'two',
running: false,
composition: {
restart: 'unless-stopped',
},
},
{
state: {
status: 'exited',
exitErrorMessage,
},
},
),
await createService(
{
image: 'test-image',
serviceName: 'three',
running: false,
composition: {
restart: 'on-failure',
},
},
{
state: {
status: 'exited',
exitErrorMessage,
},
},
),
await createService(
{
image: 'test-image',
serviceName: 'four',
running: false,
composition: {
restart: 'no',
},
},
{
state: {
status: 'exited',
exitErrorMessage,
},
},
),
],
networks: [DEFAULT_NETWORK],
images: [
createImage({
name: 'test-image',
serviceName: 'one',
}),
createImage({
name: 'test-image',
serviceName: 'two',
}),
createImage({
name: 'test-image',
serviceName: 'three',
}),
createImage({
name: 'test-image',
serviceName: 'four',
}),
],
});
};
let targetApps: InstancedAppState;
before(async () => {
targetApps = createApps(
{
services: [
await createService({
image: 'test-image',
serviceName: 'one',
running: true,
composition: {
restart: 'always',
},
}),
await createService({
image: 'image-2',
image: 'test-image',
serviceName: 'two',
running: true,
composition: {
restart: 'unless-stopped',
},
}),
await createService({
image: 'test-image',
serviceName: 'three',
running: true,
composition: {
restart: 'on-failure',
},
}),
await createService({
image: 'test-image',
serviceName: 'four',
running: true,
composition: {
restart: 'no',
},
@ -1460,54 +1560,33 @@ describe('compose/application-manager', () => {
},
true,
);
});
it('should infer a start step for a service that exited with the "userland proxy" error for all restart policies', async () => {
const { currentApps, availableImages, downloading, containerIdsByAppId } =
createCurrentState({
services: [
await createService(
{
image: 'image-1',
serviceName: 'one',
composition: {
restart: 'no',
},
},
{
state: {
status: 'exited',
createdAt: getTimeNSecondsAgo(new Date(), newer),
},
},
),
await createService(
{
image: 'image-2',
serviceName: 'two',
composition: {
restart: 'no',
},
},
{
state: {
status: 'exited',
createdAt: getTimeNSecondsAgo(new Date(), older),
},
},
),
],
networks: [DEFAULT_NETWORK],
images: [
createImage({
name: 'image-1',
serviceName: 'one',
}),
createImage({
name: 'image-2',
serviceName: 'two',
}),
],
await getCurrentState(true);
const [startStep1, startStep2, startStep3, startStep4, ...nextSteps] =
await applicationManager.inferNextSteps(currentApps, targetApps, {
downloading,
availableImages,
containerIdsByAppId,
});
[startStep1, startStep2, startStep3, startStep4].forEach((step) => {
expect(step).to.have.property('action').that.equals('start');
expect(step)
.to.have.property('target')
.that.has.property('serviceName')
.that.is.oneOf(['one', 'two', 'three', 'four']);
});
expect(nextSteps).to.have.lengthOf(0);
});
it('should not infer any steps for a service with a status of "exited" without the "userland proxy" error message', async () => {
const { currentApps, availableImages, downloading, containerIdsByAppId } =
await getCurrentState(false);
const [...steps] = await applicationManager.inferNextSteps(
currentApps,
targetApps,
@ -1520,431 +1599,5 @@ describe('compose/application-manager', () => {
expect(steps).to.have.lengthOf(0);
});
describe('restart policy "on-failure"', () => {
it('should not infer any steps for a service that exited with code 0', async () => {
// Conditions:
// - restart: "on-failure"
// - status: "exited"
// - exitCode: 0
const targetApps = createApps(
{
services: [
await createService({
image: 'image-1',
serviceName: 'one',
composition: {
restart: 'on-failure',
},
}),
await createService({
image: 'image-2',
serviceName: 'two',
composition: {
restart: 'on-failure',
},
}),
],
networks: [DEFAULT_NETWORK],
},
true,
);
const {
currentApps,
availableImages,
downloading,
containerIdsByAppId,
} = createCurrentState({
services: [
await createService(
{
image: 'image-1',
serviceName: 'one',
running: false,
composition: {
restart: 'on-failure',
},
},
{
state: {
status: 'exited',
exitCode: 0,
createdAt: getTimeNSecondsAgo(new Date(), newer),
},
},
),
await createService(
{
image: 'image-2',
serviceName: 'two',
running: false,
composition: {
restart: 'on-failure',
},
},
{
state: {
status: 'exited',
exitCode: 0,
createdAt: getTimeNSecondsAgo(new Date(), older),
},
},
),
],
networks: [DEFAULT_NETWORK],
images: [
createImage({
name: 'image-1',
serviceName: 'one',
}),
createImage({
name: 'image-2',
serviceName: 'two',
}),
],
});
const [...steps] = await applicationManager.inferNextSteps(
currentApps,
targetApps,
{
downloading,
availableImages,
containerIdsByAppId,
},
);
expect(steps).to.have.lengthOf(0);
});
it('should infer a noop step for a service that was created <= SECONDS_TO_WAIT_FOR_START ago and exited non-zero', async () => {
// Conditions:
// - restart: "on-failure"
// - status: "exited"
// - exitCode: non-zero
// - createdAt: <= SECONDS_TO_WAIT_FOR_START
const targetApps = createApps(
{
services: [
await createService({
image: 'image-1',
serviceName: 'one',
composition: {
restart: 'on-failure',
},
}),
],
networks: [DEFAULT_NETWORK],
},
true,
);
const {
currentApps,
availableImages,
downloading,
containerIdsByAppId,
} = createCurrentState({
services: [
await createService(
{
image: 'image-1',
serviceName: 'one',
running: false,
composition: {
restart: 'on-failure',
},
},
{
state: {
status: 'exited',
exitCode: 1,
createdAt: getTimeNSecondsAgo(new Date(), newer),
},
},
),
],
networks: [DEFAULT_NETWORK],
images: [
createImage({
name: 'image-1',
serviceName: 'one',
}),
],
});
const [noopStep, ...nextSteps] =
await applicationManager.inferNextSteps(currentApps, targetApps, {
downloading,
availableImages,
containerIdsByAppId,
});
expect(noopStep).to.have.property('action').that.equals('noop');
expect(nextSteps).to.have.lengthOf(0);
});
it('should infer a start step for a service that was created > SECONDS_TO_WAIT_FOR_START ago and exited non-zero', async () => {
// Conditions:
// - restart: "on-failure"
// - status: "exited"
// - exitCode: non-zero\
// - createdAt: > SECONDS_TO_WAIT_FOR_START
const targetApps = createApps(
{
services: [
await createService({
image: 'image-1',
serviceName: 'one',
composition: {
restart: 'on-failure',
},
}),
],
networks: [DEFAULT_NETWORK],
},
true,
);
const {
currentApps,
availableImages,
downloading,
containerIdsByAppId,
} = createCurrentState({
services: [
await createService(
{
image: 'image-1',
serviceName: 'one',
running: false,
composition: {
restart: 'on-failure',
},
},
{
state: {
status: 'exited',
exitCode: 1,
createdAt: getTimeNSecondsAgo(new Date(), older),
},
},
),
],
networks: [DEFAULT_NETWORK],
images: [
createImage({
name: 'image-1',
serviceName: 'one',
}),
],
});
const [startStep, ...nextSteps] =
await applicationManager.inferNextSteps(currentApps, targetApps, {
downloading,
availableImages,
containerIdsByAppId,
});
expect(startStep).to.have.property('action').that.equals('start');
expect(nextSteps).to.have.lengthOf(0);
});
});
describe('restart policy "always" or "unless-stopped"', () => {
it('should infer a noop step for a service that was created <= SECONDS_TO_WAIT_FOR_START ago with status of "exited"', async () => {
// Conditions:
// - restart: "always" || "unless-stopped"
// - status: "exited"
// - createdAt: <= SECONDS_TO_WAIT_FOR_START ago
const targetApps = createApps(
{
services: [
await createService({
image: 'image-1',
serviceName: 'one',
composition: {
restart: 'always',
},
}),
await createService({
image: 'image-2',
serviceName: 'two',
composition: {
restart: 'unless-stopped',
},
}),
],
networks: [DEFAULT_NETWORK],
},
true,
);
const {
currentApps,
availableImages,
downloading,
containerIdsByAppId,
} = createCurrentState({
services: [
await createService(
{
image: 'image-1',
serviceName: 'one',
running: false,
composition: {
restart: 'always',
},
},
{
state: {
status: 'exited',
createdAt: getTimeNSecondsAgo(new Date(), newer),
},
},
),
await createService(
{
image: 'image-2',
serviceName: 'two',
running: false,
composition: {
restart: 'unless-stopped',
},
},
{
state: {
status: 'exited',
createdAt: getTimeNSecondsAgo(new Date(), newer),
},
},
),
],
networks: [DEFAULT_NETWORK],
images: [
createImage({
name: 'image-1',
serviceName: 'one',
}),
createImage({
name: 'image-2',
serviceName: 'two',
}),
],
});
const [noopStep1, noopStep2, ...nextSteps] =
await applicationManager.inferNextSteps(currentApps, targetApps, {
downloading,
availableImages,
containerIdsByAppId,
});
expect(noopStep1).to.have.property('action').that.equals('noop');
expect(noopStep2).to.have.property('action').that.equals('noop');
expect(nextSteps).to.have.lengthOf(0);
});
it('should infer a start step for a service that was created > SECONDS_TO_WAIT_FOR_START ago with status of "exited"', async () => {
// Conditions:
// - restart: "always" || "unless-stopped"
// - status: "exited"
// - createdAt: > SECONDS_TO_WAIT_FOR_START ago
const targetApps = createApps(
{
services: [
await createService({
image: 'image-1',
serviceName: 'one',
composition: {
restart: 'always',
},
}),
await createService({
image: 'image-2',
serviceName: 'two',
composition: {
restart: 'unless-stopped',
},
}),
],
networks: [DEFAULT_NETWORK],
},
true,
);
const {
currentApps,
availableImages,
downloading,
containerIdsByAppId,
} = createCurrentState({
services: [
await createService(
{
image: 'image-1',
serviceName: 'one',
running: false,
composition: {
restart: 'always',
},
},
{
state: {
status: 'exited',
createdAt: getTimeNSecondsAgo(new Date(), older),
},
},
),
await createService(
{
image: 'image-2',
serviceName: 'two',
running: false,
composition: {
restart: 'unless-stopped',
},
},
{
state: {
status: 'exited',
createdAt: getTimeNSecondsAgo(new Date(), older),
},
},
),
],
networks: [DEFAULT_NETWORK],
images: [
createImage({
name: 'image-1',
serviceName: 'one',
}),
createImage({
name: 'image-2',
serviceName: 'two',
}),
],
});
const [startStep1, startStep2, ...nextSteps] =
await applicationManager.inferNextSteps(currentApps, targetApps, {
downloading,
availableImages,
containerIdsByAppId,
});
[startStep1, startStep2].forEach((step) => {
expect(step).to.have.property('action').that.equals('start');
expect(step)
.to.have.property('target')
.that.has.property('serviceName')
.that.is.oneOf(['one', 'two']);
});
expect(nextSteps).to.have.lengthOf(0);
});
});
});
});

View File

@ -11,33 +11,4 @@ describe('compose/utils', () => {
networks: ['test', 'test2'],
});
});
it('should return whether a date is valid and older than an interval of seconds', () => {
const now = new Date(Date.now());
expect(ComposeUtils.isValidDateAndOlderThan(now, 60)).to.equal(false);
const time59SecondsAgo = new Date(Date.now() - 59 * 1000);
expect(ComposeUtils.isValidDateAndOlderThan(time59SecondsAgo, 60)).to.equal(
false,
);
const time61SecondsAgo = new Date(Date.now() - 61 * 1000);
expect(ComposeUtils.isValidDateAndOlderThan(time61SecondsAgo, 60)).to.equal(
true,
);
const notDates = [
null,
undefined,
'',
'test',
123,
0,
-1,
Infinity,
NaN,
{},
];
notDates.forEach((n) => {
expect(ComposeUtils.isValidDateAndOlderThan(n, 0)).to.equal(false);
});
});
});