Retry a failed delta apply up to DELTA_APPLY_RETRY_COUNT (3) times before reverting to a regular pull

This prevents an image download error loop where the delta image on the delta server is present,
but some aspect of the delta image or the base image on the device does not match up, causing
the delta to fail to be applied to the base image.

Delta apply errors don't raise status codes as they are thrown from the Engine (although they should),
so if an error with a status code is raised during this time, throw an error to the handler
indicating that the delta should be retried until success. Errors with status codes raised during
this time are largely network related, so falling back to a regular pull won't improve anything.

Upon delta apply errors exceeding DELTA_APPLY_RETRY_COUNT, revert to a regular pull.

Change-type: patch
Signed-off-by: Christina Ying Wang <christina@balena.io>
This commit is contained in:
Christina Ying Wang 2025-01-06 16:53:11 -08:00
parent 1fc242200f
commit 341111f1f9
2 changed files with 61 additions and 11 deletions

View File

@ -1,23 +1,23 @@
import type { ProgressCallback } from 'docker-progress';
import { DockerProgress } from 'docker-progress';
import type { ProgressCallback } from 'docker-progress';
import Dockerode from 'dockerode';
import _ from 'lodash';
import memoizee from 'memoizee';
import { applyDelta, OutOfSyncError } from 'docker-delta';
import type { SchemaReturn } from '../config/schema-type';
import log from './supervisor-console';
import { envArrayToObject } from './conversions';
import * as request from './request';
import {
DeltaStillProcessingError,
ImageAuthenticationError,
InvalidNetGatewayError,
DeltaServerError,
DeltaApplyError,
isStatusError,
} from './errors';
import * as request from './request';
import type { EnvVarObject } from '../types';
import log from './supervisor-console';
import type { SchemaReturn } from '../config/schema-type';
export type FetchOptions = SchemaReturn<'fetchOptions'>;
export type DeltaFetchOptions = FetchOptions & {
@ -42,6 +42,18 @@ type ImageNameParts = {
// (10 mins)
const DELTA_TOKEN_TIMEOUT = 10 * 60 * 1000;
// Maximum number of times a v3 delta apply is attempted before falling back to
// a regular pull.
//
// A delta is applied to the base image when pulling, so a failure could be due to
// "layers from manifest don't match image configuration", which can occur before
// or after downloading delta image layers.
//
// Other causes of failure are not documented as clearly as the "layers from
// manifest" error, and it is unclear whether they occur before, during, or after
// downloading delta image layers.
//
// See: https://github.com/balena-os/balena-engine/blob/master/distribution/pull_v2.go#L43
const DELTA_APPLY_RETRY_COUNT = 3;
export const docker = new Dockerode();
export const dockerProgress = new DockerProgress({
docker,
@ -140,7 +152,7 @@ export async function fetchDeltaWithProgress(
}
// Since the supervisor never calls this function with a source anymore,
// this should never happen, but w ehandle it anyway
// this should never happen, but we handle it anyway
if (deltaOpts.deltaSource == null) {
logFn('Falling back to regular pull due to lack of a delta source');
return fetchImageWithProgress(imgDest, deltaOpts, onProgress);
@ -226,29 +238,62 @@ export async function fetchDeltaWithProgress(
`Got an error when parsing delta server response for v3 delta: ${e}`,
);
}
id = await applyBalenaDelta(name, token, onProgress, logFn);
// Attempt the delta apply up to DELTA_APPLY_RETRY_COUNT times. If the final
// attempt still fails with a non-status error, throw DeltaApplyError so that the
// enclosing catch handler falls back to a regular pull.
let lastError: Error | undefined;
for (let tryCount = 0; tryCount < DELTA_APPLY_RETRY_COUNT; tryCount++) {
	try {
		id = await applyBalenaDelta(name, token, onProgress, logFn);
		// A successful attempt supersedes any failure recorded by an earlier
		// attempt. Without this reset, a retry that succeeds would still be
		// reported as a DeltaApplyError below and the applied delta discarded.
		lastError = undefined;
		break;
	} catch (e) {
		if (isStatusError(e)) {
			// A status error during delta pull indicates network issues,
			// so we should throw an error to the handler that indicates that
			// the delta pull should be retried until network issues are resolved,
			// rather than falling back to a regular pull.
			throw e;
		}
		// Narrow instead of casting so that a non-Error throw still yields a
		// meaningful message for the DeltaApplyError raised below.
		lastError = e instanceof Error ? e : new Error(String(e));
		logFn(
			`Delta apply failed, retrying (${tryCount + 1}/${DELTA_APPLY_RETRY_COUNT})...`,
		);
	}
}
// Only set when the last attempt made was a failure, i.e. retries are exhausted.
if (lastError != null) {
	throw new DeltaApplyError(lastError.message);
}
}
break;
default:
throw new Error(`Unsupported delta version: ${deltaOpts.deltaVersion}`);
}
} catch (e) {
// Log appropriate message based on error type
if (e instanceof OutOfSyncError) {
logFn('Falling back to regular pull due to delta out of sync error');
return await fetchImageWithProgress(imgDest, deltaOpts, onProgress);
} else if (e instanceof DeltaServerError) {
logFn(
`Falling back to regular pull due to delta server error (${e.statusCode})${e.statusMessage ? `: ${e.statusMessage}` : ''}`,
);
return await fetchImageWithProgress(imgDest, deltaOpts, onProgress);
} else if (e instanceof DeltaApplyError) {
// A delta apply error is raised from the Engine and doesn't have a status code
logFn(
`Falling back to regular pull due to delta apply error ${e.message ? `: ${e.message}` : ''}`,
);
} else {
logFn(`Delta failed with ${e}`);
throw e;
}
// For handled errors, fall back to regular pull
return fetchImageWithProgress(imgDest, deltaOpts, onProgress);
}
logFn(`Delta applied successfully`);
return id;
return id!;
}
export async function fetchImageWithProgress(

View File

@ -71,6 +71,11 @@ export class InvalidNetGatewayError extends TypedError {}
export class DeltaStillProcessingError extends TypedError {}
export class DeltaServerError extends StatusError {}
/**
 * Error raised when applying a delta image fails without a status code.
 *
 * The `name` property is set explicitly so that logs and stack traces show
 * `DeltaApplyError: ...` rather than the generic `Error: ...`.
 *
 * @param message - optional description of the underlying failure
 */
export class DeltaApplyError extends Error {
	constructor(message?: string) {
		super(message);
		this.name = 'DeltaApplyError';
	}
}
export class UpdatesLockedError extends TypedError {}