From 1fc242200f78e4219aafc5bb91de8cf0916236af Mon Sep 17 00:00:00 2001 From: Christina Ying Wang Date: Mon, 6 Jan 2025 14:21:27 -0800 Subject: [PATCH 1/3] Revert to regular pull immediately on delta server failure (code 400s) If the delta server responds immediately with HTTP 4xx upon requesting a delta image, this means the server is not able to supply the resource, so fall back to a regular pull immediately. Change-type: patch Signed-off-by: Christina Ying Wang --- src/lib/docker-utils.ts | 16 +++++++++++----- src/lib/errors.ts | 2 ++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/lib/docker-utils.ts b/src/lib/docker-utils.ts index 25845154..056c3066 100644 --- a/src/lib/docker-utils.ts +++ b/src/lib/docker-utils.ts @@ -12,6 +12,7 @@ import { DeltaStillProcessingError, ImageAuthenticationError, InvalidNetGatewayError, + DeltaServerError, } from './errors'; import * as request from './request'; import type { EnvVarObject } from '../types'; @@ -113,11 +114,7 @@ export async function fetchDeltaWithProgress( onProgress: ProgressCallback, serviceName: string, ): Promise { - const deltaSourceId = - deltaOpts.deltaSourceId != null - ? deltaOpts.deltaSourceId - : deltaOpts.deltaSource; - + const deltaSourceId = deltaOpts.deltaSourceId ?? deltaOpts.deltaSource; const timeout = deltaOpts.deltaApplyTimeout; const logFn = (str: string) => @@ -210,6 +207,10 @@ export async function fetchDeltaWithProgress( } break; case 3: + // If 400s status code, throw a more specific error & revert immediately to a regular pull + if (res.statusCode >= 400 && res.statusCode < 500) { + throw new DeltaServerError(res.statusCode, res.statusMessage); + } if (res.statusCode !== 200) { throw new Error( `Got ${res.statusCode} when requesting v3 delta from delta server.`, @@ -235,6 +236,11 @@ export async function fetchDeltaWithProgress( if (e instanceof OutOfSyncError) { logFn('Falling back to regular pull due to delta out of sync error'); return await fetchImageWithProgress(imgDest, deltaOpts, onProgress); + } else if (e instanceof DeltaServerError) { + logFn( + `Falling back to regular pull due to delta server error (${e.statusCode})${e.statusMessage ? `: ${e.statusMessage}` : ''}`, + ); + return await fetchImageWithProgress(imgDest, deltaOpts, onProgress); } else { logFn(`Delta failed with ${e}`); throw e; diff --git a/src/lib/errors.ts b/src/lib/errors.ts index eb4a1a08..f7979f1f 100644 --- a/src/lib/errors.ts +++ b/src/lib/errors.ts @@ -70,6 +70,8 @@ export class InvalidNetGatewayError extends TypedError {} export class DeltaStillProcessingError extends TypedError {} +export class DeltaServerError extends StatusError {} + export class UpdatesLockedError extends TypedError {} export function isHttpConflictError(err: { statusCode: number }): boolean { From 341111f1f94cd9f17fd7be9b6f21e3bc22c9ad3a Mon Sep 17 00:00:00 2001 From: Christina Ying Wang Date: Mon, 6 Jan 2025 16:53:11 -0800 Subject: [PATCH 2/3] Retry DELTA_APPLY_RETRY_COUNT (3) times during delta apply fail before reverting to regular pull This prevents an image download error loop where the delta image on the delta server is present, but some aspect of the delta image or the base image on the device does not match up, causing the delta to fail to be applied to the base image. Delta apply errors don't raise status codes as they are thrown from the Engine (although they should), so if an error with a status code is raised during this time, throw an error to the handler indicating that the delta should be retried until success. Errors with status codes raised during this time are largely network related, so falling back to a regular pull won't improve anything. Upon delta apply errors exceeding DELTA_APPLY_RETRY_COUNT, revert to a regular pull. Change-type: patch Signed-off-by: Christina Ying Wang --- src/lib/docker-utils.ts | 67 ++++++++++++++++++++++++++++++++++------- src/lib/errors.ts | 5 +++ 2 files changed, 61 insertions(+), 11 deletions(-) diff --git a/src/lib/docker-utils.ts b/src/lib/docker-utils.ts index 056c3066..9b03c2b7 100644 --- a/src/lib/docker-utils.ts +++ b/src/lib/docker-utils.ts @@ -1,23 +1,23 @@ -import type { ProgressCallback } from 'docker-progress'; import { DockerProgress } from 'docker-progress'; +import type { ProgressCallback } from 'docker-progress'; import Dockerode from 'dockerode'; import _ from 'lodash'; import memoizee from 'memoizee'; - import { applyDelta, OutOfSyncError } from 'docker-delta'; -import type { SchemaReturn } from '../config/schema-type'; +import log from './supervisor-console'; import { envArrayToObject } from './conversions'; +import * as request from './request'; import { DeltaStillProcessingError, ImageAuthenticationError, InvalidNetGatewayError, DeltaServerError, + DeltaApplyError, + isStatusError, } from './errors'; -import * as request from './request'; import type { EnvVarObject } from '../types'; - -import log from './supervisor-console'; +import type { SchemaReturn } from '../config/schema-type'; export type FetchOptions = SchemaReturn<'fetchOptions'>; export type DeltaFetchOptions = FetchOptions & { @@ -42,6 +42,18 @@ type ImageNameParts = { // (10 mins) const DELTA_TOKEN_TIMEOUT = 10 * 60 * 1000; +// How many times to retry a v3 delta apply before falling back to a regular pull. +// A delta is applied to the base image when pulling, so a failure could be due to +// "layers from manifest don't match image configuration", which can occur before +// or after downloading delta image layers. +// +// Other causes of failure have not been documented as clearly as "layers from manifest" +// but could manifest as well, though unclear if they occur before, after, or during +// downloading delta image layers. +// +// See: https://github.com/balena-os/balena-engine/blob/master/distribution/pull_v2.go#L43 +const DELTA_APPLY_RETRY_COUNT = 3; + export const docker = new Dockerode(); export const dockerProgress = new DockerProgress({ docker, @@ -140,7 +152,7 @@ export async function fetchDeltaWithProgress( } // Since the supevisor never calls this function with a source anymore, - // this should never happen, but w ehandle it anyway + // this should never happen, but we handle it anyway if (deltaOpts.deltaSource == null) { logFn('Falling back to regular pull due to lack of a delta source'); return fetchImageWithProgress(imgDest, deltaOpts, onProgress); @@ -226,29 +238,62 @@ export async function fetchDeltaWithProgress( `Got an error when parsing delta server response for v3 delta: ${e}`, ); } - id = await applyBalenaDelta(name, token, onProgress, logFn); + // Try to apply delta DELTA_APPLY_RETRY_COUNT times, then throw DeltaApplyError + let lastError: Error | undefined = undefined; + for ( + let tryCount = 0; + tryCount < DELTA_APPLY_RETRY_COUNT; + tryCount++ + ) { + try { + id = await applyBalenaDelta(name, token, onProgress, logFn); + break; + } catch (e) { + if (isStatusError(e)) { + // A status error during delta pull indicates network issues, + // so we should throw an error to the handler that indicates that + // the delta pull should be retried until network issues are resolved, + // rather than falling back to a regular pull. + throw e; + } + lastError = e as Error; + logFn( + `Delta apply failed, retrying (${tryCount + 1}/${DELTA_APPLY_RETRY_COUNT})...`, + ); + } + } + if (lastError) { + throw new DeltaApplyError(lastError.message); + } } break; default: throw new Error(`Unsupported delta version: ${deltaOpts.deltaVersion}`); } } catch (e) { + // Log appropriate message based on error type if (e instanceof OutOfSyncError) { logFn('Falling back to regular pull due to delta out of sync error'); - return await fetchImageWithProgress(imgDest, deltaOpts, onProgress); } else if (e instanceof DeltaServerError) { logFn( `Falling back to regular pull due to delta server error (${e.statusCode})${e.statusMessage ? `: ${e.statusMessage}` : ''}`, ); - return await fetchImageWithProgress(imgDest, deltaOpts, onProgress); + } else if (e instanceof DeltaApplyError) { + // A delta apply error is raised from the Engine and doesn't have a status code + logFn( + `Falling back to regular pull due to delta apply error ${e.message ? `: ${e.message}` : ''}`, + ); } else { logFn(`Delta failed with ${e}`); throw e; } + + // For handled errors, fall back to regular pull + return fetchImageWithProgress(imgDest, deltaOpts, onProgress); } logFn(`Delta applied successfully`); - return id; + return id!; } export async function fetchImageWithProgress( diff --git a/src/lib/errors.ts b/src/lib/errors.ts index f7979f1f..70d87516 100644 --- a/src/lib/errors.ts +++ b/src/lib/errors.ts @@ -71,6 +71,11 @@ export class InvalidNetGatewayError extends TypedError {} export class DeltaStillProcessingError extends TypedError {} export class DeltaServerError extends StatusError {} +export class DeltaApplyError extends Error { + constructor(message?: string) { + super(message); + } +} export class UpdatesLockedError extends TypedError {} From 5936af37e7523758f7a13afb86112438db5bd643 Mon Sep 17 00:00:00 2001 From: Christina Ying Wang Date: Wed, 12 Feb 2025 13:49:09 -0800 Subject: [PATCH 3/3] Bump docker-progress to 5.2.4 Signed-off-by: Christina Ying Wang --- package-lock.json | 9 +++++---- package.json | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/package-lock.json b/package-lock.json index 4f9d68e8..2424aab9 100644 --- a/package-lock.json +++ b/package-lock.json @@ -58,7 +58,7 @@ "copy-webpack-plugin": "^12.0.0", "deep-object-diff": "1.1.0", "docker-delta": "^4.1.0", - "docker-progress": "^5.2.3", + "docker-progress": "^5.2.4", "dockerode": "^4.0.2", "duration-js": "^4.0.0", "express": "^4.21.2", @@ -4794,10 +4794,11 @@ } }, "node_modules/docker-progress": { - "version": "5.2.3", - "resolved": "https://registry.npmjs.org/docker-progress/-/docker-progress-5.2.3.tgz", - "integrity": "sha512-tsiqpC61pzaDOkKhbvr7ABQB2bL3bx+sVa7r4IZFf3tzwcMIhcU/sr5fqsXOKzIspxiCL+UHNS9gNO5ly9JxWg==", + "version": "5.2.4", + "resolved": "https://registry.npmjs.org/docker-progress/-/docker-progress-5.2.4.tgz", + "integrity": "sha512-sgEXTJh78YOj8pIBIzZHLo3KpamJ5N0/3pU7DkpZBBvxZ9PmO0d9ND6x7TExQZf4hgvlFRBS41aN+GHx6vu5KQ==", "dev": true, + "license": "Apache-2.0", "dependencies": { "@types/dockerode": "^3.3.23", "JSONStream": "^1.3.5", diff --git a/package.json b/package.json index 7e87eeaa..49ad485b 100644 --- a/package.json +++ b/package.json @@ -84,7 +84,7 @@ "copy-webpack-plugin": "^12.0.0", "deep-object-diff": "1.1.0", "docker-delta": "^4.1.0", - "docker-progress": "^5.2.3", + "docker-progress": "^5.2.4", "dockerode": "^4.0.2", "duration-js": "^4.0.0", "express": "^4.21.2",