Backoff on image download error

Change-type: patch
Closes: #873
Signed-off-by: Cameron Diver <cameron@balena.io>
Cameron Diver 2019-02-04 14:47:37 +00:00
parent b606332312
commit 146267b402
4 changed files with 41 additions and 2 deletions
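
For context: after a failed pull the supervisor now records, per image name, how many times the fetch has failed and when the last failure happened, and the next triggerFetch call refuses to hit the registry again until an exponentially growing delay has passed. A minimal sketch of the delay calculation used below (backoffIncrement and maxBackoffTime are the constants added by this commit; the standalone function is illustrative only, not part of the patch):

// Delay doubles with each consecutive failure and is capped at maxBackoffTime.
function backoffDelay(
  failures: number,
  backoffIncrement = 500,
  maxBackoffTime = 30000,
): number {
  return Math.min(2 ** failures * backoffIncrement, maxBackoffTime);
}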


@@ -19,3 +19,5 @@ export class ResourceRecreationAttemptError extends TypedError {
}
export class InvalidNetworkConfigurationError extends TypedError {}
export class ImageDownloadBackoffError extends TypedError {}


@@ -15,6 +15,7 @@ import { DeltaStillProcessingError, NotFoundError } from '../lib/errors';
import * as LogTypes from '../lib/log-types';
import * as validation from '../lib/validation';
import Logger from '../logger';
import { ImageDownloadBackoffError } from './errors';
interface ImageEvents {
change: void;
@@ -64,6 +65,10 @@ export class Images extends (EventEmitter as {
private logger: Logger;
private db: Database;
private imageFetchFailures: Dictionary<number> = {};
private imageFetchLastFailureTime: Dictionary<
ReturnType<typeof process.hrtime>
> = {};
private imageCleanupFailures: Dictionary<number> = {};
// A store of volatile state for images (e.g. download progress), indexed by imageId
private volatileState: { [imageId: number]: Image } = {};
@@ -81,6 +86,24 @@ export class Images extends (EventEmitter as {
opts: FetchOptions,
onFinish = _.noop,
): Promise<null> {
if (this.imageFetchFailures[image.name] != null) {
// If we are retrying a pull within the backoff time of the last failure,
// we need to throw an error, which will be caught in the device-state
// engine, and ensure that we wait a bit longer
const minDelay = Math.min(
2 ** this.imageFetchFailures[image.name] * constants.backoffIncrement,
constants.maxBackoffTime,
);
const timeSinceLastError = process.hrtime(
this.imageFetchLastFailureTime[image.name],
);
const timeSinceLastErrorMs =
timeSinceLastError[0] * 1000 + timeSinceLastError[1] / 1e6;
if (timeSinceLastErrorMs < minDelay) {
throw new ImageDownloadBackoffError();
}
}
const onProgress = (progress: FetchProgressEvent) => {
// Only report the percentage if we haven't finished fetching
if (this.volatileState[image.imageId] != null) {
@@ -108,6 +131,13 @@ export class Images extends (EventEmitter as {
return null;
} catch (e) {
if (!NotFoundError(e)) {
if (!(e instanceof ImageDownloadBackoffError)) {
this.imageFetchLastFailureTime[image.name] = process.hrtime();
this.imageFetchFailures[image.name] =
this.imageFetchFailures[image.name] != null
? this.imageFetchFailures[image.name] + 1
: 1;
}
throw e;
}
this.reportChange(
@@ -130,6 +160,8 @@ export class Images extends (EventEmitter as {
this.logger.logSystemEvent(LogTypes.downloadImageSuccess, { image });
success = true;
delete this.imageFetchFailures[image.name];
delete this.imageFetchLastFailureTime[image.name];
} catch (err) {
if (err instanceof DeltaStillProcessingError) {
// If this is a delta image pull, and the delta still hasn't finished generating,
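
The failure timestamps above are recorded with process.hrtime() (a [seconds, nanoseconds] tuple) rather than wall-clock time, so the measured interval is monotonic and unaffected by system clock changes. An illustrative helper, not part of the commit, using the same conversion to milliseconds as the check in triggerFetch:

// Milliseconds elapsed since an earlier process.hrtime() reading.
function msSince(start: [number, number]): number {
  const [seconds, nanoseconds] = process.hrtime(start);
  return seconds * 1000 + nanoseconds / 1e6;
}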


@@ -570,7 +570,7 @@ module.exports = class DeviceState extends EventEmitter
if @scheduledApply?
console.log("Updating failed, but there's another update scheduled immediately: ", err)
else
-					delay = Math.min((2 ** @failedUpdates) * 500, 30000)
+					delay = Math.min((2 ** @failedUpdates) * constants.backoffIncrement, constants.maxBackoffTime)
# If there was an error then schedule another attempt briefly in the future.
console.log('Scheduling another update attempt due to failure: ', delay, err)
@triggerApplyTarget({ force, delay, initial })
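
The comment in triggerFetch notes that the ImageDownloadBackoffError is caught by the device-state engine so that the next attempt waits a bit longer. A hypothetical sketch of that flow in TypeScript (images, failedUpdates, force and initial stand in for the device-state instance's own state; this is not code from the commit):

try {
  await images.triggerFetch(image, opts);
} catch (e) {
  if (e instanceof ImageDownloadBackoffError) {
    // Still inside the backoff window: schedule another apply attempt later
    // instead of hitting the registry again straight away.
    const delay = Math.min(
      2 ** failedUpdates * constants.backoffIncrement,
      constants.maxBackoffTime,
    );
    triggerApplyTarget({ force, delay, initial });
  } else {
    throw e;
  }
}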


@@ -32,7 +32,7 @@ const constants = {
proxyvisorHookReceiver: 'http://0.0.0.0:1337',
configJsonNonAtomicPath: '/boot/config.json',
defaultMixpanelToken: process.env.DEFAULT_MIXPANEL_TOKEN,
-	supervisorNetworkInterface: supervisorNetworkInterface,
+	supervisorNetworkInterface,
allowedInterfaces: [
'resin-vpn',
'tun0',
@@ -50,6 +50,11 @@ const constants = {
bootBlockDevice: '/dev/mmcblk0p1',
hostConfigVarPrefix: 'HOST_',
migrationBackupFile: 'backup.tgz',
// Use this value multiplied by 2 ** (number of failures) to increase
// the backoff on subsequent failures
backoffIncrement: 500,
// The maximum time to backoff on repeated failure
maxBackoffTime: 30000,
};
if (process.env.DOCKER_HOST == null) {
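
With these two values, the retry cadence works out as follows (a worked check of the Math.min expression used by both call sites above, not code from the commit):

// 2 ** n * backoffIncrement, clipped to maxBackoffTime:
// n = 1..5 -> 1000, 2000, 4000, 8000, 16000 ms; n >= 6 -> capped at 30000 ms.
const delays = [1, 2, 3, 4, 5, 6, 7].map((n) => Math.min(2 ** n * 500, 30000));
// -> [1000, 2000, 4000, 8000, 16000, 30000, 30000]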