Merge pull request #888 from balena-io/873-backoff-on-image-failure

Backoff on image download error
This commit is contained in:
CameronDiver 2019-02-05 19:53:44 +00:00 committed by GitHub
commit f6bbf6dd50
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 59 additions and 7 deletions

View File

@ -64,7 +64,7 @@ createApplicationManagerRouter = (applications) ->
module.exports = class ApplicationManager extends EventEmitter
constructor: ({ @logger, @config, @db, @eventTracker, @deviceState }) ->
@docker = new Docker()
@images = new Images({ @docker, @logger, @db })
@images = new Images({ @docker, @logger, @db, @config })
@services = new ServiceManager({ @docker, @logger, @images, @config })
@networks = new NetworkManager({ @docker, @logger })
@volumes = new Volumes({ @docker, @logger })
@ -74,6 +74,11 @@ module.exports = class ApplicationManager extends EventEmitter
@fetchesInProgress = 0
@_targetVolatilePerImageId = {}
@_containerStarted = {}
@config.on 'change', (changedConfig) =>
if changedConfig.appUpdatePollInterval
@images.appUpdatePollInterval = changedConfig.appUpdatePollInterval
@actionExecutors = {
stop: (step, { force = false, skipLock = false } = {}) =>
@_lockingIfNecessary step.current.appId, { force, skipLock: skipLock or step.options?.skipLock }, =>
@ -178,6 +183,9 @@ module.exports = class ApplicationManager extends EventEmitter
@emit('change', data)
init: =>
@config.get('appUpdatePollInterval')
.then (interval) =>
@images.appUpdatePollInterval = interval
@images.cleanupDatabase()
.then =>
@localModeManager.init()

View File

@ -19,3 +19,5 @@ export class ResourceRecreationAttemptError extends TypedError {
}
export class InvalidNetworkConfigurationError extends TypedError {}
export class ImageDownloadBackoffError extends TypedError {}

View File

@ -4,6 +4,7 @@ import { EventEmitter } from 'events';
import * as _ from 'lodash';
import StrictEventEmitter from 'strict-event-emitter-types';
import Config from '../config';
import Database from '../db';
import * as constants from '../lib/constants';
import {
@ -15,6 +16,7 @@ import { DeltaStillProcessingError, NotFoundError } from '../lib/errors';
import * as LogTypes from '../lib/log-types';
import * as validation from '../lib/validation';
import Logger from '../logger';
import { ImageDownloadBackoffError } from './errors';
interface ImageEvents {
change: void;
@ -26,6 +28,7 @@ interface ImageConstructOpts {
docker: DockerUtils;
logger: Logger;
db: Database;
config: Config;
}
interface FetchProgressEvent {
@ -64,6 +67,12 @@ export class Images extends (EventEmitter as {
private logger: Logger;
private db: Database;
public appUpdatePollInterval: number;
private imageFetchFailures: Dictionary<number> = {};
private imageFetchLastFailureTime: Dictionary<
ReturnType<typeof process.hrtime>
> = {};
private imageCleanupFailures: Dictionary<number> = {};
// A store of volatile state for images (e.g. download progress), indexed by imageId
private volatileState: { [imageId: number]: Image } = {};
@ -81,6 +90,24 @@ export class Images extends (EventEmitter as {
opts: FetchOptions,
onFinish = _.noop,
): Promise<null> {
if (this.imageFetchFailures[image.name] != null) {
// If we are retrying a pull within the backoff time of the last failure,
// we need to throw an error, which will be caught in the device-state
// engine, and ensure that we wait a bit lnger
const minDelay = Math.min(
2 ** this.imageFetchFailures[image.name] * constants.backoffIncrement,
this.appUpdatePollInterval,
);
const timeSinceLastError = process.hrtime(
this.imageFetchLastFailureTime[image.name],
);
const timeSinceLastErrorMs =
timeSinceLastError[0] * 1000 + timeSinceLastError[1] / 1e6;
if (timeSinceLastErrorMs < minDelay) {
throw new ImageDownloadBackoffError();
}
}
const onProgress = (progress: FetchProgressEvent) => {
// Only report the percentage if we haven't finished fetching
if (this.volatileState[image.imageId] != null) {
@ -108,6 +135,13 @@ export class Images extends (EventEmitter as {
return null;
} catch (e) {
if (!NotFoundError(e)) {
if (!(e instanceof ImageDownloadBackoffError)) {
this.imageFetchLastFailureTime[image.name] = process.hrtime();
this.imageFetchFailures[image.name] =
this.imageFetchFailures[image.name] != null
? this.imageFetchFailures[image.name] + 1
: 1;
}
throw e;
}
this.reportChange(
@ -130,6 +164,8 @@ export class Images extends (EventEmitter as {
this.logger.logSystemEvent(LogTypes.downloadImageSuccess, { image });
success = true;
delete this.imageFetchFailures[image.name];
delete this.imageFetchLastFailureTime[image.name];
} catch (err) {
if (err instanceof DeltaStillProcessingError) {
// If this is a delta image pull, and the delta still hasn't finished generating,

View File

@ -138,11 +138,11 @@ module.exports = class DeviceState extends EventEmitter
@applications.on('change', @reportCurrentState)
healthcheck: =>
@config.getMany([ 'appUpdatePollInterval', 'unmanaged' ])
@config.getMany([ 'unmanaged' ])
.then (conf) =>
cycleTime = process.hrtime(@lastApplyStart)
cycleTimeMs = cycleTime[0] * 1000 + cycleTime[1] / 1e6
cycleTimeWithinInterval = cycleTimeMs - @applications.timeSpentFetching < 2 * conf.appUpdatePollInterval
cycleTimeWithinInterval = cycleTimeMs - @applications.timeSpentFetching < 2 * @maxPollTime
applyTargetHealthy = conf.unmanaged or !@applyInProgress or @applications.fetchesInProgress > 0 or cycleTimeWithinInterval
return applyTargetHealthy
@ -249,13 +249,16 @@ module.exports = class DeviceState extends EventEmitter
@logger.enable(changedConfig.loggingEnabled)
if changedConfig.apiSecret?
@reportCurrentState(api_secret: changedConfig.apiSecret)
if changedConfig.appUpdatePollInterval?
@maxPollTime = changedConfig.appUpdatePollInterval
@config.getMany([
'initialConfigSaved', 'listenPort', 'apiSecret', 'osVersion', 'osVariant',
'version', 'provisioned', 'apiEndpoint', 'connectivityCheckEnabled', 'legacyAppsPresent',
'targetStateSet', 'unmanaged'
'targetStateSet', 'unmanaged', 'appUpdatePollInterval'
])
.then (conf) =>
@maxPollTime = conf.appUpdatePollInterval
@applications.init()
.then =>
if !conf.initialConfigSaved
@ -570,7 +573,7 @@ module.exports = class DeviceState extends EventEmitter
if @scheduledApply?
console.log("Updating failed, but there's another update scheduled immediately: ", err)
else
delay = Math.min((2 ** @failedUpdates) * 500, 30000)
delay = Math.min((2 ** @failedUpdates) * constants.backoffIncrement, @maxPollTime)
# If there was an error then schedule another attempt briefly in the future.
console.log('Scheduling another update attempt due to failure: ', delay, err)
@triggerApplyTarget({ force, delay, initial })

View File

@ -32,7 +32,7 @@ const constants = {
proxyvisorHookReceiver: 'http://0.0.0.0:1337',
configJsonNonAtomicPath: '/boot/config.json',
defaultMixpanelToken: process.env.DEFAULT_MIXPANEL_TOKEN,
supervisorNetworkInterface: supervisorNetworkInterface,
supervisorNetworkInterface,
allowedInterfaces: [
'resin-vpn',
'tun0',
@ -50,6 +50,9 @@ const constants = {
bootBlockDevice: '/dev/mmcblk0p1',
hostConfigVarPrefix: 'HOST_',
migrationBackupFile: 'backup.tgz',
// Use this failure multiplied by 2**Number of failures to increase
// the backoff on subsequent failures
backoffIncrement: 500,
};
if (process.env.DOCKER_HOST == null) {