mirror of
https://github.com/balena-os/balena-supervisor.git
synced 2025-01-17 10:19:47 +00:00
Merge pull request #888 from balena-io/873-backoff-on-image-failure
Backoff on image download error
This commit is contained in:
commit
f6bbf6dd50
@ -64,7 +64,7 @@ createApplicationManagerRouter = (applications) ->
|
||||
module.exports = class ApplicationManager extends EventEmitter
|
||||
constructor: ({ @logger, @config, @db, @eventTracker, @deviceState }) ->
|
||||
@docker = new Docker()
|
||||
@images = new Images({ @docker, @logger, @db })
|
||||
@images = new Images({ @docker, @logger, @db, @config })
|
||||
@services = new ServiceManager({ @docker, @logger, @images, @config })
|
||||
@networks = new NetworkManager({ @docker, @logger })
|
||||
@volumes = new Volumes({ @docker, @logger })
|
||||
@ -74,6 +74,11 @@ module.exports = class ApplicationManager extends EventEmitter
|
||||
@fetchesInProgress = 0
|
||||
@_targetVolatilePerImageId = {}
|
||||
@_containerStarted = {}
|
||||
|
||||
@config.on 'change', (changedConfig) =>
|
||||
if changedConfig.appUpdatePollInterval
|
||||
@images.appUpdatePollInterval = changedConfig.appUpdatePollInterval
|
||||
|
||||
@actionExecutors = {
|
||||
stop: (step, { force = false, skipLock = false } = {}) =>
|
||||
@_lockingIfNecessary step.current.appId, { force, skipLock: skipLock or step.options?.skipLock }, =>
|
||||
@ -178,6 +183,9 @@ module.exports = class ApplicationManager extends EventEmitter
|
||||
@emit('change', data)
|
||||
|
||||
init: =>
|
||||
@config.get('appUpdatePollInterval')
|
||||
.then (interval) =>
|
||||
@images.appUpdatePollInterval = interval
|
||||
@images.cleanupDatabase()
|
||||
.then =>
|
||||
@localModeManager.init()
|
||||
|
@ -19,3 +19,5 @@ export class ResourceRecreationAttemptError extends TypedError {
|
||||
}
|
||||
|
||||
export class InvalidNetworkConfigurationError extends TypedError {}
|
||||
|
||||
export class ImageDownloadBackoffError extends TypedError {}
|
||||
|
@ -4,6 +4,7 @@ import { EventEmitter } from 'events';
|
||||
import * as _ from 'lodash';
|
||||
import StrictEventEmitter from 'strict-event-emitter-types';
|
||||
|
||||
import Config from '../config';
|
||||
import Database from '../db';
|
||||
import * as constants from '../lib/constants';
|
||||
import {
|
||||
@ -15,6 +16,7 @@ import { DeltaStillProcessingError, NotFoundError } from '../lib/errors';
|
||||
import * as LogTypes from '../lib/log-types';
|
||||
import * as validation from '../lib/validation';
|
||||
import Logger from '../logger';
|
||||
import { ImageDownloadBackoffError } from './errors';
|
||||
|
||||
interface ImageEvents {
|
||||
change: void;
|
||||
@ -26,6 +28,7 @@ interface ImageConstructOpts {
|
||||
docker: DockerUtils;
|
||||
logger: Logger;
|
||||
db: Database;
|
||||
config: Config;
|
||||
}
|
||||
|
||||
interface FetchProgressEvent {
|
||||
@ -64,6 +67,12 @@ export class Images extends (EventEmitter as {
|
||||
private logger: Logger;
|
||||
private db: Database;
|
||||
|
||||
public appUpdatePollInterval: number;
|
||||
|
||||
private imageFetchFailures: Dictionary<number> = {};
|
||||
private imageFetchLastFailureTime: Dictionary<
|
||||
ReturnType<typeof process.hrtime>
|
||||
> = {};
|
||||
private imageCleanupFailures: Dictionary<number> = {};
|
||||
// A store of volatile state for images (e.g. download progress), indexed by imageId
|
||||
private volatileState: { [imageId: number]: Image } = {};
|
||||
@ -81,6 +90,24 @@ export class Images extends (EventEmitter as {
|
||||
opts: FetchOptions,
|
||||
onFinish = _.noop,
|
||||
): Promise<null> {
|
||||
if (this.imageFetchFailures[image.name] != null) {
|
||||
// If we are retrying a pull within the backoff time of the last failure,
|
||||
// we need to throw an error, which will be caught in the device-state
|
||||
// engine, and ensure that we wait a bit lnger
|
||||
const minDelay = Math.min(
|
||||
2 ** this.imageFetchFailures[image.name] * constants.backoffIncrement,
|
||||
this.appUpdatePollInterval,
|
||||
);
|
||||
const timeSinceLastError = process.hrtime(
|
||||
this.imageFetchLastFailureTime[image.name],
|
||||
);
|
||||
const timeSinceLastErrorMs =
|
||||
timeSinceLastError[0] * 1000 + timeSinceLastError[1] / 1e6;
|
||||
if (timeSinceLastErrorMs < minDelay) {
|
||||
throw new ImageDownloadBackoffError();
|
||||
}
|
||||
}
|
||||
|
||||
const onProgress = (progress: FetchProgressEvent) => {
|
||||
// Only report the percentage if we haven't finished fetching
|
||||
if (this.volatileState[image.imageId] != null) {
|
||||
@ -108,6 +135,13 @@ export class Images extends (EventEmitter as {
|
||||
return null;
|
||||
} catch (e) {
|
||||
if (!NotFoundError(e)) {
|
||||
if (!(e instanceof ImageDownloadBackoffError)) {
|
||||
this.imageFetchLastFailureTime[image.name] = process.hrtime();
|
||||
this.imageFetchFailures[image.name] =
|
||||
this.imageFetchFailures[image.name] != null
|
||||
? this.imageFetchFailures[image.name] + 1
|
||||
: 1;
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
this.reportChange(
|
||||
@ -130,6 +164,8 @@ export class Images extends (EventEmitter as {
|
||||
|
||||
this.logger.logSystemEvent(LogTypes.downloadImageSuccess, { image });
|
||||
success = true;
|
||||
delete this.imageFetchFailures[image.name];
|
||||
delete this.imageFetchLastFailureTime[image.name];
|
||||
} catch (err) {
|
||||
if (err instanceof DeltaStillProcessingError) {
|
||||
// If this is a delta image pull, and the delta still hasn't finished generating,
|
||||
|
@ -138,11 +138,11 @@ module.exports = class DeviceState extends EventEmitter
|
||||
@applications.on('change', @reportCurrentState)
|
||||
|
||||
healthcheck: =>
|
||||
@config.getMany([ 'appUpdatePollInterval', 'unmanaged' ])
|
||||
@config.getMany([ 'unmanaged' ])
|
||||
.then (conf) =>
|
||||
cycleTime = process.hrtime(@lastApplyStart)
|
||||
cycleTimeMs = cycleTime[0] * 1000 + cycleTime[1] / 1e6
|
||||
cycleTimeWithinInterval = cycleTimeMs - @applications.timeSpentFetching < 2 * conf.appUpdatePollInterval
|
||||
cycleTimeWithinInterval = cycleTimeMs - @applications.timeSpentFetching < 2 * @maxPollTime
|
||||
applyTargetHealthy = conf.unmanaged or !@applyInProgress or @applications.fetchesInProgress > 0 or cycleTimeWithinInterval
|
||||
return applyTargetHealthy
|
||||
|
||||
@ -249,13 +249,16 @@ module.exports = class DeviceState extends EventEmitter
|
||||
@logger.enable(changedConfig.loggingEnabled)
|
||||
if changedConfig.apiSecret?
|
||||
@reportCurrentState(api_secret: changedConfig.apiSecret)
|
||||
if changedConfig.appUpdatePollInterval?
|
||||
@maxPollTime = changedConfig.appUpdatePollInterval
|
||||
|
||||
@config.getMany([
|
||||
'initialConfigSaved', 'listenPort', 'apiSecret', 'osVersion', 'osVariant',
|
||||
'version', 'provisioned', 'apiEndpoint', 'connectivityCheckEnabled', 'legacyAppsPresent',
|
||||
'targetStateSet', 'unmanaged'
|
||||
'targetStateSet', 'unmanaged', 'appUpdatePollInterval'
|
||||
])
|
||||
.then (conf) =>
|
||||
@maxPollTime = conf.appUpdatePollInterval
|
||||
@applications.init()
|
||||
.then =>
|
||||
if !conf.initialConfigSaved
|
||||
@ -570,7 +573,7 @@ module.exports = class DeviceState extends EventEmitter
|
||||
if @scheduledApply?
|
||||
console.log("Updating failed, but there's another update scheduled immediately: ", err)
|
||||
else
|
||||
delay = Math.min((2 ** @failedUpdates) * 500, 30000)
|
||||
delay = Math.min((2 ** @failedUpdates) * constants.backoffIncrement, @maxPollTime)
|
||||
# If there was an error then schedule another attempt briefly in the future.
|
||||
console.log('Scheduling another update attempt due to failure: ', delay, err)
|
||||
@triggerApplyTarget({ force, delay, initial })
|
||||
|
@ -32,7 +32,7 @@ const constants = {
|
||||
proxyvisorHookReceiver: 'http://0.0.0.0:1337',
|
||||
configJsonNonAtomicPath: '/boot/config.json',
|
||||
defaultMixpanelToken: process.env.DEFAULT_MIXPANEL_TOKEN,
|
||||
supervisorNetworkInterface: supervisorNetworkInterface,
|
||||
supervisorNetworkInterface,
|
||||
allowedInterfaces: [
|
||||
'resin-vpn',
|
||||
'tun0',
|
||||
@ -50,6 +50,9 @@ const constants = {
|
||||
bootBlockDevice: '/dev/mmcblk0p1',
|
||||
hostConfigVarPrefix: 'HOST_',
|
||||
migrationBackupFile: 'backup.tgz',
|
||||
// Use this failure multiplied by 2**Number of failures to increase
|
||||
// the backoff on subsequent failures
|
||||
backoffIncrement: 500,
|
||||
};
|
||||
|
||||
if (process.env.DOCKER_HOST == null) {
|
||||
|
Loading…
Reference in New Issue
Block a user