Improve the update strategies:

* On handover, fetch the old app from the DB before starting the new app (which overwrites the DB record)
* Tidy up the logging
* Fix waitToKill so that it actually works
* Several other fixups
This commit is contained in:
Pablo Carranza Vélez 2015-11-16 10:31:10 -08:00 committed by Pablo Carranza Velez
parent 1d6811a423
commit cbb3e2f461

View File

@ -13,6 +13,10 @@ logger = require './lib/logger'
device = require './device' device = require './device'
lockFile = Promise.promisifyAll(require('lockfile')) lockFile = Promise.promisifyAll(require('lockfile'))
bootstrap = require './bootstrap' bootstrap = require './bootstrap'
TypedError = require 'typed-error'
fs = Promise.promisifyAll(require('fs'))
class UpdatesLockedError extends TypedError
{ docker } = dockerUtils { docker } = dockerUtils
@ -82,7 +86,7 @@ logSystemEvent = (logType, app, error) ->
application = {} application = {}
application.kill = kill = (app) -> application.kill = kill = (app, updateDB = true) ->
logSystemEvent(logTypes.stopApp, app) logSystemEvent(logTypes.stopApp, app)
device.updateState(status: 'Stopping') device.updateState(status: 'Stopping')
container = docker.getContainer(app.containerId) container = docker.getContainer(app.containerId)
@ -110,8 +114,9 @@ application.kill = kill = (app) ->
lockFile.unlockAsync(lockPath(app)) lockFile.unlockAsync(lockPath(app))
.tap -> .tap ->
logSystemEvent(logTypes.stopAppSuccess, app) logSystemEvent(logTypes.stopAppSuccess, app)
app.containerId = null if updateDB == true
knex('app').update(app).where(appId: app.appId) app.containerId = null
knex('app').update(app).where(appId: app.appId)
.catch (err) -> .catch (err) ->
logSystemEvent(logTypes.stopAppError, app, err) logSystemEvent(logTypes.stopAppError, app, err)
throw err throw err
@ -274,9 +279,7 @@ application.lockUpdates = lockUpdates = do ->
.catch ENOENT, _.noop .catch ENOENT, _.noop
.catch (err) -> .catch (err) ->
release() release()
err = new Error("Updates are locked: #{err.message}") throw new UpdatesLockedError("Updates are locked: #{err.message}")
err.isLocked = true
throw err
.disposer (release) -> .disposer (release) ->
Promise.try -> Promise.try ->
lockFile.unlockAsync(lockName) if force != true lockFile.unlockAsync(lockName) if force != true
@ -331,27 +334,31 @@ wrapAsError = (err) ->
return err if _.isError(err) return err if _.isError(err)
return new Error(err.message ? err) return new Error(err.message ? err)
selectAndKill = (appId) -> select = (appId) ->
knex('app').select().where({ appId }) knex('app').select().where({ appId })
.then ([ app ]) -> .then ([ app ]) ->
if !app? if !app?
throw new Error('App not found') throw new Error('App not found')
kill(app) return app
# Wait for app to signal it's ready to die, or timeout to complete (if it is defined and not-empty) # Wait for app to signal it's ready to die, or timeout to complete.
# timeout defaults to 1 minute.
waitToKill = (app, timeout) -> waitToKill = (app, timeout) ->
startTime = Date.now() startTime = Date.now()
pollInterval = 100 pollInterval = 100
timeout = parseInt(timeout) timeout = parseInt(timeout)
timeout = 60000 if isNaN(timeout)
checkFileOrTimeout = -> checkFileOrTimeout = ->
fs.statAsync(killmePath(app)) fs.statAsync(killmePath(app))
.catch (err) -> .catch (err) ->
throw err if isNaN(timeout) or (Date.now() - startTime) < timeout throw err unless (Date.now() - startTime) > timeout
.then -> .then ->
fs.unlinkAsync(killmePath(app)).catch(_.noop) fs.unlinkAsync(killmePath(app)).catch(_.noop)
checkFileOrTimeout() retryCheck = ->
.catch -> checkFileOrTimeout()
Promise.delay(pollInterval).then(checkFileOrTimeout) .catch ->
Promise.delay(pollInterval).then(retryCheck)
retryCheck()
UPDATE_IDLE = 0 UPDATE_IDLE = 0
UPDATE_UPDATING = 1 UPDATE_UPDATING = 1
@ -364,49 +371,53 @@ updateStatus =
intervalHandle: null intervalHandle: null
updateStrategies = updateStrategies =
'normal-update': (localApp, app, needsDownload, force, timeout) -> 'download-then-kill': ({ localApp, app, needsDownload, force }) ->
Promise.try -> Promise.try ->
fetch(app) if needsDownload fetch(app) if needsDownload
.then -> .then ->
Promise.using lockUpdates(localApp, force), -> Promise.using lockUpdates(localApp, force), ->
logSystemEvent(logTypes.updateApp, app) if localApp.imageId == app.imageId logSystemEvent(logTypes.updateApp, app) if localApp.imageId == app.imageId
selectAndKill(localApp.appId) select(localApp.appId)
.then(kill)
.then -> .then ->
start(app) start(app)
.catch (err) -> .catch (err) ->
logSystemEvent(logTypes.updateAppError, app, err) unless err.isLocked? logSystemEvent(logTypes.updateAppError, app, err) unless err instanceof UpdatesLockedError
throw err throw err
'kill-before-download': (localApp, app, needsDownload, force, timeout) -> 'kill-then-download': ({ localApp, app, needsDownload, force }) ->
logSystemEvent(logTypes.updateApp, app) if localApp.imageId == app.imageId
Promise.using lockUpdates(localApp, force), -> Promise.using lockUpdates(localApp, force), ->
selectAndKill(localApp.appId) logSystemEvent(logTypes.updateApp, app) if localApp.imageId == app.imageId
select(localApp.appId)
.then(kill)
.then -> .then ->
fetch(app) if needsDownload fetch(app) if needsDownload
.then -> .then ->
start(app) start(app)
.catch (err) -> .catch (err) ->
logSystemEvent(logTypes.updateAppError, app, err) unless err.isLocked? logSystemEvent(logTypes.updateAppError, app, err) unless err instanceof UpdatesLockedError
throw err throw err
'hand-over': (localApp, app, needsDownload, force, timeout) -> 'hand-over': ({ localApp, app, needsDownload, force, timeout }) ->
Promise.using lockUpdates(localApp, force), -> Promise.using lockUpdates(localApp, force), ->
Promise.try -> select(localApp.appId)
fetch(app) if needsDownload .then (localApp) ->
.then -> Promise.try ->
logSystemEvent(logTypes.updateApp, app) if localApp.imageId == app.imageId fetch(app) if needsDownload
start(app) .then ->
.then -> logSystemEvent(logTypes.updateApp, app) if localApp.imageId == app.imageId
waitToKill(localApp, timeout) start(app)
.then -> .then ->
selectAndKill(localApp.appId) waitToKill(localApp, timeout)
.then ->
kill(localApp, false)
.catch (err) -> .catch (err) ->
logSystemEvent(logTypes.updateAppError, app, err) unless err.isLocked? logSystemEvent(logTypes.updateAppError, app, err) unless err instanceof UpdatesLockedError
throw err throw err
updateUsingStrategy = (strategy, localApp, app, needsDownload, force, timeout) -> updateUsingStrategy = (strategy, options) ->
if strategy not in _.keys(updateStrategies) if not _.has(updateStrategies, strategy)
strategy = 'normal-update' strategy = 'download-then-kill'
updateStrategies[strategy](localApp, app, needsDownload, force, timeout) updateStrategies[strategy](options)
getRemoteApps = (uuid, apiKey) -> getRemoteApps = (uuid, apiKey) ->
cachedResinApi.get cachedResinApi.get
@ -430,12 +441,11 @@ getEnvAndFormatRemoteApps = (deviceId, remoteApps, uuid, apiKey) ->
.then (environment) -> .then (environment) ->
app.environment_variable = environment app.environment_variable = environment
utils.extendEnvVars(app.environment_variable, uuid) utils.extendEnvVars(app.environment_variable, uuid)
.then (env) -> .then (fullEnv) ->
fullEnv = env env = _.omit(fullEnv, _.keys(specialActionEnvVars))
env = _.omit(env, _.keys(specialActionEnvVars))
return [ return [
{ {
appId: app.id appId: '' + app.id
env: fullEnv env: fullEnv
}, },
{ {
@ -445,6 +455,7 @@ getEnvAndFormatRemoteApps = (deviceId, remoteApps, uuid, apiKey) ->
env: JSON.stringify(env) # The env has to be stored as a JSON string for knex env: JSON.stringify(env) # The env has to be stored as a JSON string for knex
} }
] ]
.then(_.flatten)
.then(_.zip) .then(_.zip)
.then ([ remoteAppEnvs, remoteApps ]) -> .then ([ remoteAppEnvs, remoteApps ]) ->
return [_.mapValues(_.indexBy(remoteAppEnvs, 'appId'), 'env'), _.indexBy(remoteApps, 'appId')] return [_.mapValues(_.indexBy(remoteAppEnvs, 'appId'), 'env'), _.indexBy(remoteApps, 'appId')]
@ -476,6 +487,9 @@ compareForUpdate = (localApps, remoteApps, localAppEnvs, remoteAppEnvs) ->
allAppIds = _.union(localAppIds, remoteAppIds) allAppIds = _.union(localAppIds, remoteAppIds)
return { toBeRemoved, toBeDownloaded, toBeInstalled, toBeUpdated, appsWithChangedEnvs, allAppIds } return { toBeRemoved, toBeDownloaded, toBeInstalled, toBeUpdated, appsWithChangedEnvs, allAppIds }
getConfig = (key) ->
knex('config').select('value').where({ key }).get(0).get('value')
application.update = update = (force) -> application.update = update = (force) ->
if updateStatus.state isnt UPDATE_IDLE if updateStatus.state isnt UPDATE_IDLE
# Mark an update required after the current. # Mark an update required after the current.
@ -484,15 +498,7 @@ application.update = update = (force) ->
return return
updateStatus.state = UPDATE_UPDATING updateStatus.state = UPDATE_UPDATING
bootstrap.done.then -> bootstrap.done.then ->
Promise.all([ Promise.join getConfig('apiKey'), getConfig('uuid'), knex('app').select(), (apiKey, uuid, apps) ->
knex('config').select('value').where(key: 'apiKey')
knex('config').select('value').where(key: 'uuid')
knex('app').select()
])
.then ([ [ apiKey ], [ uuid ], apps ]) ->
apiKey = apiKey.value
uuid = uuid.value
deviceId = device.getID() deviceId = device.getID()
remoteApps = getRemoteApps(uuid, apiKey) remoteApps = getRemoteApps(uuid, apiKey)
@ -523,10 +529,11 @@ application.update = update = (force) ->
Promise.try -> Promise.try ->
needsDownload = _.includes(toBeDownloaded, appId) needsDownload = _.includes(toBeDownloaded, appId)
if _.includes(toBeRemoved, appId) if _.includes(toBeRemoved, appId)
Promise.using lockUpdates(apps[appId], force), -> Promise.using lockUpdates(localApps[appId], force), ->
# We get the app from the DB again in case someone restarted it # We get the app from the DB again in case someone restarted it
# (which would have changed its containerId) # (which would have changed its containerId)
selectAndKill(appId) select(appId)
.then(kill)
.then -> .then ->
knex('app').where('appId', appId).delete() knex('app').where('appId', appId).delete()
.catch (err) -> .catch (err) ->
@ -544,13 +551,22 @@ application.update = update = (force) ->
app = remoteApps[appId] app = remoteApps[appId]
# Restore the complete environment so that it's persisted in the DB # Restore the complete environment so that it's persisted in the DB
app.env = JSON.stringify(remoteAppEnvs[appId]) app.env = JSON.stringify(remoteAppEnvs[appId])
forceThisApp = remoteAppEnvs[appId]['RESIN_SUPERVISOR_OVERRIDE_LOCK'] == '1' || remoteAppEnvs[appId]['RESIN_OVERRIDE_LOCK'] == '1' forceThisApp =
remoteAppEnvs[appId]['RESIN_SUPERVISOR_OVERRIDE_LOCK'] == '1' ||
remoteAppEnvs[appId]['RESIN_OVERRIDE_LOCK'] == '1'
strategy = remoteAppEnvs[appId]['RESIN_SUPERVISOR_UPDATE_STRATEGY'] strategy = remoteAppEnvs[appId]['RESIN_SUPERVISOR_UPDATE_STRATEGY']
timeout = remoteAppEnvs[appId]['RESIN_SUPERVISOR_HANDOVER_TIMEOUT'] timeout = remoteAppEnvs[appId]['RESIN_SUPERVISOR_HANDOVER_TIMEOUT']
updateUsingStrategy(strategy, apps[appId], app, needsDownload, force || forceThisApp, timeout) updateUsingStrategy strategy, {
localApp: localApps[appId]
app
needsDownload
force: force || forceThisApp
timeout
}
.catch(wrapAsError) .catch(wrapAsError)
.filter(_.isError) .filter(_.isError)
.then (failures) -> .then (failures) ->
_.each(failures, (err) -> console.error('Error:', err, err.stack))
throw new Error(joinErrorMessages(failures)) if failures.length > 0 throw new Error(joinErrorMessages(failures)) if failures.length > 0
.then -> .then ->
updateStatus.failed = 0 updateStatus.failed = 0