diff --git a/Dockerfile b/Dockerfile index ff65cf09..6a9d225d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -209,4 +209,6 @@ ENV CONFIG_MOUNT_POINT=/boot/config.json \ DEFAULT_PUBNUB_SUBSCRIBE_KEY=$DEFAULT_PUBNUB_SUBSCRIBE_KEY \ DEFAULT_MIXPANEL_TOKEN=$DEFAULT_MIXPANEL_TOKEN +HEALTHCHECK CMD wget -qO- http://127.0.0.1:${LISTEN_PORT:-48484}/v1/healthy || exit 1 + CMD [ "/sbin/init" ] diff --git a/docs/API.md b/docs/API.md index 22138682..15631b62 100644 --- a/docs/API.md +++ b/docs/API.md @@ -483,3 +483,30 @@ $ curl -X POST --header "Content-Type:application/json" \ --data '{"deviceId": , "appId": , "method": "GET"}' \ "https://api.resin.io/supervisor/v1/apps/" ``` + +
+ +### GET /v1/healthy + +Added in supervisor v6.5.0. + +Used internally to check whether the supervisor is running correctly, according to some heuristics that help determine +whether the internal components, application updates and reporting to the Resin API are functioning. + +Responds with an empty 200 response if the supervisor is healthy, or a 500 status code if something is not working +correctly. + +#### Examples: +From the app on the device: +```bash +$ curl "$RESIN_SUPERVISOR_ADDRESS/v1/healthy" +``` +(Empty response) + +Remotely via the API proxy: +```bash +$ curl -X POST --header "Content-Type:application/json" \ + --header "Authorization: Bearer <auth token>" \ + --data '{"deviceId": <deviceId>, "appId": <appId>, "method": "GET"}' \ + "https://api.resin.io/supervisor/v1/healthy" +``` diff --git a/src/api.coffee b/src/api.coffee index ccad1e00..7810304e 100644 --- a/src/api.coffee +++ b/src/api.coffee @@ -9,12 +9,7 @@ _ = require 'lodash' proxyvisor = require './proxyvisor' module.exports = (application) -> - api = express() - unparsedRouter = express.Router() - parsedRouter = express.Router() - parsedRouter.use(bodyParser()) - - api.use (req, res, next) -> + authenticate = (req, res, next) -> queryKey = req.query.apikey header = req.get('Authorization') ? '' match = header.match(/^ApiKey (\w+)$/) @@ -33,6 +28,15 @@ module.exports = (application) -> # This should never happen... res.status(503).send('Invalid API key in supervisor') + api = express() + unparsedRouter = express.Router() + parsedRouter = express.Router() + unauthenticatedRouter = express.Router() + + parsedRouter.use(bodyParser()) + parsedRouter.use(authenticate) + unparsedRouter.use(authenticate) + unparsedRouter.get '/ping', (req, res) -> res.send('OK') @@ -218,6 +222,20 @@ module.exports = (application) -> unparsedRouter.get '/v1/device', (req, res) -> res.json(device.getState()) + unauthenticatedRouter.get '/v1/healthy', (req, res) -> + # Has the update cycle not hung? 
(unless we're downloading an image) + healthy = application.healthy() + # If we're connected and we know it, has the current state been reported? + healthy and= device.stateReportHealthy() + # As far as we know, is gosuper healthy? + healthy and= device.gosuperHealthy() + + if healthy + res.sendStatus(200) + else + res.sendStatus(500) + + api.use(unauthenticatedRouter) api.use(unparsedRouter) api.use(parsedRouter) api.use(proxyvisor.router) diff --git a/src/app.coffee b/src/app.coffee index ce00369d..875e47a7 100644 --- a/src/app.coffee +++ b/src/app.coffee @@ -7,7 +7,6 @@ knex = require './db' utils = require './utils' bootstrap = require './bootstrap' config = require './config' -_ = require 'lodash' knex.init.then -> utils.mixpanelTrack('Supervisor start') @@ -55,11 +54,14 @@ knex.init.then -> updateIpAddr = -> utils.gosuper.getAsync('/v1/ipaddr', { json: true }) .spread (response, body) -> - if response.statusCode == 200 && body.Data.IPAddresses? - device.updateState( - ip_address: body.Data.IPAddresses.join(' ') - ) - .catch(_.noop) + if response.statusCode != 200 || !body.Data.IPAddresses? 
+ throw new Error('Invalid response from gosuper') + device.updateState( + ip_address: body.Data.IPAddresses.join(' ') + ) + .catch -> + device.reportUnhealthyGosuper() + console.log('Starting periodic check for IP addresses..') setInterval(updateIpAddr, 30 * 1000) # Every 30s updateIpAddr() diff --git a/src/application.coffee b/src/application.coffee index 8304a272..6d1eb198 100644 --- a/src/application.coffee +++ b/src/application.coffee @@ -7,7 +7,7 @@ dockerUtils = require './docker-utils' Promise = require 'bluebird' utils = require './utils' logger = require './lib/logger' -{ cachedResinApi, request } = require './request' +{ cachedResinApi } = require './request' device = require './device' lockFile = Promise.promisifyAll(require('lockfile')) bootstrap = require './bootstrap' @@ -20,6 +20,19 @@ osRelease = require './lib/os-release' deviceConfig = require './device-config' randomHexString = require './lib/random-hex-string' +UPDATE_IDLE = 0 +UPDATE_UPDATING = 1 +UPDATE_REQUIRED = 2 +UPDATE_SCHEDULED = 3 + +updateStatus = + state: UPDATE_IDLE + failed: 0 + forceNext: false + intervalHandle: null + lastFullUpdateCycle: process.hrtime()[0] + currentlyDownloading: false + class UpdatesLockedError extends TypedError ImageNotFoundError = (err) -> return "#{err.statusCode}" is '404' @@ -121,6 +134,10 @@ application = {} application.UpdatesLockedError = UpdatesLockedError application.localMode = false +application.healthy = -> + timeSinceLastCycle = (process.hrtime()[0] - updateStatus.lastFullUpdateCycle) * 1000 + return updateStatus.currentlyDownloading or timeSinceLastCycle <= 2 * config.apiPollInterval + application.logSystemMessage = logSystemMessage = (message, obj, eventName) -> logger.log({ m: message, s: 1 }) utils.mixpanelTrack(eventName ? 
message, obj) @@ -217,6 +234,7 @@ fetch = (app, { deltaSource, setDeviceUpdateState = true } = {}) -> docker.getImage(app.imageId).inspect() .catch ImageNotFoundError, -> + updateStatus.currentlyDownloading = true device.updateState(status: 'Downloading', download_progress: 0) Promise.try -> @@ -244,6 +262,8 @@ fetch = (app, { deltaSource, setDeviceUpdateState = true } = {}) -> .catch (err) -> logSystemEvent(logTypes.downloadAppError, app, err) throw err + .finally -> + updateStatus.currentlyDownloading = false shouldMountKmod = (image) -> device.isResinOSv1().then (isV1) -> @@ -576,17 +596,6 @@ waitToKill = (app, timeout) -> Promise.delay(pollInterval).then(retryCheck) retryCheck() -UPDATE_IDLE = 0 -UPDATE_UPDATING = 1 -UPDATE_REQUIRED = 2 -UPDATE_SCHEDULED = 3 - -updateStatus = - state: UPDATE_IDLE - failed: 0 - forceNext: false - intervalHandle: null - updateStrategies = 'download-then-kill': ({ localApp, app, needsDownload, force, deltaSource }) -> Promise.try -> @@ -885,6 +894,7 @@ application.update = update = (force, scheduled = false) -> else updateStatus.state = UPDATE_IDLE device.updateState(status: 'Idle') + updateStatus.lastFullUpdateCycle = process.hrtime()[0] return sanitiseContainerName = (name) -> name.replace(/^\//, '') diff --git a/src/device.coffee b/src/device.coffee index 17851b82..dd3d5b5b 100644 --- a/src/device.coffee +++ b/src/device.coffee @@ -185,6 +185,10 @@ do -> targetState = {} actualState = {} updateState = { update_pending: false, update_failed: false, update_downloaded: false } + reportErrors = 0 + + exports.stateReportHealthy = -> + return !(utils.isConnectivityCheckEnabled() and utils.connected() and reportErrors > 3) getStateDiff = -> _.omitBy targetState, (value, key) -> @@ -211,11 +215,13 @@ do -> apikey: apiKey .timeout(config.apiTimeout) .then -> + reportErrors = 0 # Update the actual state. 
_.merge(actualState, stateDiff) ) .delay(APPLY_STATE_SUCCESS_DELAY) .catch (error) -> + reportErrors += 1 utils.mixpanelTrack('Device info update failure', { error, stateDiff }) # Delay 5s before retrying a failed update Promise.delay(APPLY_STATE_RETRY_DELAY) @@ -254,3 +260,10 @@ exports.isResinOSv1 = memoizePromise -> exports.getOSVariant = memoizePromise -> osRelease.getOSVariant(config.hostOSVersionPath) + +do -> + _gosuperHealthy = true + exports.gosuperHealthy = -> + return _gosuperHealthy + exports.reportUnhealthyGosuper = -> + _gosuperHealthy = false diff --git a/src/utils.coffee b/src/utils.coffee index 69448132..a9fbc98c 100644 --- a/src/utils.coffee +++ b/src/utils.coffee @@ -70,11 +70,14 @@ disableConnectivityCheck = false # options: An object of net.connect options, with the addition of: # timeout: 10s checkHost = (options) -> - if disableConnectivityCheck or pauseConnectivityCheck + if !isConnectivityCheckEnabled() return true else return networkCheck.checkHost(options) +exports.isConnectivityCheckEnabled = isConnectivityCheckEnabled = -> + return !disableConnectivityCheck and !pauseConnectivityCheck + # Custom monitor that uses checkHost function above. customMonitor = (options, fn) -> networkCheck.monitor(checkHost, options, fn) @@ -98,30 +101,36 @@ vpnStatusInotifyCallback = -> # Use the following to catch EEXIST errors EEXIST = (err) -> err.code is 'EEXIST' -exports.connectivityCheck = _.once -> - if !config.apiEndpoint? - console.log('No apiEndpoint specified, skipping connectivity check') - return - parsedUrl = url.parse(config.apiEndpoint) - fs.mkdirAsync(config.vpnStatusPath) - .catch EEXIST, (err) -> - console.log('VPN status path exists.') - .then -> - fs.watch(config.vpnStatusPath, vpnStatusInotifyCallback) +do -> + _connected = true + exports.connected = -> + return _connected - # Manually trigger the call back to detect cases when VPN was switched on before the supervisor starts. 
- vpnStatusInotifyCallback() - customMonitor - host: parsedUrl.hostname - port: parsedUrl.port ? (if parsedUrl.protocol is 'https:' then 443 else 80) - interval: 10 * 1000 - (connected) -> - if connected - console.log('Internet Connectivity: OK') - blink.pattern.stop() - else - console.log('Waiting for connectivity...') - blink.pattern.start(networkPattern) + exports.connectivityCheck = _.once -> + if !config.apiEndpoint? + console.log('No apiEndpoint specified, skipping connectivity check') + return + parsedUrl = url.parse(config.apiEndpoint) + fs.mkdirAsync(config.vpnStatusPath) + .catch EEXIST, (err) -> + console.log('VPN status path exists.') + .then -> + fs.watch(config.vpnStatusPath, vpnStatusInotifyCallback) + + # Manually trigger the call back to detect cases when VPN was switched on before the supervisor starts. + vpnStatusInotifyCallback() + customMonitor + host: parsedUrl.hostname + port: parsedUrl.port ? (if parsedUrl.protocol is 'https:' then 443 else 80) + interval: 10 * 1000 + (connected) -> + _connected = connected + if connected + console.log('Internet Connectivity: OK') + blink.pattern.stop() + else + console.log('Waiting for connectivity...') + blink.pattern.start(networkPattern) secretPromises = {}