mirror of
https://github.com/balena-os/balena-supervisor.git
synced 2025-01-29 15:44:13 +00:00
Add a /v1/healthy endpoint that fails if the supervisor is unhealthy, and a HEALTHCHECK command to the Dockerfile that uses it
We add an endpoint to the supervisor API that checks the following conditions to determine whether the supervisor is healthy:
* That the update cycle has run fully, in a time that's less than twice the poll interval, unless we're downloading an image, in which case we assume it's healthy (otherwise we'd get into the issue of determining a reasonable timeout for the image download, which is already done in a configurable way with delta options and the like).
* That the current state report to the Resin API hasn't failed more than 3 times, unless the device has no connectivity, or the connectivity check is disabled, in which case we don't know if the report failed simply because there's no network.
* That the gosuper component is working (since we periodically hit its API to get the IP addresses, we mark it as not working if this API call fails).

We need this endpoint to be unauthenticated for the docker daemon to be able to hit it (though, like the rest of the API, it is protected with iptables rules).

Change-Type: minor
Signed-off-by: Pablo Carranza Velez <pablo@resin.io>
This commit is contained in:
parent
29616a6c29
commit
95bbe6ea49
@ -209,4 +209,6 @@ ENV CONFIG_MOUNT_POINT=/boot/config.json \
|
||||
DEFAULT_PUBNUB_SUBSCRIBE_KEY=$DEFAULT_PUBNUB_SUBSCRIBE_KEY \
|
||||
DEFAULT_MIXPANEL_TOKEN=$DEFAULT_MIXPANEL_TOKEN
|
||||
|
||||
HEALTHCHECK CMD wget -qO- http://127.0.0.1:${LISTEN_PORT:-48484}/v1/healthy || exit 1
|
||||
|
||||
CMD [ "/sbin/init" ]
|
||||
|
27
docs/API.md
27
docs/API.md
@ -483,3 +483,30 @@ $ curl -X POST --header "Content-Type:application/json" \
|
||||
--data '{"deviceId": <deviceId>, "appId": <appId>, "method": "GET"}' \
|
||||
"https://api.resin.io/supervisor/v1/apps/<appId>"
|
||||
```
|
||||
|
||||
<hr>
|
||||
|
||||
### GET /v1/healthy
|
||||
|
||||
Added in supervisor v6.5.0.
|
||||
|
||||
Used internally to check whether the supervisor is running correctly, according to some heuristics that help determine
|
||||
whether the internal components, application updates and reporting to the Resin API are functioning.
|
||||
|
||||
Responds with a 200 status code (with `OK` as the response body) if the supervisor is healthy, or a 500 status code if something is not working
|
||||
correctly.
|
||||
|
||||
#### Examples:
|
||||
From the app on the device:
|
||||
```bash
|
||||
$ curl "$RESIN_SUPERVISOR_ADDRESS/v1/healthy"
|
||||
```
|
||||
(Responds with `OK` and a 200 status code when the supervisor is healthy)
|
||||
|
||||
Remotely via the API proxy:
|
||||
```bash
|
||||
$ curl -X POST --header "Content-Type:application/json" \
|
||||
--header "Authorization: Bearer <auth token>" \
|
||||
--data '{"deviceId": <deviceId>, "appId": <appId>, "method": "GET"}' \
|
||||
"https://api.resin.io/supervisor/v1/healthy"
|
||||
```
|
||||
|
@ -9,12 +9,7 @@ _ = require 'lodash'
|
||||
proxyvisor = require './proxyvisor'
|
||||
|
||||
module.exports = (application) ->
|
||||
api = express()
|
||||
unparsedRouter = express.Router()
|
||||
parsedRouter = express.Router()
|
||||
parsedRouter.use(bodyParser())
|
||||
|
||||
api.use (req, res, next) ->
|
||||
authenticate = (req, res, next) ->
|
||||
queryKey = req.query.apikey
|
||||
header = req.get('Authorization') ? ''
|
||||
match = header.match(/^ApiKey (\w+)$/)
|
||||
@ -33,6 +28,15 @@ module.exports = (application) ->
|
||||
# This should never happen...
|
||||
res.status(503).send('Invalid API key in supervisor')
|
||||
|
||||
api = express()
|
||||
unparsedRouter = express.Router()
|
||||
parsedRouter = express.Router()
|
||||
unauthenticatedRouter = express.Router()
|
||||
|
||||
parsedRouter.use(bodyParser())
|
||||
parsedRouter.use(authenticate)
|
||||
unparsedRouter.use(authenticate)
|
||||
|
||||
unparsedRouter.get '/ping', (req, res) ->
|
||||
res.send('OK')
|
||||
|
||||
@ -218,6 +222,20 @@ module.exports = (application) ->
|
||||
unparsedRouter.get '/v1/device', (req, res) ->
|
||||
res.json(device.getState())
|
||||
|
||||
# GET /v1/healthy: health probe registered on the router that has no
# `authenticate` middleware attached. Replies 200 when every check passes and
# 500 as soon as one of them fails.
unauthenticatedRouter.get '/v1/healthy', (req, res) ->
	# Checks are chained with `and`, so a later check only runs when all the
	# earlier ones passed:
	#  1. the update cycle has not hung (unless an image is being downloaded)
	#  2. if we're connected and we know it, the current state has been reported
	#  3. as far as we know, gosuper is healthy
	allChecksPass =
		application.healthy() and
		device.stateReportHealthy() and
		device.gosuperHealthy()

	res.sendStatus(if allChecksPass then 200 else 500)
|
||||
|
||||
api.use(unauthenticatedRouter)
|
||||
api.use(unparsedRouter)
|
||||
api.use(parsedRouter)
|
||||
api.use(proxyvisor.router)
|
||||
|
@ -7,7 +7,6 @@ knex = require './db'
|
||||
utils = require './utils'
|
||||
bootstrap = require './bootstrap'
|
||||
config = require './config'
|
||||
_ = require 'lodash'
|
||||
|
||||
knex.init.then ->
|
||||
utils.mixpanelTrack('Supervisor start')
|
||||
@ -55,11 +54,14 @@ knex.init.then ->
|
||||
updateIpAddr = ->
|
||||
utils.gosuper.getAsync('/v1/ipaddr', { json: true })
|
||||
.spread (response, body) ->
|
||||
if response.statusCode == 200 && body.Data.IPAddresses?
|
||||
device.updateState(
|
||||
ip_address: body.Data.IPAddresses.join(' ')
|
||||
)
|
||||
.catch(_.noop)
|
||||
if response.statusCode != 200 || !body.Data.IPAddresses?
|
||||
throw new Error('Invalid response from gosuper')
|
||||
device.updateState(
|
||||
ip_address: body.Data.IPAddresses.join(' ')
|
||||
)
|
||||
.catch ->
|
||||
device.reportUnhealthyGosuper()
|
||||
|
||||
console.log('Starting periodic check for IP addresses..')
|
||||
setInterval(updateIpAddr, 30 * 1000) # Every 30s
|
||||
updateIpAddr()
|
||||
|
@ -7,7 +7,7 @@ dockerUtils = require './docker-utils'
|
||||
Promise = require 'bluebird'
|
||||
utils = require './utils'
|
||||
logger = require './lib/logger'
|
||||
{ cachedResinApi, request } = require './request'
|
||||
{ cachedResinApi } = require './request'
|
||||
device = require './device'
|
||||
lockFile = Promise.promisifyAll(require('lockfile'))
|
||||
bootstrap = require './bootstrap'
|
||||
@ -20,6 +20,19 @@ osRelease = require './lib/os-release'
|
||||
deviceConfig = require './device-config'
|
||||
randomHexString = require './lib/random-hex-string'
|
||||
|
||||
# Numeric identifiers for the values that `updateStatus.state` can take.
[ UPDATE_IDLE, UPDATE_UPDATING, UPDATE_REQUIRED, UPDATE_SCHEDULED ] = [ 0, 1, 2, 3 ]

# Module-wide bookkeeping for the update loop.
updateStatus = {
	state: UPDATE_IDLE
	failed: 0
	forceNext: false
	intervalHandle: null
	# Seconds component of process.hrtime() captured when this module loads;
	# refreshed whenever an update cycle finishes in the idle state.
	lastFullUpdateCycle: process.hrtime()[0]
	# Set to true while an image download is in progress.
	currentlyDownloading: false
}
|
||||
|
||||
class UpdatesLockedError extends TypedError
|
||||
ImageNotFoundError = (err) ->
|
||||
return "#{err.statusCode}" is '404'
|
||||
@ -121,6 +134,10 @@ application = {}
|
||||
application.UpdatesLockedError = UpdatesLockedError
|
||||
application.localMode = false
|
||||
|
||||
# Tells whether the update loop looks alive.
# Returns a truthy value while an image download is in progress; otherwise
# returns true only if the last full update cycle completed no longer ago than
# twice config.apiPollInterval.
application.healthy = ->
	completedRecently = ->
		# process.hrtime()[0] is in whole seconds; convert the difference to ms.
		# NOTE(review): assumes config.apiPollInterval is in milliseconds - confirm.
		elapsedSeconds = process.hrtime()[0] - updateStatus.lastFullUpdateCycle
		return elapsedSeconds * 1000 <= config.apiPollInterval * 2

	# The deadline is only evaluated when no download is running.
	return updateStatus.currentlyDownloading or completedRecently()
|
||||
|
||||
application.logSystemMessage = logSystemMessage = (message, obj, eventName) ->
|
||||
logger.log({ m: message, s: 1 })
|
||||
utils.mixpanelTrack(eventName ? message, obj)
|
||||
@ -217,6 +234,7 @@ fetch = (app, { deltaSource, setDeviceUpdateState = true } = {}) ->
|
||||
|
||||
docker.getImage(app.imageId).inspect()
|
||||
.catch ImageNotFoundError, ->
|
||||
updateStatus.currentlyDownloading = true
|
||||
device.updateState(status: 'Downloading', download_progress: 0)
|
||||
|
||||
Promise.try ->
|
||||
@ -244,6 +262,8 @@ fetch = (app, { deltaSource, setDeviceUpdateState = true } = {}) ->
|
||||
.catch (err) ->
|
||||
logSystemEvent(logTypes.downloadAppError, app, err)
|
||||
throw err
|
||||
.finally ->
|
||||
updateStatus.currentlyDownloading = false
|
||||
|
||||
shouldMountKmod = (image) ->
|
||||
device.isResinOSv1().then (isV1) ->
|
||||
@ -576,17 +596,6 @@ waitToKill = (app, timeout) ->
|
||||
Promise.delay(pollInterval).then(retryCheck)
|
||||
retryCheck()
|
||||
|
||||
UPDATE_IDLE = 0
|
||||
UPDATE_UPDATING = 1
|
||||
UPDATE_REQUIRED = 2
|
||||
UPDATE_SCHEDULED = 3
|
||||
|
||||
updateStatus =
|
||||
state: UPDATE_IDLE
|
||||
failed: 0
|
||||
forceNext: false
|
||||
intervalHandle: null
|
||||
|
||||
updateStrategies =
|
||||
'download-then-kill': ({ localApp, app, needsDownload, force, deltaSource }) ->
|
||||
Promise.try ->
|
||||
@ -885,6 +894,7 @@ application.update = update = (force, scheduled = false) ->
|
||||
else
|
||||
updateStatus.state = UPDATE_IDLE
|
||||
device.updateState(status: 'Idle')
|
||||
updateStatus.lastFullUpdateCycle = process.hrtime()[0]
|
||||
return
|
||||
|
||||
sanitiseContainerName = (name) -> name.replace(/^\//, '')
|
||||
|
@ -185,6 +185,10 @@ do ->
|
||||
targetState = {}
|
||||
actualState = {}
|
||||
updateState = { update_pending: false, update_failed: false, update_downloaded: false }
|
||||
reportErrors = 0
|
||||
|
||||
# Tells whether state reporting should be considered healthy.
# It is only unhealthy when the connectivity check is enabled, the device is
# currently seen as connected, and more than 3 reports have failed (the counter
# is incremented on each failed report and zeroed on success).
exports.stateReportHealthy = ->
	# Without an active connectivity check, or without connectivity, a failed
	# report cannot be told apart from a missing network: treat it as healthy.
	return true unless utils.isConnectivityCheckEnabled()
	return true unless utils.connected()
	return not (reportErrors > 3)
|
||||
|
||||
getStateDiff = ->
|
||||
_.omitBy targetState, (value, key) ->
|
||||
@ -211,11 +215,13 @@ do ->
|
||||
apikey: apiKey
|
||||
.timeout(config.apiTimeout)
|
||||
.then ->
|
||||
reportErrors = 0
|
||||
# Update the actual state.
|
||||
_.merge(actualState, stateDiff)
|
||||
)
|
||||
.delay(APPLY_STATE_SUCCESS_DELAY)
|
||||
.catch (error) ->
|
||||
reportErrors += 1
|
||||
utils.mixpanelTrack('Device info update failure', { error, stateDiff })
|
||||
# Delay 5s before retrying a failed update
|
||||
Promise.delay(APPLY_STATE_RETRY_DELAY)
|
||||
@ -254,3 +260,10 @@ exports.isResinOSv1 = memoizePromise ->
|
||||
|
||||
exports.getOSVariant = memoizePromise ->
|
||||
osRelease.getOSVariant(config.hostOSVersionPath)
|
||||
|
||||
# gosuper health flag, kept private to this closure.
# It starts out healthy and is flipped to unhealthy by reportUnhealthyGosuper();
# nothing in this block ever switches it back.
do ->
	gosuperIsOk = true

	# Returns the current value of the flag.
	exports.gosuperHealthy = -> gosuperIsOk

	# Marks gosuper as unhealthy.
	exports.reportUnhealthyGosuper = -> gosuperIsOk = false
|
||||
|
@ -70,11 +70,14 @@ disableConnectivityCheck = false
|
||||
# options: An object of net.connect options, with the addition of:
|
||||
# timeout: 10s
|
||||
checkHost = (options) ->
|
||||
if disableConnectivityCheck or pauseConnectivityCheck
|
||||
if !isConnectivityCheckEnabled()
|
||||
return true
|
||||
else
|
||||
return networkCheck.checkHost(options)
|
||||
|
||||
# True only when the connectivity check is neither disabled nor paused.
exports.isConnectivityCheckEnabled = isConnectivityCheckEnabled = ->
	not (disableConnectivityCheck or pauseConnectivityCheck)
|
||||
|
||||
# Custom monitor that uses checkHost function above.
|
||||
customMonitor = (options, fn) ->
|
||||
networkCheck.monitor(checkHost, options, fn)
|
||||
@ -98,30 +101,36 @@ vpnStatusInotifyCallback = ->
|
||||
# Use the following to catch EEXIST errors
|
||||
EEXIST = (err) -> err.code is 'EEXIST'
|
||||
|
||||
exports.connectivityCheck = _.once ->
|
||||
if !config.apiEndpoint?
|
||||
console.log('No apiEndpoint specified, skipping connectivity check')
|
||||
return
|
||||
parsedUrl = url.parse(config.apiEndpoint)
|
||||
fs.mkdirAsync(config.vpnStatusPath)
|
||||
.catch EEXIST, (err) ->
|
||||
console.log('VPN status path exists.')
|
||||
.then ->
|
||||
fs.watch(config.vpnStatusPath, vpnStatusInotifyCallback)
|
||||
do ->
|
||||
_connected = true
|
||||
exports.connected = ->
|
||||
return _connected
|
||||
|
||||
# Manually trigger the call back to detect cases when VPN was switched on before the supervisor starts.
|
||||
vpnStatusInotifyCallback()
|
||||
customMonitor
|
||||
host: parsedUrl.hostname
|
||||
port: parsedUrl.port ? (if parsedUrl.protocol is 'https:' then 443 else 80)
|
||||
interval: 10 * 1000
|
||||
(connected) ->
|
||||
if connected
|
||||
console.log('Internet Connectivity: OK')
|
||||
blink.pattern.stop()
|
||||
else
|
||||
console.log('Waiting for connectivity...')
|
||||
blink.pattern.start(networkPattern)
|
||||
exports.connectivityCheck = _.once ->
|
||||
if !config.apiEndpoint?
|
||||
console.log('No apiEndpoint specified, skipping connectivity check')
|
||||
return
|
||||
parsedUrl = url.parse(config.apiEndpoint)
|
||||
fs.mkdirAsync(config.vpnStatusPath)
|
||||
.catch EEXIST, (err) ->
|
||||
console.log('VPN status path exists.')
|
||||
.then ->
|
||||
fs.watch(config.vpnStatusPath, vpnStatusInotifyCallback)
|
||||
|
||||
# Manually trigger the call back to detect cases when VPN was switched on before the supervisor starts.
|
||||
vpnStatusInotifyCallback()
|
||||
customMonitor
|
||||
host: parsedUrl.hostname
|
||||
port: parsedUrl.port ? (if parsedUrl.protocol is 'https:' then 443 else 80)
|
||||
interval: 10 * 1000
|
||||
(connected) ->
|
||||
_connected = connected
|
||||
if connected
|
||||
console.log('Internet Connectivity: OK')
|
||||
blink.pattern.stop()
|
||||
else
|
||||
console.log('Waiting for connectivity...')
|
||||
blink.pattern.start(networkPattern)
|
||||
|
||||
|
||||
secretPromises = {}
|
||||
|
Loading…
x
Reference in New Issue
Block a user