Add a /v1/healthy endpoint that fails if the supervisor is unhealthy, and a HEALTHCHECK command to the Dockerfile that uses it

We add an endpoint to the supervisor API that checks the following conditions to determine whether the supervisor is healthy:
* That the last full update cycle completed less than twice the poll interval ago, unless we're downloading an image, in which case
we assume it's healthy (otherwise we'd have to determine a reasonable timeout for the image download, which is already handled in a configurable way by the delta options and the like).
* That the current state report to the Resin API hasn't failed more than 3 times in a row, unless the device has no connectivity or the connectivity check is disabled, in which case we can't tell
whether the report failed simply because there's no network.
* That the gosuper component is working (since we periodically hit its API to get the IP addresses, we mark it as not working if this API call fails).

We need this endpoint to be unauthenticated so that the docker daemon can hit it (though, like the rest of the API, it is protected by iptables rules).

Change-Type: minor
Signed-off-by: Pablo Carranza Velez <pablo@resin.io>
This commit is contained in:
Pablo Carranza Velez 2017-11-29 14:38:18 -08:00
parent 29616a6c29
commit 95bbe6ea49
7 changed files with 129 additions and 48 deletions

View File

@ -209,4 +209,6 @@ ENV CONFIG_MOUNT_POINT=/boot/config.json \
DEFAULT_PUBNUB_SUBSCRIBE_KEY=$DEFAULT_PUBNUB_SUBSCRIBE_KEY \
DEFAULT_MIXPANEL_TOKEN=$DEFAULT_MIXPANEL_TOKEN
HEALTHCHECK CMD wget -qO- http://127.0.0.1:${LISTEN_PORT:-48484}/v1/healthy || exit 1
CMD [ "/sbin/init" ]

View File

@ -483,3 +483,30 @@ $ curl -X POST --header "Content-Type:application/json" \
--data '{"deviceId": <deviceId>, "appId": <appId>, "method": "GET"}' \
"https://api.resin.io/supervisor/v1/apps/<appId>"
```
<hr>
### GET /v1/healthy
Added in supervisor v6.5.0.
Used internally to check whether the supervisor is running correctly, according to some heuristics that help determine
whether the internal components, application updates and reporting to the Resin API are functioning.
Responds with a 200 status code if the supervisor is healthy, or a 500 status code if something is not working
correctly. Only the status code is meaningful.
#### Examples:
From the app on the device:
```bash
$ curl "$RESIN_SUPERVISOR_ADDRESS/v1/healthy"
```
(Only the status code is meaningful; the body is just the status text, e.g. `OK`)
Remotely via the API proxy:
```bash
$ curl -X POST --header "Content-Type:application/json" \
--header "Authorization: Bearer <auth token>" \
--data '{"deviceId": <deviceId>, "appId": <appId>, "method": "GET"}' \
"https://api.resin.io/supervisor/v1/healthy"
```

View File

@ -9,12 +9,7 @@ _ = require 'lodash'
proxyvisor = require './proxyvisor'
module.exports = (application) ->
api = express()
unparsedRouter = express.Router()
parsedRouter = express.Router()
parsedRouter.use(bodyParser())
api.use (req, res, next) ->
authenticate = (req, res, next) ->
queryKey = req.query.apikey
header = req.get('Authorization') ? ''
match = header.match(/^ApiKey (\w+)$/)
@ -33,6 +28,15 @@ module.exports = (application) ->
# This should never happen...
res.status(503).send('Invalid API key in supervisor')
api = express()
unparsedRouter = express.Router()
parsedRouter = express.Router()
unauthenticatedRouter = express.Router()
parsedRouter.use(bodyParser())
parsedRouter.use(authenticate)
unparsedRouter.use(authenticate)
unparsedRouter.get '/ping', (req, res) ->
res.send('OK')
@ -218,6 +222,20 @@ module.exports = (application) ->
unparsedRouter.get '/v1/device', (req, res) ->
res.json(device.getState())
# GET /v1/healthy
# Answers with a bare status code: 200 when every health check passes,
# 500 as soon as one of them fails. Checks run in order and stop at the
# first failure.
unauthenticatedRouter.get '/v1/healthy', (req, res) ->
	# Update cycle must not have hung (an in-progress image download counts as healthy)
	return res.sendStatus(500) if !application.healthy()
	# When we know we are connected, the current state must be getting reported
	return res.sendStatus(500) if !device.stateReportHealthy()
	# gosuper must not have been flagged as unhealthy
	return res.sendStatus(500) if !device.gosuperHealthy()
	res.sendStatus(200)
api.use(unauthenticatedRouter)
api.use(unparsedRouter)
api.use(parsedRouter)
api.use(proxyvisor.router)

View File

@ -7,7 +7,6 @@ knex = require './db'
utils = require './utils'
bootstrap = require './bootstrap'
config = require './config'
_ = require 'lodash'
knex.init.then ->
utils.mixpanelTrack('Supervisor start')
@ -55,11 +54,14 @@ knex.init.then ->
updateIpAddr = ->
utils.gosuper.getAsync('/v1/ipaddr', { json: true })
.spread (response, body) ->
if response.statusCode == 200 && body.Data.IPAddresses?
device.updateState(
ip_address: body.Data.IPAddresses.join(' ')
)
.catch(_.noop)
if response.statusCode != 200 || !body.Data.IPAddresses?
throw new Error('Invalid response from gosuper')
device.updateState(
ip_address: body.Data.IPAddresses.join(' ')
)
.catch ->
device.reportUnhealthyGosuper()
console.log('Starting periodic check for IP addresses..')
setInterval(updateIpAddr, 30 * 1000) # Every 30s
updateIpAddr()

View File

@ -7,7 +7,7 @@ dockerUtils = require './docker-utils'
Promise = require 'bluebird'
utils = require './utils'
logger = require './lib/logger'
{ cachedResinApi, request } = require './request'
{ cachedResinApi } = require './request'
device = require './device'
lockFile = Promise.promisifyAll(require('lockfile'))
bootstrap = require './bootstrap'
@ -20,6 +20,19 @@ osRelease = require './lib/os-release'
deviceConfig = require './device-config'
randomHexString = require './lib/random-hex-string'
# Update state codes; updateStatus.state starts out as UPDATE_IDLE.
UPDATE_IDLE = 0
UPDATE_UPDATING = 1
UPDATE_REQUIRED = 2
UPDATE_SCHEDULED = 3
# Module-level bookkeeping for the update cycle.
updateStatus =
state: UPDATE_IDLE
failed: 0
forceNext: false
intervalHandle: null
# Whole-seconds component of process.hrtime(), first captured at module load.
# application.healthy compares it against a later hrtime reading.
lastFullUpdateCycle: process.hrtime()[0]
# Starts false; application.healthy returns truthy whenever this is set.
currentlyDownloading: false
class UpdatesLockedError extends TypedError
ImageNotFoundError = (err) ->
return "#{err.statusCode}" is '404'
@ -121,6 +134,10 @@ application = {}
application.UpdatesLockedError = UpdatesLockedError
application.localMode = false
# Reports whether the update cycle looks alive.
# Returns a truthy value while an image download is in progress, or when the
# last full update cycle finished no more than two poll intervals ago.
application.healthy = ->
	{ currentlyDownloading, lastFullUpdateCycle } = updateStatus
	# hrtime()[0] is whole seconds; scale to milliseconds before comparing
	# (presumably config.apiPollInterval is in ms - confirm against config)
	elapsedMs = 1000 * (process.hrtime()[0] - lastFullUpdateCycle)
	return currentlyDownloading or elapsedMs <= 2 * config.apiPollInterval
application.logSystemMessage = logSystemMessage = (message, obj, eventName) ->
logger.log({ m: message, s: 1 })
utils.mixpanelTrack(eventName ? message, obj)
@ -217,6 +234,7 @@ fetch = (app, { deltaSource, setDeviceUpdateState = true } = {}) ->
docker.getImage(app.imageId).inspect()
.catch ImageNotFoundError, ->
updateStatus.currentlyDownloading = true
device.updateState(status: 'Downloading', download_progress: 0)
Promise.try ->
@ -244,6 +262,8 @@ fetch = (app, { deltaSource, setDeviceUpdateState = true } = {}) ->
.catch (err) ->
logSystemEvent(logTypes.downloadAppError, app, err)
throw err
.finally ->
updateStatus.currentlyDownloading = false
shouldMountKmod = (image) ->
device.isResinOSv1().then (isV1) ->
@ -576,17 +596,6 @@ waitToKill = (app, timeout) ->
Promise.delay(pollInterval).then(retryCheck)
retryCheck()
UPDATE_IDLE = 0
UPDATE_UPDATING = 1
UPDATE_REQUIRED = 2
UPDATE_SCHEDULED = 3
updateStatus =
state: UPDATE_IDLE
failed: 0
forceNext: false
intervalHandle: null
updateStrategies =
'download-then-kill': ({ localApp, app, needsDownload, force, deltaSource }) ->
Promise.try ->
@ -885,6 +894,7 @@ application.update = update = (force, scheduled = false) ->
else
updateStatus.state = UPDATE_IDLE
device.updateState(status: 'Idle')
updateStatus.lastFullUpdateCycle = process.hrtime()[0]
return
sanitiseContainerName = (name) -> name.replace(/^\//, '')

View File

@ -185,6 +185,10 @@ do ->
targetState = {}
actualState = {}
updateState = { update_pending: false, update_failed: false, update_downloaded: false }
reportErrors = 0
# State reporting counts as unhealthy only when all of these hold:
# the connectivity check is enabled, the device is reported as connected,
# and reportErrors has grown past 3. In every other case it returns true.
exports.stateReportHealthy = ->
	return true if !utils.isConnectivityCheckEnabled()
	return true if !utils.connected()
	return reportErrors <= 3
getStateDiff = ->
_.omitBy targetState, (value, key) ->
@ -211,11 +215,13 @@ do ->
apikey: apiKey
.timeout(config.apiTimeout)
.then ->
reportErrors = 0
# Update the actual state.
_.merge(actualState, stateDiff)
)
.delay(APPLY_STATE_SUCCESS_DELAY)
.catch (error) ->
reportErrors += 1
utils.mixpanelTrack('Device info update failure', { error, stateDiff })
# Delay 5s before retrying a failed update
Promise.delay(APPLY_STATE_RETRY_DELAY)
@ -254,3 +260,10 @@ exports.isResinOSv1 = memoizePromise ->
exports.getOSVariant = memoizePromise ->
osRelease.getOSVariant(config.hostOSVersionPath)
# Tracks whether gosuper is believed to be working.
# The flag lives in this closure and is only reachable through the two
# exported functions below.
# NOTE(review): nothing in this block ever sets the flag back to true, so a
# single reportUnhealthyGosuper() call appears to be permanent - confirm this is intended.
do ->
	isGosuperHealthy = true

	# Current value of the flag
	exports.gosuperHealthy = -> isGosuperHealthy

	# Marks gosuper as failing
	exports.reportUnhealthyGosuper = ->
		isGosuperHealthy = false

View File

@ -70,11 +70,14 @@ disableConnectivityCheck = false
# options: An object of net.connect options, with the addition of:
# timeout: 10s
checkHost = (options) ->
if disableConnectivityCheck or pauseConnectivityCheck
if !isConnectivityCheckEnabled()
return true
else
return networkCheck.checkHost(options)
# True when the connectivity check is neither disabled nor paused.
exports.isConnectivityCheckEnabled = isConnectivityCheckEnabled = ->
	return not (disableConnectivityCheck or pauseConnectivityCheck)
# Custom monitor that uses checkHost function above.
customMonitor = (options, fn) ->
networkCheck.monitor(checkHost, options, fn)
@ -98,30 +101,36 @@ vpnStatusInotifyCallback = ->
# Use the following to catch EEXIST errors
EEXIST = (err) -> err.code is 'EEXIST'
exports.connectivityCheck = _.once ->
if !config.apiEndpoint?
console.log('No apiEndpoint specified, skipping connectivity check')
return
parsedUrl = url.parse(config.apiEndpoint)
fs.mkdirAsync(config.vpnStatusPath)
.catch EEXIST, (err) ->
console.log('VPN status path exists.')
.then ->
fs.watch(config.vpnStatusPath, vpnStatusInotifyCallback)
do ->
_connected = true
exports.connected = ->
return _connected
# Manually trigger the call back to detect cases when VPN was switched on before the supervisor starts.
vpnStatusInotifyCallback()
customMonitor
host: parsedUrl.hostname
port: parsedUrl.port ? (if parsedUrl.protocol is 'https:' then 443 else 80)
interval: 10 * 1000
(connected) ->
if connected
console.log('Internet Connectivity: OK')
blink.pattern.stop()
else
console.log('Waiting for connectivity...')
blink.pattern.start(networkPattern)
exports.connectivityCheck = _.once ->
if !config.apiEndpoint?
console.log('No apiEndpoint specified, skipping connectivity check')
return
parsedUrl = url.parse(config.apiEndpoint)
fs.mkdirAsync(config.vpnStatusPath)
.catch EEXIST, (err) ->
console.log('VPN status path exists.')
.then ->
fs.watch(config.vpnStatusPath, vpnStatusInotifyCallback)
# Manually trigger the call back to detect cases when VPN was switched on before the supervisor starts.
vpnStatusInotifyCallback()
customMonitor
host: parsedUrl.hostname
port: parsedUrl.port ? (if parsedUrl.protocol is 'https:' then 443 else 80)
interval: 10 * 1000
(connected) ->
_connected = connected
if connected
console.log('Internet Connectivity: OK')
blink.pattern.stop()
else
console.log('Waiting for connectivity...')
blink.pattern.start(networkPattern)
secretPromises = {}