diff --git a/src/memory.ts b/src/memory.ts
new file mode 100644
index 00000000..8b43665d
--- /dev/null
+++ b/src/memory.ts
@@ -0,0 +1,70 @@
+import { memoryUsage } from 'process';
+
+import * as deviceState from './device-state';
+import log from './lib/supervisor-console';
+
+/**
+ * RSS of the Supervisor process, in bytes, recorded by the first healthcheck
+ * that gets past the startup and state-apply guards. 0 means "not measured
+ * yet". Exported so that tests can read and reset the baseline.
+ */
+export let initialMemory: number = 0;
+
+// Exported for tests only, as process.uptime cannot be stubbed
+export const processUptime = () => Math.floor(process.uptime());
+
+/**
+ * Formats a whole number of seconds as hours, minutes and seconds,
+ * e.g. 3661 -> `1h 1m 1s`.
+ */
+const secondsToHumanReadable = (seconds: number) => {
+	const hours = Math.floor(seconds / 3600);
+	const minutes = Math.floor((seconds - hours * 3600) / 60);
+	const secondsRemainder = seconds - hours * 3600 - minutes * 60;
+	return `${hours}h ${minutes}m ${secondsRemainder}s`;
+};
+
+// 15 MiB of allowed growth over the initial measurement
+const MEMORY_THRESHOLD_BYTES = 15 * 1024 * 1024;
+
+/**
+ * Returns false if Supervisor process memory usage (RSS) has grown by more
+ * than `thresholdBytes` over the initial measurement, otherwise returns true.
+ *
+ * @param thresholdBytes - allowed RSS growth in bytes
+ * @returns true while the check passes or cannot be evaluated yet
+ */
+export async function healthcheck(
+	thresholdBytes: number = MEMORY_THRESHOLD_BYTES,
+): Promise<boolean> {
+	// Measure initial memory after 20 seconds so that startup operations
+	// don't affect accuracy.
+	if (processUptime() < 20) {
+		return true;
+	}
+
+	// Pass healthcheck if state isn't settled as we only care about
+	// growing base memory usage instead of memory usage spikes.
+	if (deviceState.isApplyInProgress()) {
+		return true;
+	}
+
+	// Pass healthcheck while initial memory usage hasn't been measured
+	if (initialMemory === 0) {
+		initialMemory = memoryUsage.rss();
+		return true;
+	}
+
+	// Fail healthcheck if memory usage is above threshold
+	if (memoryUsage.rss() > initialMemory + thresholdBytes) {
+		log.info(
+			`Healthcheck failure - memory usage above threshold after ${secondsToHumanReadable(
+				processUptime(),
+			)}`,
+		);
+		return false;
+	}
+
+	// Pass healthcheck if memory usage is within threshold
+	return true;
+}
diff --git a/src/supervisor.ts b/src/supervisor.ts
index 708fdf11..fbe46fa9 100644
--- a/src/supervisor.ts
+++ b/src/supervisor.ts
@@ -7,6 +7,7 @@ import SupervisorAPI from './device-api';
 import * as v1 from './device-api/v1';
 import * as v2 from './device-api/v2';
 import logMonitor from './logging/monitor';
+import * as memory from './memory';
 
 import { initializeContractRequirements } from './lib/contracts';
 import { normaliseLegacyDatabase } from './lib/legacy';
@@ -70,7 +71,11 @@ export class Supervisor {
 		log.info('Starting API server');
 		this.api = new SupervisorAPI({
 			routers: [v1.router, v2.router],
-			healthchecks: [apiBinder.healthcheck, deviceState.healthcheck],
+			healthchecks: [
+				apiBinder.healthcheck,
+				deviceState.healthcheck,
+				memory.healthcheck,
+			],
 		});
 		deviceState.on('shutdown', () => this.api.stop());
 		return this.api.listen(conf.listenPort, conf.apiTimeout);
diff --git a/test/integration/supervisor.spec.ts b/test/integration/supervisor.spec.ts
index 8a9f43a4..7f0e7a43 100644
--- a/test/integration/supervisor.spec.ts
+++ b/test/integration/supervisor.spec.ts
@@ -1,10 +1,20 @@
 import request from 'supertest';
+import { expect } from 'chai';
 
 const BALENA_SUPERVISOR_ADDRESS =
 	process.env.BALENA_SUPERVISOR_ADDRESS || 'http://balena-supervisor:48484';
 
 describe('supervisor app', () => {
-	it('the supervisor app runs and the API responds with a healthy status', async () => {
-		await request(BALENA_SUPERVISOR_ADDRESS).get('/v1/healthy').expect(200);
+	it('the supervisor app runs and the API responds to /v1/healthy', async () => {
+		const { status } = await request(BALENA_SUPERVISOR_ADDRESS).get(
+			'/v1/healthy',
+		);
+
+		// There's a chance that the endpoint will respond with 500
+		// due to memory healthcheck failure, which we can't easily
+		// control as it's checking memory in the balena-supervisor
+		// container. The status alone can't tell us which healthcheck
+		// failed, so accept either 200 or 500 as proof the API responds.
+		expect(status).to.be.oneOf([200, 500]);
 	});
 });
diff --git a/test/unit/memory.spec.ts b/test/unit/memory.spec.ts
new file mode 100644
index 00000000..a4ebffa8
--- /dev/null
+++ b/test/unit/memory.spec.ts
@@ -0,0 +1,76 @@
+import { expect } from 'chai';
+import type { SinonStub } from 'sinon';
+import { stub } from 'sinon';
+import * as process from 'process';
+
+import * as memory from '~/src/memory';
+import * as deviceState from '~/src/device-state';
+import log from '~/lib/supervisor-console';
+
+describe('memory.healthcheck', () => {
+	let uptimeStub: SinonStub;
+	let rssStub: SinonStub;
+	let isApplyInProgressStub: SinonStub;
+
+	beforeEach(() => {
+		uptimeStub = stub(memory, 'processUptime').returns(20);
+		rssStub = stub(process.memoryUsage, 'rss').returns(100);
+		isApplyInProgressStub = stub(deviceState, 'isApplyInProgress').returns(
+			false,
+		);
+	});
+
+	afterEach(() => {
+		uptimeStub.restore();
+		rssStub.restore();
+		isApplyInProgressStub.restore();
+	});
+
+	it('passes healthcheck if process has not been running for 20s', async () => {
+		// @ts-expect-error - initialMemory is read-only
+		memory.initialMemory = 0;
+		uptimeStub.returns(19);
+
+		expect(await memory.healthcheck()).to.be.true;
+		// The baseline must not be captured during the startup window
+		expect(memory.initialMemory).to.equal(0);
+	});
+
+	it('passes healthcheck while initial memory not set and sets initial memory', async () => {
+		// @ts-expect-error - initialMemory is read-only
+		memory.initialMemory = 0;
+
+		expect(await memory.healthcheck()).to.be.true;
+		expect(memory.initialMemory).to.equal(100);
+	});
+
+	it('passes healthcheck while state apply in progress', async () => {
+		// @ts-expect-error - initialMemory is read-only
+		memory.initialMemory = 100;
+		isApplyInProgressStub.returns(true);
+		// Usage is above threshold, so only the apply guard can pass this
+		rssStub.returns(250);
+
+		expect(await memory.healthcheck(100)).to.be.true;
+	});
+
+	it('passes healthcheck if memory usage is below threshold', async () => {
+		// @ts-expect-error - initialMemory is read-only
+		memory.initialMemory = 100;
+		rssStub.returns(150);
+
+		expect(await memory.healthcheck(100)).to.be.true;
+	});
+
+	it('fails healthcheck if memory usage is above threshold', async () => {
+		// @ts-expect-error - initialMemory is read-only
+		memory.initialMemory = 100;
+		uptimeStub.returns(61);
+		rssStub.returns(250);
+
+		expect(await memory.healthcheck(100)).to.be.false;
+		expect(log.info).to.have.been.calledWith(
+			`Healthcheck failure - memory usage above threshold after 0h 1m 1s`,
+		);
+	});
+});