mirror of
https://github.com/balena-os/balena-supervisor.git
synced 2024-12-21 22:47:49 +00:00
Merge pull request #2216 from balena-os/memory-healthcheck
Run memory healthcheck for Supervisor process
This commit is contained in:
commit
7b7305a0ab
58
src/memory.ts
Normal file
58
src/memory.ts
Normal file
@ -0,0 +1,58 @@
|
||||
import { memoryUsage } from 'process';
|
||||
|
||||
import * as deviceState from './device-state';
|
||||
import log from './lib/supervisor-console';
|
||||
|
||||
// RSS baseline recorded by healthcheck(); 0 means it has not been measured yet.
// Kept as a mutable export: healthcheck() assigns it once, and the unit tests
// assign to it directly to set up each case.
export let initialMemory: number = 0;
|
||||
|
||||
// Process uptime rounded down to whole seconds.
// Exported for tests only, as process.uptime cannot be stubbed.
export const processUptime = () => {
	const uptimeSeconds = process.uptime();
	return Math.floor(uptimeSeconds);
};
|
||||
|
||||
// Formats a duration given in seconds as `<hours>h <minutes>m <seconds>s`.
// Hours are not capped, so durations over a day keep counting in hours.
const secondsToHumanReadable = (seconds: number) => {
	const SECONDS_PER_HOUR = 3600;
	const SECONDS_PER_MINUTE = 60;

	const hours = Math.floor(seconds / SECONDS_PER_HOUR);
	const afterHours = seconds - hours * SECONDS_PER_HOUR;
	const minutes = Math.floor(afterHours / SECONDS_PER_MINUTE);
	const secondsRemainder = afterHours - minutes * SECONDS_PER_MINUTE;

	return `${hours}h ${minutes}m ${secondsRemainder}s`;
};
|
||||
|
||||
// 15 MiB (15 * 1024 * 1024 bytes)
|
||||
// Default for healthcheck()'s `thresholdBytes` parameter, i.e. how far RSS
// may grow above the recorded baseline before the healthcheck fails.
const MEMORY_THRESHOLD_BYTES = 15 * 1024 * 1024;
|
||||
|
||||
/**
 * Memory healthcheck for the Supervisor process.
 *
 * Resolves to false only when the current RSS exceeds the recorded
 * baseline (`initialMemory`) by more than `thresholdBytes`. In every
 * other situation (process too young, state apply in progress, baseline
 * not yet recorded, usage within threshold) it resolves to true.
 *
 * @param thresholdBytes - allowed RSS growth over the baseline
 * @returns true if the healthcheck passes, false otherwise
 */
export async function healthcheck(
	thresholdBytes: number = MEMORY_THRESHOLD_BYTES,
): Promise<boolean> {
	// The baseline is only measured once the process has been up for 20
	// seconds so that startup operations don't affect accuracy. While state
	// is not settled we also pass, as we only care about growing base memory
	// usage instead of memory usage spikes.
	if (processUptime() < 20 || deviceState.isApplyInProgress()) {
		return true;
	}

	const currentRss = memoryUsage.rss();

	// First settled run: record the baseline and pass.
	if (initialMemory === 0) {
		initialMemory = currentRss;
		return true;
	}

	// Fail only when usage has grown past baseline + threshold.
	const aboveThreshold = currentRss > initialMemory + thresholdBytes;
	if (aboveThreshold) {
		const uptime = secondsToHumanReadable(processUptime());
		log.info(
			`Healthcheck failure - memory usage above threshold after ${uptime}`,
		);
	}

	return !aboveThreshold;
}
|
@ -7,6 +7,7 @@ import SupervisorAPI from './device-api';
|
||||
import * as v1 from './device-api/v1';
|
||||
import * as v2 from './device-api/v2';
|
||||
import logMonitor from './logging/monitor';
|
||||
import * as memory from './memory';
|
||||
|
||||
import { initializeContractRequirements } from './lib/contracts';
|
||||
import { normaliseLegacyDatabase } from './lib/legacy';
|
||||
@ -70,7 +71,11 @@ export class Supervisor {
|
||||
log.info('Starting API server');
|
||||
this.api = new SupervisorAPI({
|
||||
routers: [v1.router, v2.router],
|
||||
healthchecks: [apiBinder.healthcheck, deviceState.healthcheck],
|
||||
healthchecks: [
|
||||
apiBinder.healthcheck,
|
||||
deviceState.healthcheck,
|
||||
memory.healthcheck,
|
||||
],
|
||||
});
|
||||
deviceState.on('shutdown', () => this.api.stop());
|
||||
return this.api.listen(conf.listenPort, conf.apiTimeout);
|
||||
|
@ -1,10 +1,20 @@
|
||||
import request from 'supertest';
|
||||
import { expect } from 'chai';
|
||||
|
||||
const BALENA_SUPERVISOR_ADDRESS =
|
||||
process.env.BALENA_SUPERVISOR_ADDRESS || 'http://balena-supervisor:48484';
|
||||
|
||||
describe('supervisor app', () => {
|
||||
it('the supervisor app runs and the API responds with a healthy status', async () => {
|
||||
await request(BALENA_SUPERVISOR_ADDRESS).get('/v1/healthy').expect(200);
|
||||
it('the supervisor app runs and the API responds to /v1/healthy', async () => {
|
||||
await request(BALENA_SUPERVISOR_ADDRESS)
|
||||
.get('/v1/healthy')
|
||||
.then(({ status }) => {
|
||||
// There's a chance that the endpoint will respond with 500
|
||||
// due to memory healthcheck failure, which we can't easily
|
||||
// control as it's checking memory in the balena-supervisor
|
||||
// container. So in this case, just check that the API responded
// with either a healthy (200) or an unhealthy (500) status.
|
||||
expect(status).to.be.oneOf([200, 500]);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
72
test/unit/memory.spec.ts
Normal file
72
test/unit/memory.spec.ts
Normal file
@ -0,0 +1,72 @@
|
||||
import { expect } from 'chai';
|
||||
import type { SinonStub } from 'sinon';
|
||||
import { stub } from 'sinon';
|
||||
import * as process from 'process';
|
||||
|
||||
import * as memory from '~/src/memory';
|
||||
import * as deviceState from '~/src/device-state';
|
||||
import log from '~/lib/supervisor-console';
|
||||
|
||||
describe('memory.healthcheck', () => {
	let uptimeStub: SinonStub;
	let rssStub: SinonStub;
	let isApplyInProgressStub: SinonStub;

	// Overwrites the baseline recorded by the memory module so that each
	// case starts from a known state.
	const setInitialMemory = (bytes: number) => {
		// @ts-expect-error - initialMemory is read-only
		memory.initialMemory = bytes;
	};

	// Defaults: process is 20s old, RSS is 100 and no state apply is running.
	beforeEach(() => {
		uptimeStub = stub(memory, 'processUptime').returns(20);
		rssStub = stub(process.memoryUsage, 'rss').returns(100);
		isApplyInProgressStub = stub(deviceState, 'isApplyInProgress').returns(
			false,
		);
	});

	afterEach(() => {
		for (const stubbed of [uptimeStub, rssStub, isApplyInProgressStub]) {
			stubbed.restore();
		}
	});

	it('passes healthcheck if process has not been running for 20s', async () => {
		setInitialMemory(0);
		uptimeStub.returns(19);

		const healthy = await memory.healthcheck();
		expect(healthy).to.be.true;
	});

	it('passes healthcheck while initial memory not set and sets initial memory', async () => {
		setInitialMemory(0);

		const healthy = await memory.healthcheck();
		expect(healthy).to.be.true;
		expect(memory.initialMemory).to.equal(100);
	});

	it('passes healthcheck while state apply in progress', async () => {
		setInitialMemory(100);
		isApplyInProgressStub.returns(true);

		const healthy = await memory.healthcheck();
		expect(healthy).to.be.true;
	});

	it('passes healthcheck if memory usage is below threshold', async () => {
		setInitialMemory(100);
		rssStub.returns(150);

		const healthy = await memory.healthcheck(100);
		expect(healthy).to.be.true;
	});

	it('fails healthcheck if memory usage is above threshold', async () => {
		setInitialMemory(100);
		uptimeStub.returns(61);
		rssStub.returns(250);

		const healthy = await memory.healthcheck(100);
		expect(healthy).to.be.false;
		expect(log.info).to.have.been.calledWith(
			`Healthcheck failure - memory usage above threshold after 0h 1m 1s`,
		);
	});
});
|
Loading…
Reference in New Issue
Block a user