mirror of
https://github.com/balena-os/balena-supervisor.git
synced 2024-12-19 05:37:53 +00:00
Merge pull request #2216 from balena-os/memory-healthcheck
Run memory healthcheck for Supervisor process
This commit is contained in:
commit
7b7305a0ab
58
src/memory.ts
Normal file
58
src/memory.ts
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
import { memoryUsage } from 'process';
|
||||||
|
|
||||||
|
import * as deviceState from './device-state';
|
||||||
|
import log from './lib/supervisor-console';
|
||||||
|
|
||||||
|
/**
 * Baseline resident set size, in bytes, against which later healthcheck
 * measurements are compared. A value of 0 means "not measured yet".
 */
export let initialMemory = 0;
|
||||||
|
|
||||||
|
// Exported for tests only, as process.uptime cannot be stubbed
|
||||||
|
/**
 * Whole seconds elapsed since the current Node.js process started.
 *
 * Kept as an exported `const` binding so that tests can replace it on the
 * module object.
 */
export const processUptime = (): number => {
	const elapsedSeconds = process.uptime();
	return Math.floor(elapsedSeconds);
};
|
||||||
|
|
||||||
|
/**
 * Formats a duration given in seconds as `<hours>h <minutes>m <seconds>s`.
 *
 * The input is normalised to a non-negative whole number of seconds first:
 * fractions are truncated, and negative or non-finite values are treated as
 * 0, so the output never contains decimals, minus signs or `NaN`.
 *
 * @param seconds - duration in seconds
 * @returns the formatted duration, e.g. `0h 1m 1s` for 61
 */
const secondsToHumanReadable = (seconds: number): string => {
	const totalSeconds = Number.isFinite(seconds)
		? Math.max(0, Math.floor(seconds))
		: 0;
	const hours = Math.floor(totalSeconds / 3600);
	const minutes = Math.floor((totalSeconds % 3600) / 60);
	const secondsRemainder = totalSeconds % 60;
	return `${hours}h ${minutes}m ${secondsRemainder}s`;
};
|
||||||
|
|
||||||
|
// 15 MiB
|
||||||
|
/** Default allowed growth in memory usage over the baseline, in bytes (15 MiB). */
const MEMORY_THRESHOLD_BYTES = 15 * 1024 ** 2;
|
||||||
|
|
||||||
|
/**
 * Memory healthcheck for the Supervisor process.
 *
 * Resolves to true (healthy) while the process has been up for less than
 * 20 seconds, while a state apply is in progress, and on the call that
 * records the baseline memory usage. After that it resolves to false only
 * when the current resident set size exceeds the baseline by more than
 * `thresholdBytes`.
 *
 * @param thresholdBytes - allowed growth over the baseline, in bytes
 * @returns false if memory usage is above the threshold, otherwise true
 */
export async function healthcheck(
	thresholdBytes: number = MEMORY_THRESHOLD_BYTES,
): Promise<boolean> {
	// Startup operations and in-flight state applies cause temporary spikes;
	// only growth of the base memory usage matters, so pass in both cases.
	const startingUp = processUptime() < 20;
	if (startingUp || deviceState.isApplyInProgress()) {
		return true;
	}

	// No baseline yet: record it now and pass this round.
	if (initialMemory === 0) {
		initialMemory = memoryUsage.rss();
		return true;
	}

	// Compare the current usage against the baseline plus the allowed growth.
	const currentRss = memoryUsage.rss();
	const ceiling = initialMemory + thresholdBytes;
	if (currentRss > ceiling) {
		const uptime = secondsToHumanReadable(processUptime());
		log.info(
			`Healthcheck failure - memory usage above threshold after ${uptime}`,
		);
		return false;
	}

	// Usage is within the allowed growth.
	return true;
}
|
@ -7,6 +7,7 @@ import SupervisorAPI from './device-api';
|
|||||||
import * as v1 from './device-api/v1';
|
import * as v1 from './device-api/v1';
|
||||||
import * as v2 from './device-api/v2';
|
import * as v2 from './device-api/v2';
|
||||||
import logMonitor from './logging/monitor';
|
import logMonitor from './logging/monitor';
|
||||||
|
import * as memory from './memory';
|
||||||
|
|
||||||
import { initializeContractRequirements } from './lib/contracts';
|
import { initializeContractRequirements } from './lib/contracts';
|
||||||
import { normaliseLegacyDatabase } from './lib/legacy';
|
import { normaliseLegacyDatabase } from './lib/legacy';
|
||||||
@ -70,7 +71,11 @@ export class Supervisor {
|
|||||||
log.info('Starting API server');
|
log.info('Starting API server');
|
||||||
this.api = new SupervisorAPI({
|
this.api = new SupervisorAPI({
|
||||||
routers: [v1.router, v2.router],
|
routers: [v1.router, v2.router],
|
||||||
healthchecks: [apiBinder.healthcheck, deviceState.healthcheck],
|
healthchecks: [
|
||||||
|
apiBinder.healthcheck,
|
||||||
|
deviceState.healthcheck,
|
||||||
|
memory.healthcheck,
|
||||||
|
],
|
||||||
});
|
});
|
||||||
deviceState.on('shutdown', () => this.api.stop());
|
deviceState.on('shutdown', () => this.api.stop());
|
||||||
return this.api.listen(conf.listenPort, conf.apiTimeout);
|
return this.api.listen(conf.listenPort, conf.apiTimeout);
|
||||||
|
@ -1,10 +1,20 @@
|
|||||||
import request from 'supertest';
|
import request from 'supertest';
|
||||||
|
import { expect } from 'chai';
|
||||||
|
|
||||||
const BALENA_SUPERVISOR_ADDRESS =
|
const BALENA_SUPERVISOR_ADDRESS =
|
||||||
process.env.BALENA_SUPERVISOR_ADDRESS || 'http://balena-supervisor:48484';
|
process.env.BALENA_SUPERVISOR_ADDRESS || 'http://balena-supervisor:48484';
|
||||||
|
|
||||||
describe('supervisor app', () => {
|
describe('supervisor app', () => {
|
||||||
it('the supervisor app runs and the API responds with a healthy status', async () => {
|
it('the supervisor app runs and the API responds to /v1/healthy', async () => {
|
||||||
await request(BALENA_SUPERVISOR_ADDRESS).get('/v1/healthy').expect(200);
|
await request(BALENA_SUPERVISOR_ADDRESS)
|
||||||
|
.get('/v1/healthy')
|
||||||
|
.then(({ status }) => {
|
||||||
|
// There's a chance that the endpoint will respond with 500
|
||||||
|
// due to memory healthcheck failure, which we can't easily
|
||||||
|
// control as it's checking memory in the balena-supervisor
|
||||||
|
// container. So accept either 200 or 500 here; note that this
|
||||||
|
// does not verify that a 500 was caused by the memory check.
|
||||||
|
expect(status).to.be.oneOf([200, 500]);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
72
test/unit/memory.spec.ts
Normal file
72
test/unit/memory.spec.ts
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
import { expect } from 'chai';
|
||||||
|
import type { SinonStub } from 'sinon';
|
||||||
|
import { stub } from 'sinon';
|
||||||
|
import * as process from 'process';
|
||||||
|
|
||||||
|
import * as memory from '~/src/memory';
|
||||||
|
import * as deviceState from '~/src/device-state';
|
||||||
|
import log from '~/lib/supervisor-console';
|
||||||
|
|
||||||
|
describe('memory.healthcheck', () => {
	let uptimeStub: SinonStub;
	let rssStub: SinonStub;
	let isApplyInProgressStub: SinonStub;

	// Default fixture: the process is just past the 20s startup window, the
	// reported RSS is 100 bytes and no state apply is in progress. Individual
	// tests override whichever of these they exercise.
	beforeEach(() => {
		uptimeStub = stub(memory, 'processUptime').returns(20);
		rssStub = stub(process.memoryUsage, 'rss').returns(100);
		isApplyInProgressStub = stub(deviceState, 'isApplyInProgress').returns(
			false,
		);
	});

	afterEach(() => {
		uptimeStub.restore();
		rssStub.restore();
		isApplyInProgressStub.restore();
	});

	it('passes healthcheck if process has not been running for 20s', async () => {
		// @ts-expect-error - initialMemory is read-only
		memory.initialMemory = 0;
		uptimeStub.returns(19);

		expect(await memory.healthcheck()).to.be.true;
		// The baseline must not be recorded during startup. Without this
		// assertion the test would still pass if the uptime guard were removed,
		// because recording the baseline also returns true.
		expect(memory.initialMemory).to.equal(0);
	});

	it('passes healthcheck while initial memory not set and sets initial memory', async () => {
		// @ts-expect-error - initialMemory is read-only
		memory.initialMemory = 0;

		expect(await memory.healthcheck()).to.be.true;
		expect(memory.initialMemory).to.equal(100);
	});

	it('passes healthcheck while state apply in progress', async () => {
		// @ts-expect-error - initialMemory is read-only
		memory.initialMemory = 100;
		// Usage is above baseline + threshold, so only the apply-in-progress
		// guard can make this pass.
		rssStub.returns(250);
		isApplyInProgressStub.returns(true);

		expect(await memory.healthcheck(100)).to.be.true;
	});

	it('passes healthcheck if memory usage is below threshold', async () => {
		// @ts-expect-error - initialMemory is read-only
		memory.initialMemory = 100;
		rssStub.returns(150);

		expect(await memory.healthcheck(100)).to.be.true;
	});

	it('passes healthcheck if memory usage is exactly at threshold', async () => {
		// @ts-expect-error - initialMemory is read-only
		memory.initialMemory = 100;
		// 200 is not strictly greater than 100 + 100.
		rssStub.returns(200);

		expect(await memory.healthcheck(100)).to.be.true;
	});

	it('fails healthcheck if memory usage is above threshold', async () => {
		// @ts-expect-error - initialMemory is read-only
		memory.initialMemory = 100;
		uptimeStub.returns(61);
		rssStub.returns(250);

		expect(await memory.healthcheck(100)).to.be.false;
		expect(log.info).to.have.been.calledWith(
			`Healthcheck failure - memory usage above threshold after 0h 1m 1s`,
		);
	});
});
|
Loading…
Reference in New Issue
Block a user