Merge pull request #2049 from balena-os/decrease_metrics_frequency

Limit metrics reporting maximum frequency to 5 minutes
This commit is contained in:
bulldozer-balena[bot] 2022-11-18 16:06:54 +00:00 committed by GitHub
commit 40d19520e5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 97 additions and 30 deletions

View File

@ -4,7 +4,6 @@ import { delay } from 'bluebird';
import { CoreOptions } from 'request'; import { CoreOptions } from 'request';
import { performance } from 'perf_hooks'; import { performance } from 'perf_hooks';
import * as constants from '../lib/constants';
import { withBackoff, OnFailureInfo } from '../lib/backoff'; import { withBackoff, OnFailureInfo } from '../lib/backoff';
import { log } from '../lib/supervisor-console'; import { log } from '../lib/supervisor-console';
import { InternalInconsistencyError, StatusError } from '../lib/errors'; import { InternalInconsistencyError, StatusError } from '../lib/errors';
@ -20,6 +19,13 @@ import { shallowDiff, prune, empty } from '../lib/json';
let lastReport: DeviceState = {}; let lastReport: DeviceState = {};
let lastReportTime: number = -Infinity; let lastReportTime: number = -Infinity;
// Set when a state change event could not be reported immediately, so that
// the timed report loop follows up on it and the change is not missed.
let stateChangeDeferred: boolean = false;
// Minimum time that must elapse between state reports to the server, in ms.
// Despite the name, this is an interval, not a frequency.
const maxReportFrequency = 10 * 1000;
// Minimum time that must elapse between metrics-only reports to the server,
// in ms; mirrors server setting. Also an interval, not a frequency.
// Metrics are low priority, so less frequent than maxReportFrequency.
const maxMetricsFrequency = 300 * 1000;
// TODO: This counter is read by the healthcheck to see if the // TODO: This counter is read by the healthcheck to see if the
// supervisor is having issues to connect. We have removed the // supervisor is having issues to connect. We have removed the
@ -70,12 +76,14 @@ async function report({ body, opts }: StateReport) {
} }
} }
async function reportCurrentState(opts: StateReportOpts) { /**
// Wrap the report with fetching of state so report always has the latest state diff * Collects current state and reports with backoff. Diffs report content with
* previous report and does not send an empty report.
*
* Does *not* validate time elapsed since last report.
*/
async function reportCurrentState(opts: StateReportOpts, uuid: string) {
const getStateAndReport = async () => { const getStateAndReport = async () => {
const now = performance.now();
// Only try to report if enough time has elapsed since last report
if (now - lastReportTime >= constants.maxReportFrequency) {
const currentState = await deviceState.getCurrentForReport(lastReport); const currentState = await deviceState.getCurrentForReport(lastReport);
const stateDiff = prune(shallowDiff(lastReport, currentState, 2)); const stateDiff = prune(shallowDiff(lastReport, currentState, 2));
@ -83,17 +91,25 @@ async function reportCurrentState(opts: StateReportOpts) {
return; return;
} }
// If metrics not yet scheduled, report must include a state change to
// qualify for sending.
const metricsScheduled =
performance.now() - lastReportTime > maxMetricsFrequency;
if (!metricsScheduled) {
const uuidMap = stateDiff[uuid] as { [k: string]: any };
if (
Object.keys(uuidMap).every((n) =>
deviceState.sysInfoPropertyNames.includes(n),
)
) {
return;
}
}
await report({ body: stateDiff, opts }); await report({ body: stateDiff, opts });
lastReportTime = performance.now(); lastReportTime = performance.now();
lastReport = currentState; lastReport = currentState;
log.info('Reported current state to the cloud'); log.info('Reported current state to the cloud');
} else {
// Not enough time has elapsed since last report
// Delay report until next allowed time
const timeSinceLastReport = now - lastReportTime;
await delay(constants.maxReportFrequency - timeSinceLastReport);
await getStateAndReport();
}
}; };
// Create a report that will backoff on errors // Create a report that will backoff on errors
@ -131,6 +147,11 @@ function handleRetry(retryInfo: OnFailureInfo) {
); );
} }
/**
* Sends state reports to the cloud from two sources: 1) state change events and
* 2) a timer for metrics. Reports are sent at most once every maxReportFrequency
* ms, and metrics-only reports at most once every maxMetricsFrequency ms.
*/
export async function startReporting() { export async function startReporting() {
// Get configs needed to make a report // Get configs needed to make a report
const reportConfigs = (await config.getMany([ const reportConfigs = (await config.getMany([
@ -139,23 +160,59 @@ export async function startReporting() {
'deviceApiKey', 'deviceApiKey',
'appUpdatePollInterval', 'appUpdatePollInterval',
])) as StateReportOpts; ])) as StateReportOpts;
// Pass uuid to report separately to guarantee it exists.
const uuid = await config.get('uuid');
if (!uuid) {
throw new InternalInconsistencyError('No uuid found for local device');
}
let reportPending = false; let reportPending = false;
const doReport = async () => { // Reports current state if not already sending and prevents a state change
// from exceeding report frequency. Returns true if sent; otherwise false.
const doReport = async (): Promise<boolean> => {
if (!reportPending) { if (!reportPending) {
if (performance.now() - lastReportTime > maxReportFrequency) {
// Can't wait until report complete to clear deferred marker.
// Change events while in progress will set deferred marker synchronously.
// Ensure we don't miss reporting a change event.
stateChangeDeferred = false;
reportPending = true; reportPending = true;
await reportCurrentState(reportConfigs); await reportCurrentState(reportConfigs, uuid);
reportPending = false; reportPending = false;
return true;
} else {
return false;
}
} else {
return false;
}
};
const onStateChange = async () => {
// State change events are async, but may arrive in rapid succession.
// Defers to a timed report schedule if we can't report immediately, to
// ensure we don't miss reporting an event.
if (!(await doReport())) {
stateChangeDeferred = true;
} }
}; };
// If the state changes, report it // If the state changes, report it
deviceState.on('change', doReport); deviceState.on('change', onStateChange);
async function recursivelyReport(delayBy: number) { async function recursivelyReport(delayBy: number) {
try { try {
// Try to send current state // Follow-up when report not sent immediately on change event...
if (stateChangeDeferred) {
if (!(await doReport())) {
stateChangeDeferred = true;
}
} else {
// ... or on regular metrics schedule.
if (performance.now() - lastReportTime > maxMetricsFrequency) {
await doReport(); await doReport();
}
}
} finally { } finally {
// Wait until we want to report again // Wait until we want to report again
await delay(delayBy); await delay(delayBy);
@ -166,5 +223,5 @@ export async function startReporting() {
// Start monitoring for changes that do not trigger deviceState events // Start monitoring for changes that do not trigger deviceState events
// Example - device metrics // Example - device metrics
return recursivelyReport(constants.maxReportFrequency); return recursivelyReport(maxReportFrequency);
} }

View File

@ -447,6 +447,18 @@ async function getSysInfo(
); );
} }
/**
 * SysInfo (metrics) property names used in report.
 *
 * The state reporter compares the keys of a device's state diff against this
 * list: a diff whose keys are all listed here is treated as metrics-only and
 * is held back until the metrics reporting interval has elapsed.
 */
export const sysInfoPropertyNames = [
'cpu_usage',
'memory_usage',
'memory_total',
'storage_usage',
'storage_total',
'storage_block_device',
'cpu_temp',
'cpu_id',
];
// Return current state in a way that the API understands // Return current state in a way that the API understands
export async function getCurrentForReport( export async function getCurrentForReport(
lastReport = {} as DeviceState, lastReport = {} as DeviceState,

View File

@ -64,8 +64,6 @@ const constants = {
backoffIncrement: 500, backoffIncrement: 500,
supervisorNetworkSubnet: '10.114.104.0/25', supervisorNetworkSubnet: '10.114.104.0/25',
supervisorNetworkGateway: '10.114.104.1', supervisorNetworkGateway: '10.114.104.1',
// How often can we report our state to the server in ms
maxReportFrequency: 10 * 1000,
// How much of a jitter we can add to our api polling // How much of a jitter we can add to our api polling
// (this number is used as an upper bound when generating // (this number is used as an upper bound when generating
// a random jitter) // a random jitter)