Mirror of https://github.com/balena-os/balena-supervisor.git (synced 2025-05-31 23:00:48 +00:00)

Merge pull request #2351 from balena-os/knex-error: Logs processing improvements

Commit dac9a24c10
@@ -11,7 +11,6 @@ import LocalModeManager from '../local-mode';
 import * as dbFormat from '../device-state/db-format';
 import { validateTargetContracts } from '../lib/contracts';
 import * as constants from '../lib/constants';
-import { docker } from '../lib/docker-utils';
 import log from '../lib/supervisor-console';
 import {
   ContractViolationError,
@@ -90,20 +89,6 @@ export const initialized = _.once(async () => {
   await config.initialized();

   await imageManager.cleanImageData();
-  const cleanup = async () => {
-    const containers = await docker.listContainers({ all: true });
-    await logger.clearOutOfDateDBLogs(_.map(containers, 'Id'));
-  };
-
-  // Rather than relying on removing out of date database entries when we're no
-  // longer using them, set a task that runs periodically to clear out the database
-  // This has the advantage that if for some reason a container is removed while the
-  // supervisor is down, we won't have zombie entries in the db
-
-  // Once a day
-  setInterval(cleanup, 1000 * 60 * 60 * 24);
-  // But also run it in on startup
-  await cleanup();

   await localModeManager.init();
   await serviceManager.attachToRunning();
@@ -353,14 +353,13 @@ export async function start(service: Service) {
   }

   const serviceId = service.serviceId;
-  const imageId = service.imageId;
-  if (serviceId == null || imageId == null) {
+  if (serviceId == null) {
     throw new InternalInconsistencyError(
-      `serviceId and imageId not defined for service: ${service.serviceName} in ServiceManager.start`,
+      `serviceId not defined for service: ${service.serviceName} in ServiceManager.start`,
     );
   }

-  void logger.attach(container.id, { serviceId, imageId });
+  void logger.attach(container.id, { serviceId });

   if (!alreadyStarted) {
     logger.logSystemEvent(LogTypes.startServiceSuccess, { service });
@@ -416,15 +415,13 @@ export function listenToEvents() {
       });

       const serviceId = service.serviceId;
-      const imageId = service.imageId;
-      if (serviceId == null || imageId == null) {
+      if (serviceId == null) {
         throw new InternalInconsistencyError(
-          `serviceId and imageId not defined for service: ${service.serviceName} in ServiceManager.listenToEvents`,
+          `serviceId not defined for service: ${service.serviceName} in ServiceManager.listenToEvents`,
         );
       }
       void logger.attach(data.id, {
         serviceId,
-        imageId,
       });
     } else if (status === 'destroy') {
       await logMonitor.detach(data.id);
@@ -468,10 +465,9 @@ export async function attachToRunning() {
   for (const service of services) {
     if (service.status === 'Running') {
       const serviceId = service.serviceId;
-      const imageId = service.imageId;
-      if (serviceId == null || imageId == null) {
+      if (serviceId == null) {
         throw new InternalInconsistencyError(
-          `serviceId and imageId not defined for service: ${service.serviceName} in ServiceManager.start`,
+          `serviceId not defined for service: ${service.serviceName} in ServiceManager.start`,
         );
       }

@@ -482,7 +478,6 @@ export async function attachToRunning() {
       }
       void logger.attach(service.containerId, {
         serviceId,
-        imageId,
       });
     }
   }
@@ -14,7 +14,7 @@ import log from './supervisor-console';
 export const toJournalDate = (timestamp: number): string =>
   new Date(timestamp).toISOString().replace(/T/, ' ').replace(/\..+$/, '');

-export function spawnJournalctl(opts: {
+export interface SpawnJournalctlOpts {
   all: boolean;
   follow: boolean;
   count?: number | 'all';
@@ -24,7 +24,9 @@ export function spawnJournalctl(opts: {
   filterString?: string;
   since?: string;
   until?: string;
-}): ChildProcess {
+}
+
+export function spawnJournalctl(opts: SpawnJournalctlOpts): ChildProcess {
   const args: string[] = [];
   if (opts.all) {
     args.push('-a');
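The two hunks above only lift spawnJournalctl's inline options object into a named, exported SpawnJournalctlOpts interface; the function's behaviour is unchanged. A minimal usage sketch, not part of the commit (the import path and option values are illustrative):

    import { spawnJournalctl, toJournalDate } from '../lib/journald';
    import type { SpawnJournalctlOpts } from '../lib/journald';

    // Follow supervisor unit logs, starting one minute in the past.
    const opts: SpawnJournalctlOpts = {
      all: true,
      follow: true,
      format: 'json',
      filterString: '_SYSTEMD_UNIT=balena.service',
      since: toJournalDate(Date.now() - 60 * 1000),
    };

    const journalctl = spawnJournalctl(opts);
    journalctl.stdout?.on('data', (chunk) => process.stdout.write(chunk));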
@@ -2,13 +2,11 @@ import Bluebird from 'bluebird';
 import _ from 'lodash';

 import * as config from './config';
-import * as db from './db';
 import * as eventTracker from './event-tracker';
 import type { LogType } from './lib/log-types';
 import { takeGlobalLockRW } from './lib/process-lock';
 import type { LogBackend, LogMessage } from './logging';
 import { BalenaLogBackend, LocalLogBackend } from './logging';
-import type { MonitorHook } from './logging/monitor';
 import logMonitor from './logging/monitor';

 import * as globalEventBus from './event-bus';
@@ -105,8 +103,8 @@ export function enable(value: boolean = true) {
   }
 }

-export function log(message: LogMessage) {
-  backend?.log(message);
+export async function log(message: LogMessage) {
+  await backend?.log(message);
 }

 export function logSystemMessage(
@@ -115,11 +113,13 @@ export function logSystemMessage(
   eventName?: string,
   track: boolean = true,
 ) {
-  const msgObj: LogMessage = { message, isSystem: true };
+  const msgObj: LogMessage = { message, isSystem: true, timestamp: Date.now() };
   if (eventObj != null && eventObj.error != null) {
     msgObj.isStdErr = true;
   }
-  log(msgObj);
+  // IMPORTANT: this could potentially create a memory leak if logSystemMessage
+  // is used too quickly but we don't want supervisor logging to hold up other tasks
+  void log(msgObj);
   if (track) {
     eventTracker.track(
       eventName != null ? eventName : message,
@@ -134,26 +134,21 @@ export function lock(containerId: string): Bluebird.Disposer<() => void> {
   });
 }

-type ServiceInfo = { serviceId: number; imageId: number };
-export function attach(
+type ServiceInfo = { serviceId: number };
+export async function attach(
   containerId: string,
-  { serviceId, imageId }: ServiceInfo,
-): Bluebird<void> {
+  { serviceId }: ServiceInfo,
+): Promise<void> {
   // First detect if we already have an attached log stream
   // for this container
   if (logMonitor.isAttached(containerId)) {
-    return Bluebird.resolve();
+    return;
   }

   return Bluebird.using(lock(containerId), async () => {
-    await logMonitor.attach(
-      containerId,
-      (message: Parameters<MonitorHook>[0] & Partial<ServiceInfo>) => {
-        message.serviceId = serviceId;
-        message.imageId = imageId;
-        log(message);
-      },
-    );
+    await logMonitor.attach(containerId, async (message) => {
+      await log({ ...message, serviceId });
+    });
   });
 }

@@ -201,16 +196,6 @@ export function logConfigChange(
   logSystemMessage(message, obj, eventName);
 }

-export async function clearOutOfDateDBLogs(containerIds: string[]) {
-  superConsole.debug(
-    'Performing database cleanup for container log timestamps',
-  );
-  await db
-    .models('containerLogs')
-    .whereNotIn('containerId', containerIds)
-    .delete();
-}
-
 function objectNameForLogs(eventObj: LogEventObject): string | null {
   if (eventObj == null) {
     return null;
@@ -4,6 +4,7 @@ import _ from 'lodash';
 import stream from 'stream';
 import url from 'url';
 import zlib from 'zlib';
+import { setTimeout } from 'timers/promises';

 import type { LogMessage } from './log-backend';
 import { LogBackend } from './log-backend';
@@ -15,7 +16,6 @@ const MIN_COOLDOWN_PERIOD = 5 * 1000; // 5 seconds
 const MAX_COOLDOWN_PERIOD = 300 * 1000; // 5 minutes
 const KEEPALIVE_TIMEOUT = 60 * 1000;
 const RESPONSE_GRACE_PERIOD = 5 * 1000;

 const MAX_LOG_LENGTH = 10 * 1000;
 const MAX_PENDING_BYTES = 256 * 1024;

@@ -31,7 +31,6 @@ export class BalenaLogBackend extends LogBackend {
   private gzip: zlib.Gzip | null = null;
   private opts: Options;
   private stream: stream.PassThrough;
-  private timeout: NodeJS.Timeout;

   public initialised = false;

@@ -61,7 +60,7 @@ export class BalenaLogBackend extends LogBackend {
     this.writable = true;
     this.flush();
     if (this.dropCount > 0) {
-      this.write({
+      this.tryWrite({
         message: `Warning: Suppressed ${this.dropCount} message(s) due to high load`,
         timestamp: Date.now(),
         isSystem: true,
@@ -76,15 +75,14 @@ export class BalenaLogBackend extends LogBackend {
     return this.initialised;
   }

-  public log(message: LogMessage) {
+  public async log(message: LogMessage) {
     // TODO: Perhaps don't just drop logs when we haven't
     // yet initialised (this happens when a device has not yet
     // been provisioned)
+    // TODO: the backend should not be aware of unmanaged or publish state
     if (this.unmanaged || !this.publishEnabled || !this.initialised) {
-      return;
-    }
-
-    if (!_.isObject(message)) {
+      // Yield control to the event loop
+      await setTimeout(0);
       return;
     }

@@ -92,15 +90,12 @@ export class BalenaLogBackend extends LogBackend {
       return;
     }

-    message.timestamp ??= Date.now();
-    message.message = message.message
-      ? _.truncate(message.message, {
-          length: MAX_LOG_LENGTH,
-          omission: '[...]',
-        })
-      : '';
+    message.message = _.truncate(message.message, {
+      length: MAX_LOG_LENGTH,
+      omission: '[...]',
+    });

-    this.write(message);
+    await this.write(message);
   }

   public assignFields(endpoint: string, uuid: string, deviceApiKey: string) {
@@ -117,14 +112,9 @@ export class BalenaLogBackend extends LogBackend {

   private lastSetupAttempt = 0;
   private setupFailures = 0;
-  private setupPending = false;
-  private setup() {
-    if (this.setupPending || this.req != null) {
-      // If we already have a setup pending, or we are already setup, then do nothing
-      return;
-    }
-    this.setupPending = true;
+  private setupPromise: Promise<void> | null = null;

+  private async trySetup() {
     // Work out the total delay we need
     const totalDelay = Math.min(
       2 ** this.setupFailures * MIN_COOLDOWN_PERIOD,
@@ -135,62 +125,83 @@ export class BalenaLogBackend extends LogBackend {
     // The difference between the two is the actual delay we want
     const delay = Math.max(totalDelay - alreadyDelayedBy, 0);

-    setTimeout(() => {
-      this.setupPending = false;
-      this.lastSetupAttempt = Date.now();
-
-      const setupFailed = () => {
-        this.setupFailures++;
-        this.teardown();
-      };
-
-      this.req = https.request(this.opts);
-
-      // Since we haven't sent the request body yet, and never will,the
-      // only reason for the server to prematurely respond is to
-      // communicate an error. So teardown the connection immediately
-      this.req.on('response', (res) => {
-        log.error(
-          'LogBackend: server responded with status code:',
-          res.statusCode,
-        );
-        setupFailed();
-      });
-
-      this.req.on('timeout', setupFailed);
-      this.req.on('close', setupFailed);
-      this.req.on('error', (err) => {
-        log.error('LogBackend: unexpected error:', err);
-        setupFailed();
-      });
-
-      // Immediately flush the headers. This gives a chance to the server to
-      // respond with potential errors such as 401 authentication error
-      this.req.flushHeaders();
-
-      // We want a very low writable high watermark to prevent having many
-      // chunks stored in the writable queue of @_gzip and have them in
-      // @_stream instead. This is desirable because once @_gzip.flush() is
-      // called it will do all pending writes with that flush flag. This is
-      // not what we want though. If there are 100 items in the queue we want
-      // to write all of them with Z_NO_FLUSH and only afterwards do a
-      // Z_SYNC_FLUSH to maximize compression
-      this.gzip = zlib.createGzip({ writableHighWaterMark: 1024 });
-      this.gzip.on('error', setupFailed);
-      this.gzip.pipe(this.req);
-
-      // Only start piping if there has been no error after the header flush.
-      // Doing it immediately would potentially lose logs if it turned out that
-      // the server is unavailalbe because @_req stream would consume our
-      // passthrough buffer
-      this.timeout = setTimeout(() => {
-        if (this.gzip != null) {
-          this.setupFailures = 0;
-          this.stream.pipe(this.gzip);
-          setImmediate(this.flush);
-        }
-      }, RESPONSE_GRACE_PERIOD);
-    }, delay);
+    await setTimeout(delay);
+
+    this.lastSetupAttempt = Date.now();
+
+    const setupFailed = () => {
+      this.setupFailures++;
+      this.teardown();
+    };
+
+    this.req = https.request(this.opts);
+
+    // Since we haven't sent the request body yet, and never will,the
+    // only reason for the server to prematurely respond is to
+    // communicate an error. So teardown the connection immediately
+    this.req.on('response', (res) => {
+      log.error(
+        'LogBackend: server responded with status code:',
+        res.statusCode,
+      );
+      setupFailed();
+    });
+
+    this.req.on('timeout', setupFailed);
+    this.req.on('close', setupFailed);
+    this.req.on('error', (err) => {
+      log.error('LogBackend: unexpected error:', err);
+      setupFailed();
+    });
+
+    // Immediately flush the headers. This gives a chance to the server to
+    // respond with potential errors such as 401 authentication error
+    this.req.flushHeaders();
+
+    // We want a very low writable high watermark to prevent having many
+    // chunks stored in the writable queue of @_gzip and have them in
+    // @_stream instead. This is desirable because once @_gzip.flush() is
+    // called it will do all pending writes with that flush flag. This is
+    // not what we want though. If there are 100 items in the queue we want
+    // to write all of them with Z_NO_FLUSH and only afterwards do a
+    // Z_SYNC_FLUSH to maximize compression
+    this.gzip = zlib.createGzip({ writableHighWaterMark: 1024 });
+    this.gzip.on('error', setupFailed);
+    this.gzip.pipe(this.req);
+
+    // Only start piping if there has been no error after the header flush.
+    // Doing it immediately would potentially lose logs if it turned out that
+    // the server is unavailalbe because @_req stream would consume our
+    // passthrough buffer
+    await setTimeout(RESPONSE_GRACE_PERIOD);
+
+    // a teardown could happen while we wait for the grace period so we check
+    // that gzip is still valid
+    if (this.gzip != null) {
+      this.setupFailures = 0;
+      this.stream.pipe(this.gzip);
+      setImmediate(this.flush);
+    }
+  }
+
+  private async setup() {
+    if (this.req != null) {
+      // If we are already setup, then do nothing
+      return;
+    }
+
+    // If the setup is in progress, let callers wait for the existing promise
+    if (this.setupPromise != null) {
+      return this.setupPromise;
+    }
+
+    // Store the setup promise in case there are concurrent calls to
+    // the setup
+    this.setupPromise = this.trySetup().finally(() => {
+      this.setupPromise = null;
+    });
+
+    return this.setupPromise;
   }

   private snooze = _.debounce(this.teardown, KEEPALIVE_TIMEOUT);
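With setup split into trySetup and a memoizing setup, concurrent log writes share a single in-flight connection attempt instead of each scheduling their own. A stripped-down sketch of the same promise-memoization pattern, with illustrative names (this is not the supervisor's class):

    class Connection {
      private pending: Promise<void> | null = null;
      private ready = false;

      // Every concurrent caller awaits the same underlying attempt.
      public async ensure(): Promise<void> {
        if (this.ready) {
          return;
        }
        if (this.pending != null) {
          return this.pending;
        }
        this.pending = this.connect().finally(() => {
          this.pending = null;
        });
        return this.pending;
      }

      private async connect(): Promise<void> {
        // ... open the request / gzip pipeline here ...
        this.ready = true;
      }
    }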
@@ -210,30 +221,43 @@ export class BalenaLogBackend extends LogBackend {

   private teardown() {
     if (this.req != null) {
-      clearTimeout(this.timeout);
       this.req.removeAllListeners();
-      this.req.on('error', _.noop);
+      this.req.on('error', () => {
+        /* noop */
+      });
       if (this.gzip != null) {
         this.stream.unpipe(this.gzip);
         this.gzip.end();
+        this.gzip = null;
       }
       this.req = null;
     }
   }

-  private write(message: LogMessage) {
+  private tryWrite(message: LogMessage) {
+    try {
+      this.writable = this.stream.write(JSON.stringify(message) + '\n');
+      this.flush();
+    } catch (e) {
+      log.error('Failed to write to logging stream, dropping message.', e);
+    }
+  }
+
+  private async write(message: LogMessage) {
     this.snooze();
-    this.setup();
+
+    // Setup could terminate unsuccessfully, at which point
+    // the messages will get added to the stream until it fills
+    await this.setup();

     if (this.writable) {
-      try {
-        this.writable = this.stream.write(JSON.stringify(message) + '\n');
-        this.flush();
-      } catch (e) {
-        log.error('Failed to write to logging stream, dropping message.', e);
-      }
+      this.tryWrite(message);
     } else {
       this.dropCount += 1;
+
+      // Yield execution to the event loop to avoid
+      // an aggressive logger to overwhelm the process
+      await setTimeout(0);
     }
   }
 }
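write() now reacts to stream backpressure: stream.write() returns false once the internal buffer passes its high watermark, later messages are counted as dropped, and `await setTimeout(0)` yields to the event loop so a very chatty container cannot starve the rest of the supervisor. A self-contained sketch of that mechanism (names and the drain handling are illustrative, assuming a plain PassThrough stream rather than the backend's own plumbing):

    import { PassThrough } from 'stream';
    import { setTimeout } from 'timers/promises';

    const stream = new PassThrough({ highWaterMark: 1024 });
    let writable = true;
    let dropCount = 0;

    stream.on('drain', () => {
      writable = true;
    });

    async function write(line: string): Promise<void> {
      if (writable) {
        // write() returns false when the buffer is above the high watermark
        writable = stream.write(line + '\n');
      } else {
        dropCount += 1;
        // Yield so an aggressive producer cannot monopolise the event loop
        await setTimeout(0);
      }
    }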
|
@ -1,17 +1,15 @@
|
|||||||
type BaseLogMessage = {
|
type BaseLogMessage = {
|
||||||
message: string;
|
message: string;
|
||||||
isStdErr?: boolean;
|
isStdErr?: boolean;
|
||||||
timestamp?: number;
|
timestamp: number;
|
||||||
};
|
};
|
||||||
export type LogMessage = BaseLogMessage &
|
export type LogMessage = BaseLogMessage &
|
||||||
(
|
(
|
||||||
| {
|
| {
|
||||||
serviceId?: number;
|
serviceId: number;
|
||||||
imageId?: number;
|
|
||||||
isSystem?: false;
|
isSystem?: false;
|
||||||
}
|
}
|
||||||
| {
|
| {
|
||||||
message: string;
|
|
||||||
isSystem: true;
|
isSystem: true;
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
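Under the tightened type, timestamp is mandatory and imageId is gone: a container log message must carry a numeric serviceId, while a system message only needs isSystem: true. Illustrative values (not from the commit; the import path is assumed):

    import type { LogMessage } from './log-backend';

    const containerMsg: LogMessage = {
      message: 'service started',
      serviceId: 15,
      timestamp: Date.now(),
    };

    const systemMsg: LogMessage = {
      message: 'Applying target state',
      isSystem: true,
      timestamp: Date.now(),
    };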
@@ -20,7 +18,7 @@ export abstract class LogBackend {
   public unmanaged: boolean;
   public publishEnabled: boolean = true;

-  public abstract log(message: LogMessage): void;
+  public abstract log(message: LogMessage): Promise<void>;
 }

 export default LogBackend;
|
@ -1,6 +1,5 @@
|
|||||||
import JSONstream from 'JSONStream';
|
import { pipeline } from 'stream/promises';
|
||||||
|
|
||||||
import * as db from '../db';
|
|
||||||
import { spawnJournalctl, toJournalDate } from '../lib/journald';
|
import { spawnJournalctl, toJournalDate } from '../lib/journald';
|
||||||
import log from '../lib/supervisor-console';
|
import log from '../lib/supervisor-console';
|
||||||
import { setTimeout } from 'timers/promises';
|
import { setTimeout } from 'timers/promises';
|
||||||
@@ -21,9 +20,6 @@ interface JournalRow {
   __REALTIME_TIMESTAMP: string;
 }

-// Flush every 10 mins
-const DB_FLUSH_INTERVAL = 10 * 60 * 1000;
-
 // Wait 5s when journalctl failed before trying to read the logs again
 const JOURNALCTL_ERROR_RETRY_DELAY = 5000;
 const JOURNALCTL_ERROR_RETRY_DELAY_MAX = 15 * 60 * 1000;
@@ -41,6 +37,20 @@ function messageFieldToString(entry: JournalRow['MESSAGE']): string | null {
   }
 }

+async function* splitStream(chunkIterable: AsyncIterable<any>) {
+  let previous = '';
+  for await (const chunk of chunkIterable) {
+    previous += chunk;
+    const lines = previous.split(/\r?\n/);
+    previous = lines.pop() ?? '';
+    yield* lines;
+  }
+
+  if (previous.length > 0) {
+    yield previous;
+  }
+}
+
 /**
  * Streams logs from journalctl and calls container hooks when a record is received matching container id
  */
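splitStream buffers partial chunks and yields only complete lines, which is what lets the rewritten start() below pipe journalctl's ndjson output through stream/promises pipeline. A minimal sketch of the same composition using the splitStream generator added above, with sample input (not the supervisor's wiring):

    import { Readable } from 'stream';
    import { pipeline } from 'stream/promises';

    async function demo() {
      // Chunk boundaries deliberately do not line up with line boundaries.
      const source = Readable.from(['{"a":1}\n{"b"', ':2}\n{"c":3}\n']);
      await pipeline(source, splitStream, async function (lines) {
        for await (const line of lines) {
          console.log(JSON.parse(line)); // { a: 1 }, { b: 2 }, { c: 3 }
        }
      });
    }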
@@ -48,53 +58,67 @@ class LogMonitor {
   private containers: {
     [containerId: string]: {
       hook: MonitorHook;
-      follow: boolean;
-      timestamp: number;
-      writeRequired: boolean;
     };
   } = {};
   private setupAttempts = 0;

-  public constructor() {
-    setInterval(() => this.flushDb(), DB_FLUSH_INTERVAL);
-  }
+  // Only stream logs since the start of the supervisor
+  private lastSentTimestamp = Date.now() - performance.now();

-  public start() {
-    this.streamLogsFromJournal(
-      {
+  public async start(): Promise<void> {
+    try {
+      // TODO: do not spawn journalctl if logging is not enabled
+      const { stdout, stderr } = spawnJournalctl({
         all: true,
         follow: true,
         format: 'json',
         filterString: '_SYSTEMD_UNIT=balena.service',
-      },
-      (row) => {
-        if (row.CONTAINER_ID_FULL && this.containers[row.CONTAINER_ID_FULL]) {
-          this.setupAttempts = 0;
-          this.handleRow(row);
-        }
-      },
-      (data) => {
-        log.error('journalctl - balena.service stderr: ', data.toString());
-      },
-      () => {
-        // noop for closed
-      },
-      async () => {
-        log.debug('balena.service journalctl process exit.');
-        // On exit of process try to create another
-        const wait = Math.min(
-          2 ** this.setupAttempts++ * JOURNALCTL_ERROR_RETRY_DELAY,
-          JOURNALCTL_ERROR_RETRY_DELAY_MAX,
-        );
-        log.debug(
-          `Spawning another process to watch balena.service logs in ${
-            wait / 1000
-          }s`,
-        );
-        await setTimeout(wait);
-        return this.start();
-      },
-    );
+        since: toJournalDate(this.lastSentTimestamp),
+      });
+      if (!stdout) {
+        // this will be catched below
+        throw new Error('failed to open process stream');
+      }
+
+      stderr?.on('data', (data) =>
+        log.error('journalctl - balena.service stderr: ', data.toString()),
+      );
+
+      const self = this;
+
+      await pipeline(stdout, splitStream, async function (lines) {
+        self.setupAttempts = 0;
+        for await (const line of lines) {
+          try {
+            const row = JSON.parse(line);
+            if (
+              row.CONTAINER_ID_FULL &&
+              self.containers[row.CONTAINER_ID_FULL]
+            ) {
+              await self.handleRow(row);
+            }
+          } catch {
+            // ignore parsing errors
+          }
+        }
+      });
+      log.debug('balena.service journalctl process exit.');
+    } catch (e: any) {
+      log.error('journalctl - balena.service error: ', e.message ?? e);
+    }
+
+    // On exit of process try to create another
+    const wait = Math.min(
+      2 ** this.setupAttempts++ * JOURNALCTL_ERROR_RETRY_DELAY,
+      JOURNALCTL_ERROR_RETRY_DELAY_MAX,
+    );
+    log.debug(
+      `Spawning another process to watch balena.service logs in ${
+        wait / 1000
+      }s`,
+    );
+    await setTimeout(wait);
+    return this.start();
   }

   public isAttached(containerId: string): boolean {
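When the journalctl process exits or errors, start() re-spawns it after an exponentially growing wait capped at JOURNALCTL_ERROR_RETRY_DELAY_MAX. With the constants defined earlier (5 s base, 15 min cap), the successive waits look like this worked example (illustrative loop, not supervisor code):

    const JOURNALCTL_ERROR_RETRY_DELAY = 5000;
    const JOURNALCTL_ERROR_RETRY_DELAY_MAX = 15 * 60 * 1000;

    for (let attempt = 0; attempt < 10; attempt++) {
      const wait = Math.min(
        2 ** attempt * JOURNALCTL_ERROR_RETRY_DELAY,
        JOURNALCTL_ERROR_RETRY_DELAY_MAX,
      );
      console.log(`attempt ${attempt}: wait ${wait / 1000}s`);
      // 5s, 10s, 20s, 40s, 80s, 160s, 320s, 640s, then capped at 900s
    }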
@@ -105,66 +129,15 @@ class LogMonitor {
     if (!this.containers[containerId]) {
       this.containers[containerId] = {
         hook,
-        follow: false,
-        timestamp: Date.now(),
-        writeRequired: false,
       };
-      this.containers[containerId].timestamp =
-        await this.getContainerSentTimestamp(containerId);
-      this.backfill(containerId, this.containers[containerId].timestamp);
     }
   }

   public async detach(containerId: string) {
     delete this.containers[containerId];
-    await db.models('containerLogs').delete().where({ containerId });
   }

-  private streamLogsFromJournal(
-    options: Parameters<typeof spawnJournalctl>[0],
-    onRow: (row: JournalRow) => void,
-    onError: (data: Buffer) => void,
-    onClose?: () => void,
-    onExit?: () => void,
-  ): ReturnType<typeof spawnJournalctl> {
-    const journalctl = spawnJournalctl(options);
-    journalctl.stdout?.pipe(JSONstream.parse(true).on('data', onRow));
-    journalctl.stderr?.on('data', onError);
-    if (onClose) {
-      journalctl.on('close', onClose);
-    }
-    if (onExit) {
-      journalctl.on('exit', onExit);
-    }
-    return journalctl;
-  }
-
-  /**
-   * stream logs from lastSentTimestamp until now so logs are not missed if the container started before supervisor
-   */
-  private backfill(containerId: string, lastSentTimestamp: number) {
-    this.streamLogsFromJournal(
-      {
-        all: true,
-        follow: false,
-        format: 'json',
-        filterString: `CONTAINER_ID_FULL=${containerId}`,
-        since: toJournalDate(lastSentTimestamp + 1), // increment to exclude last sent log
-      },
-      (row) => this.handleRow(row),
-      (data) => {
-        log.error(
-          `journalctl - container ${containerId} stderr: `,
-          data.toString(),
-        );
-      },
-      () => {
-        this.containers[containerId].follow = true;
-      },
-    );
-  }
-
-  private handleRow(row: JournalRow) {
+  private async handleRow(row: JournalRow) {
     if (
       row.CONTAINER_ID_FULL == null ||
       row.CONTAINER_NAME === 'balena_supervisor' ||
@@ -182,66 +155,9 @@ class LogMonitor {
     }
     const isStdErr = row.PRIORITY === '3';
     const timestamp = Math.floor(Number(row.__REALTIME_TIMESTAMP) / 1000); // microseconds to milliseconds
-    this.updateContainerSentTimestamp(containerId, timestamp);
-
-    // WARNING: this could lead to a memory leak as the hook is not being awaited
-    // and the journal can be very verbose
-    void this.containers[containerId].hook({ message, isStdErr, timestamp });
-  }
-
-  private updateContainerSentTimestamp(
-    containerId: string,
-    timestamp: number,
-  ): void {
-    this.containers[containerId].timestamp = timestamp;
-    this.containers[containerId].writeRequired = true;
-  }
-
-  private async getContainerSentTimestamp(
-    containerId: string,
-  ): Promise<number> {
-    try {
-      const row = await db
-        .models('containerLogs')
-        .select('lastSentTimestamp')
-        .where({ containerId })
-        .first();
-
-      if (!row) {
-        const now = Date.now();
-        await db
-          .models('containerLogs')
-          .insert({ containerId, lastSentTimestamp: now });
-        return now;
-      } else {
-        return row.lastSentTimestamp;
-      }
-    } catch (e) {
-      log.error(
-        'There was an error retrieving the container log timestamps:',
-        e,
-      );
-      return Date.now();
-    }
-  }
-
-  private async flushDb() {
-    log.debug('Attempting container log timestamp flush...');
-    try {
-      for (const containerId of Object.keys(this.containers)) {
-        // Avoid writing to the db if we don't need to
-        if (!this.containers[containerId].writeRequired) {
-          continue;
-        }
-        await db.models('containerLogs').where({ containerId }).update({
-          lastSentTimestamp: this.containers[containerId].timestamp,
-        });
-        this.containers[containerId].writeRequired = false;
-      }
-    } catch (e) {
-      log.error('There was an error storing the container log timestamps:', e);
-    }
-    log.debug('Container log timestamp flush complete');
+    await this.containers[containerId].hook({ message, isStdErr, timestamp });
+    this.lastSentTimestamp = timestamp;
   }
 }
src/migrations/M00011.js (new file, 10 lines)
@@ -0,0 +1,10 @@
+export async function up(knex) {
+  // Drop the container logs table
+  if (await knex.schema.hasTable('containerLogs')) {
+    await knex.schema.dropTable('containerLogs');
+  }
+}
+
+export function down() {
+  throw new Error('Not implemented');
+}
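The new migration drops the containerLogs table, since the last-sent timestamp is now kept in memory (lastSentTimestamp above) rather than in the database. A hedged sketch of how a knex migration like this gets applied, with illustrative configuration values (not the supervisor's actual database setup):

    import knex from 'knex';

    async function runMigrations() {
      const db = knex({
        client: 'sqlite3',
        connection: { filename: '/data/database.sqlite' }, // illustrative path
        useNullAsDefault: true,
        migrations: { directory: './src/migrations' },
      });
      // migrate.latest() runs every pending up() in order and records each
      // completed file in the knex_migrations bookkeeping table, so M00011
      // only ever executes once per device.
      await db.migrate.latest();
      await db.destroy();
    }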
@@ -84,7 +84,7 @@ export class Supervisor {
       apiBinder.start(),
     ]);

-    logMonitor.start();
+    await logMonitor.start();
   }
 }

|
@ -52,8 +52,12 @@ describe('Logger', function () {
|
|||||||
|
|
||||||
it('waits the grace period before sending any logs', async function () {
|
it('waits the grace period before sending any logs', async function () {
|
||||||
const clock = sinon.useFakeTimers();
|
const clock = sinon.useFakeTimers();
|
||||||
logger.log({ message: 'foobar', serviceId: 15 });
|
await logger.log({
|
||||||
clock.tick(4999);
|
message: 'foobar',
|
||||||
|
serviceId: 15,
|
||||||
|
timestamp: Date.now(),
|
||||||
|
});
|
||||||
|
await clock.tickAsync(4999);
|
||||||
clock.restore();
|
clock.restore();
|
||||||
|
|
||||||
await setTimeout(100);
|
await setTimeout(100);
|
||||||
@@ -62,8 +66,12 @@ describe('Logger', function () {

   it('tears down the connection after inactivity', async function () {
     const clock = sinon.useFakeTimers();
-    logger.log({ message: 'foobar', serviceId: 15 });
-    clock.tick(61000);
+    await logger.log({
+      message: 'foobar',
+      serviceId: 15,
+      timestamp: Date.now(),
+    });
+    await clock.tickAsync(61000);
     clock.restore();

     await setTimeout(100);
@@ -72,9 +80,9 @@ describe('Logger', function () {

   it('sends logs as gzipped ndjson', async function () {
     const timestamp = Date.now();
-    logger.log({ message: 'foobar', serviceId: 15 });
-    logger.log({ timestamp: 1337, message: 'foobar', serviceId: 15 });
-    logger.log({ message: 'foobar' }); // shold be ignored
+    await logger.log({ message: 'foobar', serviceId: 15, timestamp: 1000 });
+    await logger.log({ timestamp: 1337, message: 'foobar', serviceId: 15 });
+    await logger.log({ message: 'foobar', isSystem: true, timestamp: 1500 }); // shold be ignored

     await setTimeout(5500);
     expect(this.requestStub.calledOnce).to.be.true;
|
Loading…
x
Reference in New Issue
Block a user