Fail fast if managed task workers are near-OOM (#1657)

- Add `onefuzz::memory::available_bytes()` to enable checking system-wide memory usage
- In managed task worker runs, heuristically check for imminent OOM conditions and try to exit early
This commit is contained in:
Joe Ranweiler
2022-02-28 21:36:52 -08:00
committed by GitHub
parent 7f932167fe
commit 1b019818b5
8 changed files with 288 additions and 3 deletions

1
src/agent/Cargo.lock generated
View File

@ -1803,6 +1803,7 @@ dependencies = [
"url-escape",
"urlparse",
"uuid",
"winapi 0.3.9",
"winreg 0.10.1",
]

View File

@ -1,10 +1,17 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
use std::path::PathBuf;
#[cfg(not(target_os = "macos"))]
use std::time::Duration;
use crate::tasks::config::{CommonConfig, Config};
use anyhow::Result;
use clap::{App, Arg, SubCommand};
use std::path::PathBuf;
use crate::tasks::config::{CommonConfig, Config};
// Delay between successive polls of available system memory in the
// out-of-memory check loop.
#[cfg(not(target_os = "macos"))]
const OOM_CHECK_INTERVAL: Duration = Duration::from_secs(5);
pub async fn run(args: &clap::ArgMatches<'_>) -> Result<()> {
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
@ -13,7 +20,22 @@ pub async fn run(args: &clap::ArgMatches<'_>) -> Result<()> {
let config = Config::from_file(config_path, setup_dir)?;
init_telemetry(config.common());
let result = config.run().await;
let min_available_memory_bytes = 1_000_000 * config.common().min_available_memory_mb;
// If the memory limit is 0, this will resolve immediately with an error.
let check_oom = out_of_memory(min_available_memory_bytes);
let result = tokio::select! {
result = config.run() => result,
// Ignore this task if it returns due to a querying error.
Ok(oom) = check_oom => {
// Convert the OOM notification to an error, so we can log it below.
let err = format_err!("out of memory: {} bytes available, {} required", oom.available_bytes, oom.min_bytes);
Err(err)
},
};
if let Err(err) = &result {
error!("error running task: {:?}", err);
@ -23,6 +45,61 @@ pub async fn run(args: &clap::ArgMatches<'_>) -> Result<()> {
result
}
// Number of *consecutive* memory-query failures that are tolerated. Once the
// count exceeds this value, the out-of-memory check gives up and returns the
// most recent query error.
#[cfg(not(target_os = "macos"))]
const MAX_OOM_QUERY_ERRORS: usize = 5;
// Poll available system memory until it falls below `min_bytes`.
//
// Resolves with `Ok(OutOfMemory)` as soon as a poll reports fewer than
// `min_bytes` available. Resolves with `Err` when:
//
// - `min_bytes` is 0 (nothing can ever be below the minimum), or
// - more than `MAX_OOM_QUERY_ERRORS` polls in a row fail, in which case the
//   last query error is returned.
//
// Otherwise it never resolves, sleeping `OOM_CHECK_INTERVAL` between polls.
//
// The threshold is a parameter (not a constant) to enable future
// configuration by VMSS.
#[cfg(not(target_os = "macos"))]
async fn out_of_memory(min_bytes: u64) -> Result<OutOfMemory> {
    if min_bytes == 0 {
        bail!("available memory minimum is unreachable");
    }

    let mut failures_in_a_row = 0;

    loop {
        match onefuzz::memory::available_bytes() {
            // Below the threshold: report the observed value and stop polling.
            Ok(available_bytes) if available_bytes < min_bytes => {
                return Ok(OutOfMemory {
                    available_bytes,
                    min_bytes,
                });
            }
            // Healthy reading: only *consecutive* failures count, so start over.
            Ok(_) => failures_in_a_row = 0,
            Err(err) => {
                warn!("error querying system memory usage: {}", err);

                failures_in_a_row += 1;

                if failures_in_a_row > MAX_OOM_QUERY_ERRORS {
                    return Err(err);
                }
            }
        }

        tokio::time::sleep(OOM_CHECK_INTERVAL).await;
    }
}
// macOS stand-in for the out-of-memory check: no memory query is performed
// here, so the future always fails right away with an error instead of
// polling. The `_min_bytes` argument is unused.
#[cfg(target_os = "macos")]
async fn out_of_memory(_min_bytes: u64) -> Result<OutOfMemory> {
// Resolve immediately.
bail!("out-of-memory check not implemented on macOS")
}
// Notification produced by `out_of_memory` when available memory has dropped
// below the configured minimum.
struct OutOfMemory {
// Available system memory, in bytes, at the time of the failing check.
available_bytes: u64,
// The minimum, in bytes, that the check was asked to enforce.
min_bytes: u64,
}
fn init_telemetry(config: &CommonConfig) {
onefuzz_telemetry::set_appinsights_clients(
config.instance_telemetry_key.clone(),

View File

@ -20,6 +20,12 @@ use serde::{self, Deserialize};
use std::{path::PathBuf, sync::Arc, time::Duration};
use uuid::Uuid;
/// Default value, in MB, for `min_available_memory_mb`.
const DEFAULT_MIN_AVAILABLE_MEMORY_MB: u64 = 100;
/// Returns `DEFAULT_MIN_AVAILABLE_MEMORY_MB`.
///
/// Exists as a function so it can be named in a
/// `#[serde(default = "...")]` attribute.
fn default_min_available_memory_mb() -> u64 {
DEFAULT_MIN_AVAILABLE_MEMORY_MB
}
#[derive(Debug, Deserialize, PartialEq, Clone)]
pub enum ContainerType {
#[serde(alias = "inputs")]
@ -42,6 +48,14 @@ pub struct CommonConfig {
#[serde(default)]
pub setup_dir: PathBuf,
/// Lower bound on available system memory. If the available memory drops
/// below the limit, the task will exit with an error. This is a fail-fast
/// mechanism to support debugging.
///
/// Can be disabled by setting to 0.
#[serde(default = "default_min_available_memory_mb")]
pub min_available_memory_mb: u64,
}
impl CommonConfig {

View File

@ -47,6 +47,7 @@ backoff = { version = "0.3", features = ["tokio"] }
winreg = "0.10"
input-tester = { path = "../input-tester" }
debugger = { path = "../debugger" }
winapi = { version = "0.3", features = ["impl-default", "psapi"] }
[target.'cfg(target_family = "unix")'.dependencies]
cpp_demangle = "0.3"

View File

@ -0,0 +1,11 @@
// Print the amount of available system memory, as an exact byte count and as
// an approximate figure in (decimal) gigabytes.
//
// Panics if the memory query fails.
#[cfg(not(target_os = "macos"))]
fn main() {
    let available = onefuzz::memory::available_bytes().unwrap();

    println!(
        "available bytes: {} ({:.1} GB)",
        available,
        (available as f64) * 1e-9
    );
}
// macOS build of this program: there is nothing to report, so it panics via
// `unimplemented!()` when run.
#[cfg(target_os = "macos")]
fn main() {
unimplemented!()
}

View File

@ -20,6 +20,7 @@ pub mod input_tester;
pub mod jitter;
pub mod libfuzzer;
pub mod machine_id;
pub mod memory;
pub mod monitor;
pub mod process;
pub mod sha256;

View File

@ -0,0 +1,81 @@
#[cfg(target_os = "windows")]
use std::convert::TryFrom;
#[cfg(not(target_os = "macos"))]
use anyhow::Result;
#[cfg(target_os = "linux")]
use regex::Regex;
#[cfg(target_os = "windows")]
use winapi::um::psapi::PERFORMANCE_INFORMATION;
/// Returns the amount of memory, in bytes, that can still be committed
/// system-wide.
///
/// Computed from `GetPerformanceInfo` as the commit headroom
/// (`CommitLimit - CommitTotal`, in pages, clamped at zero) multiplied by the
/// page size.
///
/// # Errors
///
/// Returns an error if the performance information cannot be queried, or if a
/// reported value does not fit in a `u64`.
#[cfg(target_os = "windows")]
pub fn available_bytes() -> Result<u64> {
    let info = get_performance_info()?;

    // The fields are `SIZE_T` (`usize`). Widen to `u64` *before* multiplying:
    // on a 32-bit target, `pages * page_size` in `usize` can exceed 4 GiB and
    // would panic (debug) or silently wrap (release).
    let pages = u64::try_from(info.CommitLimit.saturating_sub(info.CommitTotal))?;
    let page_size = u64::try_from(info.PageSize)?;

    // Saturate rather than wrap. An overflow could only mean "more memory than
    // `u64` can express", for which `u64::MAX` is the safe answer to a
    // lower-bound check.
    Ok(pages.saturating_mul(page_size))
}
/// Queries system-wide performance counters via the Win32
/// `GetPerformanceInfo` API.
///
/// # Errors
///
/// Returns an error carrying the `GetLastError` code (in hex) if the call
/// reports failure.
#[cfg(target_os = "windows")]
fn get_performance_info() -> Result<PERFORMANCE_INFORMATION> {
    use winapi::shared::minwindef::FALSE;
    use winapi::um::errhandlingapi::GetLastError;
    use winapi::um::psapi::GetPerformanceInfo;

    // Will always fit in a `u32`.
    //
    // https://docs.microsoft.com/en-us/windows/win32/api/psapi/ns-psapi-performance_information
    let cb = u32::try_from(std::mem::size_of::<PERFORMANCE_INFORMATION>())?;

    let mut perf_info = PERFORMANCE_INFORMATION::default();

    // SAFETY: `perf_info` is a live, exclusively borrowed value and `cb` is
    // exactly its size in bytes.
    let ok = unsafe { GetPerformanceInfo(&mut perf_info, cb) };

    if ok != FALSE {
        return Ok(perf_info);
    }

    // Read the error code immediately after the failed call, before anything
    // else can overwrite the thread's last-error value.
    //
    // SAFETY: `GetLastError` takes no arguments and only reads thread-local
    // state.
    let code = unsafe { GetLastError() };

    bail!("error querying performance information: {:x}", code);
}
/// Returns the system-wide available memory, in bytes, as reported by the
/// `MemAvailable` entry of `/proc/meminfo`.
///
/// # Errors
///
/// Returns an error if `/proc/meminfo` cannot be read, or if it has no
/// parseable `MemAvailable` entry.
#[cfg(target_os = "linux")]
pub fn available_bytes() -> Result<u64> {
    const BYTES_PER_KB: u64 = 1024;

    let contents = std::fs::read_to_string("/proc/meminfo")?;

    // The entry is reported in `kB`; scale it to bytes.
    parse_available_kb(&contents).map(|kb| kb * BYTES_PER_KB)
}
/// Extracts the `MemAvailable` value, in kB, from `/proc/meminfo`-formatted
/// text.
///
/// The first occurrence matched by the `AVAILABLE_KB` pattern wins.
///
/// # Errors
///
/// Returns an error if the pattern does not match, or if the captured digits
/// do not parse as a `u64`.
#[cfg(target_os = "linux")]
fn parse_available_kb(meminfo: &str) -> Result<u64> {
    // Both "no match" and "no capture group" are reported identically.
    let digits = AVAILABLE_KB
        .captures(meminfo)
        .and_then(|caps| caps.get(1))
        .ok_or_else(|| format_err!("`MemAvailable` not found in `/proc/meminfo`"))?;

    let kb = digits.as_str().parse()?;

    Ok(kb)
}
// Lazily-compiled pattern for the `MemAvailable` entry: the literal label,
// optional whitespace, then a run of digits (capture group 1) followed by a
// single space and the `kB` unit. The pattern is unanchored, so it may match
// anywhere in the input.
#[cfg(target_os = "linux")]
lazy_static::lazy_static! {
static ref AVAILABLE_KB: Regex = Regex::new(r"MemAvailable:\s*(\d+) kB").unwrap();
}
#[cfg(test)]
#[cfg(target_os = "linux")]
mod tests_linux;

View File

@ -0,0 +1,99 @@
use anyhow::Result;
use super::parse_available_kb;
// Every input below contains a well-formed `MemAvailable` entry and must
// parse to the same value, regardless of surrounding text, leading whitespace,
// or a second (ignored) occurrence.
#[test]
fn test_parse_available_kb() -> Result<()> {
    const EXPECTED_KB: u64 = 1001092;

    let inputs = [
        MEMINFO,
        "MemAvailable: 1001092 kB",
        "MemAvailable: 1001092 kB\tMemAvailable: 123 kB",
        " MemAvailable: 1001092 kB",
        " MemAvailable:1001092 kB",
        " MemAvailable: 1001092 kB",
        " MemAvailable: 1001092 kB",
        "extra MemAvailable: 1001092 kB",
        "extra MemAvailable:1001092 kB",
        "extra MemAvailable: 1001092 kB",
        "extra MemAvailable: 1001092 kB",
    ];

    for input in inputs.iter() {
        assert_eq!(parse_available_kb(input)?, EXPECTED_KB);
    }

    Ok(())
}
// Inputs with no `MemAvailable` entry, no value, or the wrong unit must all be
// rejected.
#[test]
fn test_parse_available_kb_missing() {
    let rejected = [
        "",
        "1001092",
        "MemAvailable: ",
        "MemAvailable: 1001092 MB",
        "MemFree: 198308 kB",
    ];

    for input in rejected.iter() {
        assert!(parse_available_kb(input).is_err());
    }
}
// Sample `/proc/meminfo`-style text used as a parser fixture. Its
// `MemAvailable` entry is 1001092 kB.
const MEMINFO: &str = "MemTotal: 16036984 kB
MemFree: 198308 kB
MemAvailable: 1001092 kB
Buffers: 521880 kB
Cached: 459416 kB
SwapCached: 1580 kB
Active: 830140 kB
Inactive: 206728 kB
Active(anon): 22492 kB
Inactive(anon): 28876 kB
Active(file): 807648 kB
Inactive(file): 177852 kB
Unevictable: 0 kB
Mlocked: 0 kB
SwapTotal: 4194300 kB
SwapFree: 4181440 kB
Dirty: 8 kB
Writeback: 0 kB
AnonPages: 54368 kB
Mapped: 31344 kB
Shmem: 792 kB
Slab: 192900 kB
SReclaimable: 131056 kB
SUnreclaim: 61844 kB
KernelStack: 3104 kB
PageTables: 5324 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
WritebackTmp: 0 kB
CommitLimit: 12212792 kB
Committed_AS: 575108 kB
VmallocTotal: 34359738367 kB
VmallocUsed: 0 kB
VmallocChunk: 0 kB
HardwareCorrupted: 0 kB
AnonHugePages: 0 kB
ShmemHugePages: 0 kB
ShmemPmdMapped: 0 kB
CmaTotal: 0 kB
CmaFree: 0 kB
HugePages_Total: 0
HugePages_Free: 0
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 2048 kB
DirectMap4k: 152880 kB
DirectMap2M: 4696064 kB
DirectMap1G: 11534336 kB";