mirror of
https://github.com/microsoft/onefuzz.git
synced 2025-06-14 19:08:08 +00:00
Fail fast if managed task workers are near-OOM (#1657)
- Add `onefuzz::memory::available_bytes()` to enable checking system-wide memory usage
- In managed task worker runs, heuristically check for imminent OOM conditions and try to exit early
This commit is contained in:
1
src/agent/Cargo.lock
generated
1
src/agent/Cargo.lock
generated
@ -1803,6 +1803,7 @@ dependencies = [
|
||||
"url-escape",
|
||||
"urlparse",
|
||||
"uuid",
|
||||
"winapi 0.3.9",
|
||||
"winreg 0.10.1",
|
||||
]
|
||||
|
||||
|
@ -1,10 +1,17 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT License.
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::tasks::config::{CommonConfig, Config};
|
||||
use anyhow::Result;
|
||||
use clap::{App, Arg, SubCommand};
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::tasks::config::{CommonConfig, Config};
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
/// Delay between successive polls of available system memory in `out_of_memory`.
const OOM_CHECK_INTERVAL: Duration = Duration::from_secs(5);
|
||||
|
||||
pub async fn run(args: &clap::ArgMatches<'_>) -> Result<()> {
|
||||
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
|
||||
@ -13,7 +20,22 @@ pub async fn run(args: &clap::ArgMatches<'_>) -> Result<()> {
|
||||
let config = Config::from_file(config_path, setup_dir)?;
|
||||
|
||||
init_telemetry(config.common());
|
||||
let result = config.run().await;
|
||||
|
||||
let min_available_memory_bytes = 1_000_000 * config.common().min_available_memory_mb;
|
||||
|
||||
// If the memory limit is 0, this will resolve immediately with an error.
|
||||
let check_oom = out_of_memory(min_available_memory_bytes);
|
||||
|
||||
let result = tokio::select! {
|
||||
result = config.run() => result,
|
||||
|
||||
// Ignore this task if it returns due to a querying error.
|
||||
Ok(oom) = check_oom => {
|
||||
// Convert the OOM notification to an error, so we can log it below.
|
||||
let err = format_err!("out of memory: {} bytes available, {} required", oom.available_bytes, oom.min_bytes);
|
||||
Err(err)
|
||||
},
|
||||
};
|
||||
|
||||
if let Err(err) = &result {
|
||||
error!("error running task: {:?}", err);
|
||||
@ -23,6 +45,61 @@ pub async fn run(args: &clap::ArgMatches<'_>) -> Result<()> {
|
||||
result
|
||||
}
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
/// Number of consecutive memory-query failures tolerated by `out_of_memory`;
/// once the count exceeds this value, the check gives up with an error.
const MAX_OOM_QUERY_ERRORS: usize = 5;
|
||||
|
||||
// Periodically check available system memory.
|
||||
//
|
||||
// If available memory drops below the minimum, exit informatively.
|
||||
//
|
||||
// Parameterized to enable future configuration by VMSS.
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
async fn out_of_memory(min_bytes: u64) -> Result<OutOfMemory> {
|
||||
if min_bytes == 0 {
|
||||
bail!("available memory minimum is unreachable");
|
||||
}
|
||||
|
||||
let mut consecutive_query_errors = 0;
|
||||
|
||||
loop {
|
||||
match onefuzz::memory::available_bytes() {
|
||||
Ok(available_bytes) => {
|
||||
// Reset so we count consecutive errors.
|
||||
consecutive_query_errors = 0;
|
||||
|
||||
if available_bytes < min_bytes {
|
||||
return Ok(OutOfMemory {
|
||||
available_bytes,
|
||||
min_bytes,
|
||||
});
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
warn!("error querying system memory usage: {}", err);
|
||||
|
||||
consecutive_query_errors += 1;
|
||||
|
||||
if consecutive_query_errors > MAX_OOM_QUERY_ERRORS {
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tokio::time::sleep(OOM_CHECK_INTERVAL).await;
|
||||
}
|
||||
}
|
||||
|
||||
/// macOS stand-in for the out-of-memory check.
///
/// No memory query is performed: the future resolves immediately with an
/// error stating that the check is not implemented on this platform.
#[cfg(target_os = "macos")]
async fn out_of_memory(_min_bytes: u64) -> Result<OutOfMemory> {
    Err(format_err!("out-of-memory check not implemented on macOS"))
}
|
||||
|
||||
/// Notification that available system memory fell below the configured minimum.
#[derive(Debug)]
struct OutOfMemory {
    /// Available memory, in bytes, reported by the sample that crossed the threshold.
    available_bytes: u64,

    /// Minimum number of available bytes that was required.
    min_bytes: u64,
}
|
||||
|
||||
fn init_telemetry(config: &CommonConfig) {
|
||||
onefuzz_telemetry::set_appinsights_clients(
|
||||
config.instance_telemetry_key.clone(),
|
||||
|
@ -20,6 +20,12 @@ use serde::{self, Deserialize};
|
||||
use std::{path::PathBuf, sync::Arc, time::Duration};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// Default lower bound on available system memory, in megabytes.
const DEFAULT_MIN_AVAILABLE_MEMORY_MB: u64 = 100;

/// Supplies the serde default for `CommonConfig::min_available_memory_mb`.
fn default_min_available_memory_mb() -> u64 {
    DEFAULT_MIN_AVAILABLE_MEMORY_MB
}
|
||||
|
||||
#[derive(Debug, Deserialize, PartialEq, Clone)]
|
||||
pub enum ContainerType {
|
||||
#[serde(alias = "inputs")]
|
||||
@ -42,6 +48,14 @@ pub struct CommonConfig {
|
||||
|
||||
#[serde(default)]
|
||||
pub setup_dir: PathBuf,
|
||||
|
||||
/// Lower bound on available system memory. If the available memory drops
|
||||
/// below the limit, the task will exit with an error. This is a fail-fast
|
||||
/// mechanism to support debugging.
|
||||
///
|
||||
/// Can be disabled by setting to 0.
|
||||
#[serde(default = "default_min_available_memory_mb")]
|
||||
pub min_available_memory_mb: u64,
|
||||
}
|
||||
|
||||
impl CommonConfig {
|
||||
|
@ -47,6 +47,7 @@ backoff = { version = "0.3", features = ["tokio"] }
|
||||
winreg = "0.10"
|
||||
input-tester = { path = "../input-tester" }
|
||||
debugger = { path = "../debugger" }
|
||||
winapi = { version = "0.3", features = ["impl-default", "psapi"] }
|
||||
|
||||
[target.'cfg(target_family = "unix")'.dependencies]
|
||||
cpp_demangle = "0.3"
|
||||
|
11
src/agent/onefuzz/examples/memory.rs
Normal file
11
src/agent/onefuzz/examples/memory.rs
Normal file
@ -0,0 +1,11 @@
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
fn main() {
|
||||
let bytes = onefuzz::memory::available_bytes().unwrap();
|
||||
let gb = (bytes as f64) * 1e-9;
|
||||
println!("available bytes: {} ({:.1} GB)", bytes, gb);
|
||||
}
|
||||
|
||||
#[cfg(target_os = "macos")]
|
||||
fn main() {
|
||||
unimplemented!()
|
||||
}
|
@ -20,6 +20,7 @@ pub mod input_tester;
|
||||
pub mod jitter;
|
||||
pub mod libfuzzer;
|
||||
pub mod machine_id;
|
||||
pub mod memory;
|
||||
pub mod monitor;
|
||||
pub mod process;
|
||||
pub mod sha256;
|
||||
|
81
src/agent/onefuzz/src/memory.rs
Normal file
81
src/agent/onefuzz/src/memory.rs
Normal file
@ -0,0 +1,81 @@
|
||||
#[cfg(target_os = "windows")]
|
||||
use std::convert::TryFrom;
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
use anyhow::Result;
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
use regex::Regex;
|
||||
|
||||
#[cfg(target_os = "windows")]
|
||||
use winapi::um::psapi::PERFORMANCE_INFORMATION;
|
||||
|
||||
/// Returns the number of bytes that can still be committed system-wide.
///
/// Computed as `CommitLimit - CommitTotal` (saturating at zero) from the
/// system performance information, scaled from pages to bytes by `PageSize`.
///
/// # Errors
///
/// Returns an error if the performance information cannot be queried, or if
/// the result cannot be represented as a `u64`.
#[cfg(target_os = "windows")]
pub fn available_bytes() -> Result<u64> {
    let info = get_performance_info()?;

    // Widen to `u64` *before* multiplying: the raw fields are pointer-sized,
    // so multiplying them directly could overflow on 32-bit targets.
    let pages = u64::try_from(info.CommitLimit.saturating_sub(info.CommitTotal))?;
    let page_size = u64::try_from(info.PageSize)?;

    match pages.checked_mul(page_size) {
        Some(bytes) => Ok(bytes),
        None => bail!("available memory size overflows `u64`"),
    }
}
|
||||
|
||||
/// Queries system-wide performance information via `GetPerformanceInfo`.
///
/// # Errors
///
/// Returns an error containing the `GetLastError` code (in hex) if the call
/// reports failure.
#[cfg(target_os = "windows")]
fn get_performance_info() -> Result<PERFORMANCE_INFORMATION> {
    use winapi::shared::minwindef::FALSE;
    use winapi::um::errhandlingapi::GetLastError;
    use winapi::um::psapi::GetPerformanceInfo;

    let mut info = PERFORMANCE_INFORMATION::default();

    // Will always fit in a `u32`.
    //
    // https://docs.microsoft.com/en-us/windows/win32/api/psapi/ns-psapi-performance_information
    let size = u32::try_from(std::mem::size_of::<PERFORMANCE_INFORMATION>())?;

    // SAFETY: `info` is a live, exclusively borrowed `PERFORMANCE_INFORMATION`
    // and `size` is exactly its size in bytes, so the callee writes only
    // within the bounds of `info`.
    let success = unsafe { GetPerformanceInfo(&mut info, size) };

    if success == FALSE {
        // SAFETY: `GetLastError` takes no arguments and only reads the calling
        // thread's last-error value.
        let code = unsafe { GetLastError() };
        bail!("error querying performance information: {:x}", code);
    }

    Ok(info)
}
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
pub fn available_bytes() -> Result<u64> {
|
||||
const BYTES_PER_KB: u64 = 1024;
|
||||
|
||||
let meminfo = std::fs::read_to_string("/proc/meminfo")?;
|
||||
let available_kb = parse_available_kb(&meminfo)?;
|
||||
let available_bytes = available_kb * BYTES_PER_KB;
|
||||
|
||||
Ok(available_bytes)
|
||||
}
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
fn parse_available_kb(meminfo: &str) -> Result<u64> {
|
||||
let captures = AVAILABLE_KB
|
||||
.captures(meminfo)
|
||||
.ok_or_else(|| format_err!("`MemAvailable` not found in `/proc/meminfo`"))?;
|
||||
|
||||
let available_kb = captures
|
||||
.get(1)
|
||||
.ok_or_else(|| format_err!("`MemAvailable` not found in `/proc/meminfo`"))?
|
||||
.as_str()
|
||||
.parse()?;
|
||||
|
||||
Ok(available_kb)
|
||||
}
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
lazy_static::lazy_static! {
|
||||
static ref AVAILABLE_KB: Regex = Regex::new(r"MemAvailable:\s*(\d+) kB").unwrap();
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[cfg(target_os = "linux")]
|
||||
mod tests_linux;
|
99
src/agent/onefuzz/src/memory/tests_linux.rs
Normal file
99
src/agent/onefuzz/src/memory/tests_linux.rs
Normal file
@ -0,0 +1,99 @@
|
||||
use anyhow::Result;
|
||||
|
||||
use super::parse_available_kb;
|
||||
|
||||
/// Every input below contains a well-formed `MemAvailable` entry whose first
/// occurrence carries the same value, regardless of surrounding text.
#[test]
fn test_parse_available_kb() -> Result<()> {
    const EXPECTED_KB: u64 = 1001092;

    let inputs = [
        MEMINFO,
        "MemAvailable: 1001092 kB",
        "MemAvailable: 1001092 kB\tMemAvailable: 123 kB",
        " MemAvailable: 1001092 kB",
        " MemAvailable:1001092 kB",
        " MemAvailable: 1001092 kB",
        " MemAvailable: 1001092 kB",
        "extra MemAvailable: 1001092 kB",
        "extra MemAvailable:1001092 kB",
        "extra MemAvailable: 1001092 kB",
        "extra MemAvailable: 1001092 kB",
    ];

    for &input in &inputs {
        assert_eq!(parse_available_kb(input)?, EXPECTED_KB);
    }

    Ok(())
}
|
||||
|
||||
/// Inputs lacking a complete `MemAvailable: <digits> kB` entry must be rejected.
#[test]
fn test_parse_available_kb_missing() {
    let rejected = [
        "",
        "1001092",
        "MemAvailable: ",
        "MemAvailable: 1001092 MB",
        "MemFree: 198308 kB",
    ];

    for &input in &rejected {
        assert!(parse_available_kb(input).is_err());
    }
}
|
||||
|
||||
// Sample `/proc/meminfo` contents used as a parsing fixture.
// Its `MemAvailable` entry is 1001092 kB.
const MEMINFO: &str = "MemTotal: 16036984 kB
MemFree: 198308 kB
MemAvailable: 1001092 kB
Buffers: 521880 kB
Cached: 459416 kB
SwapCached: 1580 kB
Active: 830140 kB
Inactive: 206728 kB
Active(anon): 22492 kB
Inactive(anon): 28876 kB
Active(file): 807648 kB
Inactive(file): 177852 kB
Unevictable: 0 kB
Mlocked: 0 kB
SwapTotal: 4194300 kB
SwapFree: 4181440 kB
Dirty: 8 kB
Writeback: 0 kB
AnonPages: 54368 kB
Mapped: 31344 kB
Shmem: 792 kB
Slab: 192900 kB
SReclaimable: 131056 kB
SUnreclaim: 61844 kB
KernelStack: 3104 kB
PageTables: 5324 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
WritebackTmp: 0 kB
CommitLimit: 12212792 kB
Committed_AS: 575108 kB
VmallocTotal: 34359738367 kB
VmallocUsed: 0 kB
VmallocChunk: 0 kB
HardwareCorrupted: 0 kB
AnonHugePages: 0 kB
ShmemHugePages: 0 kB
ShmemPmdMapped: 0 kB
CmaTotal: 0 kB
CmaFree: 0 kB
HugePages_Total: 0
HugePages_Free: 0
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 2048 kB
DirectMap4k: 152880 kB
DirectMap2M: 4696064 kB
DirectMap1G: 11534336 kB";
|
Reference in New Issue
Block a user