mirror of
https://github.com/microsoft/onefuzz.git
synced 2025-06-14 19:08:08 +00:00
Fail fast if managed task workers are near-OOM (#1657)
- Add `onefuzz::memory::available_bytes()` to enable checking system-wide memory usage
- In managed task worker runs, heuristically check for imminent OOM conditions and try to exit early
This commit is contained in:
1
src/agent/Cargo.lock
generated
1
src/agent/Cargo.lock
generated
@ -1803,6 +1803,7 @@ dependencies = [
|
|||||||
"url-escape",
|
"url-escape",
|
||||||
"urlparse",
|
"urlparse",
|
||||||
"uuid",
|
"uuid",
|
||||||
|
"winapi 0.3.9",
|
||||||
"winreg 0.10.1",
|
"winreg 0.10.1",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -1,10 +1,17 @@
|
|||||||
// Copyright (c) Microsoft Corporation.
|
// Copyright (c) Microsoft Corporation.
|
||||||
// Licensed under the MIT License.
|
// Licensed under the MIT License.
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
#[cfg(not(target_os = "macos"))]
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
use crate::tasks::config::{CommonConfig, Config};
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use clap::{App, Arg, SubCommand};
|
use clap::{App, Arg, SubCommand};
|
||||||
use std::path::PathBuf;
|
|
||||||
|
use crate::tasks::config::{CommonConfig, Config};
|
||||||
|
|
||||||
|
#[cfg(not(target_os = "macos"))]
|
||||||
|
const OOM_CHECK_INTERVAL: Duration = Duration::from_secs(5);
|
||||||
|
|
||||||
pub async fn run(args: &clap::ArgMatches<'_>) -> Result<()> {
|
pub async fn run(args: &clap::ArgMatches<'_>) -> Result<()> {
|
||||||
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
|
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
|
||||||
@ -13,7 +20,22 @@ pub async fn run(args: &clap::ArgMatches<'_>) -> Result<()> {
|
|||||||
let config = Config::from_file(config_path, setup_dir)?;
|
let config = Config::from_file(config_path, setup_dir)?;
|
||||||
|
|
||||||
init_telemetry(config.common());
|
init_telemetry(config.common());
|
||||||
let result = config.run().await;
|
|
||||||
|
let min_available_memory_bytes = 1_000_000 * config.common().min_available_memory_mb;
|
||||||
|
|
||||||
|
// If the memory limit is 0, this will resolve immediately with an error.
|
||||||
|
let check_oom = out_of_memory(min_available_memory_bytes);
|
||||||
|
|
||||||
|
let result = tokio::select! {
|
||||||
|
result = config.run() => result,
|
||||||
|
|
||||||
|
// Ignore this task if it returns due to a querying error.
|
||||||
|
Ok(oom) = check_oom => {
|
||||||
|
// Convert the OOM notification to an error, so we can log it below.
|
||||||
|
let err = format_err!("out of memory: {} bytes available, {} required", oom.available_bytes, oom.min_bytes);
|
||||||
|
Err(err)
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
if let Err(err) = &result {
|
if let Err(err) = &result {
|
||||||
error!("error running task: {:?}", err);
|
error!("error running task: {:?}", err);
|
||||||
@ -23,6 +45,61 @@ pub async fn run(args: &clap::ArgMatches<'_>) -> Result<()> {
|
|||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(not(target_os = "macos"))]
|
||||||
|
const MAX_OOM_QUERY_ERRORS: usize = 5;
|
||||||
|
|
||||||
|
// Periodically check available system memory.
|
||||||
|
//
|
||||||
|
// If available memory drops below the minimum, exit informatively.
|
||||||
|
//
|
||||||
|
// Parameterized to enable future configuration by VMSS.
|
||||||
|
#[cfg(not(target_os = "macos"))]
|
||||||
|
async fn out_of_memory(min_bytes: u64) -> Result<OutOfMemory> {
|
||||||
|
if min_bytes == 0 {
|
||||||
|
bail!("available memory minimum is unreachable");
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut consecutive_query_errors = 0;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
match onefuzz::memory::available_bytes() {
|
||||||
|
Ok(available_bytes) => {
|
||||||
|
// Reset so we count consecutive errors.
|
||||||
|
consecutive_query_errors = 0;
|
||||||
|
|
||||||
|
if available_bytes < min_bytes {
|
||||||
|
return Ok(OutOfMemory {
|
||||||
|
available_bytes,
|
||||||
|
min_bytes,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
warn!("error querying system memory usage: {}", err);
|
||||||
|
|
||||||
|
consecutive_query_errors += 1;
|
||||||
|
|
||||||
|
if consecutive_query_errors > MAX_OOM_QUERY_ERRORS {
|
||||||
|
return Err(err);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tokio::time::sleep(OOM_CHECK_INTERVAL).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_os = "macos")]
|
||||||
|
async fn out_of_memory(_min_bytes: u64) -> Result<OutOfMemory> {
|
||||||
|
// Resolve immediately.
|
||||||
|
bail!("out-of-memory check not implemented on macOS")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Snapshot describing a detected low-memory condition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct OutOfMemory {
    /// Available system memory, in bytes, at the time of the failing check.
    available_bytes: u64,

    /// The minimum number of available bytes that was required.
    min_bytes: u64,
}
|
||||||
|
|
||||||
fn init_telemetry(config: &CommonConfig) {
|
fn init_telemetry(config: &CommonConfig) {
|
||||||
onefuzz_telemetry::set_appinsights_clients(
|
onefuzz_telemetry::set_appinsights_clients(
|
||||||
config.instance_telemetry_key.clone(),
|
config.instance_telemetry_key.clone(),
|
||||||
|
@ -20,6 +20,12 @@ use serde::{self, Deserialize};
|
|||||||
use std::{path::PathBuf, sync::Arc, time::Duration};
|
use std::{path::PathBuf, sync::Arc, time::Duration};
|
||||||
use uuid::Uuid;
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
/// Default value, in megabytes, for `min_available_memory_mb` when the task
/// config does not specify one.
const DEFAULT_MIN_AVAILABLE_MEMORY_MB: u64 = 100;
|
||||||
|
|
||||||
|
fn default_min_available_memory_mb() -> u64 {
|
||||||
|
DEFAULT_MIN_AVAILABLE_MEMORY_MB
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize, PartialEq, Clone)]
|
#[derive(Debug, Deserialize, PartialEq, Clone)]
|
||||||
pub enum ContainerType {
|
pub enum ContainerType {
|
||||||
#[serde(alias = "inputs")]
|
#[serde(alias = "inputs")]
|
||||||
@ -42,6 +48,14 @@ pub struct CommonConfig {
|
|||||||
|
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub setup_dir: PathBuf,
|
pub setup_dir: PathBuf,
|
||||||
|
|
||||||
|
/// Lower bound on available system memory. If the available memory drops
|
||||||
|
/// below the limit, the task will exit with an error. This is a fail-fast
|
||||||
|
/// mechanism to support debugging.
|
||||||
|
///
|
||||||
|
/// Can be disabled by setting to 0.
|
||||||
|
#[serde(default = "default_min_available_memory_mb")]
|
||||||
|
pub min_available_memory_mb: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CommonConfig {
|
impl CommonConfig {
|
||||||
|
@ -47,6 +47,7 @@ backoff = { version = "0.3", features = ["tokio"] }
|
|||||||
winreg = "0.10"
|
winreg = "0.10"
|
||||||
input-tester = { path = "../input-tester" }
|
input-tester = { path = "../input-tester" }
|
||||||
debugger = { path = "../debugger" }
|
debugger = { path = "../debugger" }
|
||||||
|
winapi = { version = "0.3", features = ["impl-default", "psapi"] }
|
||||||
|
|
||||||
[target.'cfg(target_family = "unix")'.dependencies]
|
[target.'cfg(target_family = "unix")'.dependencies]
|
||||||
cpp_demangle = "0.3"
|
cpp_demangle = "0.3"
|
||||||
|
11
src/agent/onefuzz/examples/memory.rs
Normal file
11
src/agent/onefuzz/examples/memory.rs
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
#[cfg(not(target_os = "macos"))]
|
||||||
|
fn main() {
|
||||||
|
let bytes = onefuzz::memory::available_bytes().unwrap();
|
||||||
|
let gb = (bytes as f64) * 1e-9;
|
||||||
|
println!("available bytes: {} ({:.1} GB)", bytes, gb);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_os = "macos")]
|
||||||
|
/// Placeholder entry point: panics with a message explaining why.
fn main() {
    // Say why the example cannot run instead of a bare "not implemented" panic.
    unimplemented!("querying available memory is not implemented on macOS")
}
|
@ -20,6 +20,7 @@ pub mod input_tester;
|
|||||||
pub mod jitter;
|
pub mod jitter;
|
||||||
pub mod libfuzzer;
|
pub mod libfuzzer;
|
||||||
pub mod machine_id;
|
pub mod machine_id;
|
||||||
|
pub mod memory;
|
||||||
pub mod monitor;
|
pub mod monitor;
|
||||||
pub mod process;
|
pub mod process;
|
||||||
pub mod sha256;
|
pub mod sha256;
|
||||||
|
81
src/agent/onefuzz/src/memory.rs
Normal file
81
src/agent/onefuzz/src/memory.rs
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
#[cfg(target_os = "windows")]
|
||||||
|
use std::convert::TryFrom;
|
||||||
|
|
||||||
|
#[cfg(not(target_os = "macos"))]
|
||||||
|
use anyhow::Result;
|
||||||
|
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
use regex::Regex;
|
||||||
|
|
||||||
|
#[cfg(target_os = "windows")]
|
||||||
|
use winapi::um::psapi::PERFORMANCE_INFORMATION;
|
||||||
|
|
||||||
|
#[cfg(target_os = "windows")]
|
||||||
|
pub fn available_bytes() -> Result<u64> {
|
||||||
|
let info = get_performance_info()?;
|
||||||
|
let pages = info.CommitLimit.saturating_sub(info.CommitTotal);
|
||||||
|
let bytes = pages * info.PageSize;
|
||||||
|
let bytes = u64::try_from(bytes)?;
|
||||||
|
|
||||||
|
Ok(bytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_os = "windows")]
|
||||||
|
fn get_performance_info() -> Result<PERFORMANCE_INFORMATION> {
|
||||||
|
use winapi::shared::minwindef::FALSE;
|
||||||
|
use winapi::um::errhandlingapi::GetLastError;
|
||||||
|
use winapi::um::psapi::GetPerformanceInfo;
|
||||||
|
|
||||||
|
let mut info = PERFORMANCE_INFORMATION::default();
|
||||||
|
|
||||||
|
let success = unsafe {
|
||||||
|
// Will always fit in a `u32`.
|
||||||
|
//
|
||||||
|
// https://docs.microsoft.com/en-us/windows/win32/api/psapi/ns-psapi-performance_information
|
||||||
|
let size = std::mem::size_of::<PERFORMANCE_INFORMATION>();
|
||||||
|
let size = u32::try_from(size)?;
|
||||||
|
GetPerformanceInfo(&mut info, size)
|
||||||
|
};
|
||||||
|
|
||||||
|
if success == FALSE {
|
||||||
|
let code = unsafe { GetLastError() };
|
||||||
|
bail!("error querying performance information: {:x}", code);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(info)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
pub fn available_bytes() -> Result<u64> {
|
||||||
|
const BYTES_PER_KB: u64 = 1024;
|
||||||
|
|
||||||
|
let meminfo = std::fs::read_to_string("/proc/meminfo")?;
|
||||||
|
let available_kb = parse_available_kb(&meminfo)?;
|
||||||
|
let available_bytes = available_kb * BYTES_PER_KB;
|
||||||
|
|
||||||
|
Ok(available_bytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
fn parse_available_kb(meminfo: &str) -> Result<u64> {
|
||||||
|
let captures = AVAILABLE_KB
|
||||||
|
.captures(meminfo)
|
||||||
|
.ok_or_else(|| format_err!("`MemAvailable` not found in `/proc/meminfo`"))?;
|
||||||
|
|
||||||
|
let available_kb = captures
|
||||||
|
.get(1)
|
||||||
|
.ok_or_else(|| format_err!("`MemAvailable` not found in `/proc/meminfo`"))?
|
||||||
|
.as_str()
|
||||||
|
.parse()?;
|
||||||
|
|
||||||
|
Ok(available_kb)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
lazy_static::lazy_static! {
|
||||||
|
static ref AVAILABLE_KB: Regex = Regex::new(r"MemAvailable:\s*(\d+) kB").unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
mod tests_linux;
|
99
src/agent/onefuzz/src/memory/tests_linux.rs
Normal file
99
src/agent/onefuzz/src/memory/tests_linux.rs
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
|
||||||
|
use super::parse_available_kb;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_parse_available_kb() -> Result<()> {
|
||||||
|
assert_eq!(parse_available_kb(MEMINFO)?, 1001092);
|
||||||
|
assert_eq!(parse_available_kb("MemAvailable: 1001092 kB")?, 1001092);
|
||||||
|
assert_eq!(
|
||||||
|
parse_available_kb("MemAvailable: 1001092 kB\tMemAvailable: 123 kB")?,
|
||||||
|
1001092
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
parse_available_kb(" MemAvailable: 1001092 kB")?,
|
||||||
|
1001092
|
||||||
|
);
|
||||||
|
assert_eq!(parse_available_kb(" MemAvailable:1001092 kB")?, 1001092);
|
||||||
|
assert_eq!(parse_available_kb(" MemAvailable: 1001092 kB")?, 1001092);
|
||||||
|
assert_eq!(
|
||||||
|
parse_available_kb(" MemAvailable: 1001092 kB")?,
|
||||||
|
1001092
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
parse_available_kb("extra MemAvailable: 1001092 kB")?,
|
||||||
|
1001092
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
parse_available_kb("extra MemAvailable:1001092 kB")?,
|
||||||
|
1001092
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
parse_available_kb("extra MemAvailable: 1001092 kB")?,
|
||||||
|
1001092
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
parse_available_kb("extra MemAvailable: 1001092 kB")?,
|
||||||
|
1001092
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_parse_available_kb_missing() {
|
||||||
|
assert!(parse_available_kb("").is_err());
|
||||||
|
assert!(parse_available_kb("1001092").is_err());
|
||||||
|
assert!(parse_available_kb("MemAvailable: ").is_err());
|
||||||
|
assert!(parse_available_kb("MemAvailable: 1001092 MB").is_err());
|
||||||
|
assert!(parse_available_kb("MemFree: 198308 kB").is_err());
|
||||||
|
}
|
||||||
|
|
||||||
|
const MEMINFO: &str = "MemTotal: 16036984 kB
|
||||||
|
MemFree: 198308 kB
|
||||||
|
MemAvailable: 1001092 kB
|
||||||
|
Buffers: 521880 kB
|
||||||
|
Cached: 459416 kB
|
||||||
|
SwapCached: 1580 kB
|
||||||
|
Active: 830140 kB
|
||||||
|
Inactive: 206728 kB
|
||||||
|
Active(anon): 22492 kB
|
||||||
|
Inactive(anon): 28876 kB
|
||||||
|
Active(file): 807648 kB
|
||||||
|
Inactive(file): 177852 kB
|
||||||
|
Unevictable: 0 kB
|
||||||
|
Mlocked: 0 kB
|
||||||
|
SwapTotal: 4194300 kB
|
||||||
|
SwapFree: 4181440 kB
|
||||||
|
Dirty: 8 kB
|
||||||
|
Writeback: 0 kB
|
||||||
|
AnonPages: 54368 kB
|
||||||
|
Mapped: 31344 kB
|
||||||
|
Shmem: 792 kB
|
||||||
|
Slab: 192900 kB
|
||||||
|
SReclaimable: 131056 kB
|
||||||
|
SUnreclaim: 61844 kB
|
||||||
|
KernelStack: 3104 kB
|
||||||
|
PageTables: 5324 kB
|
||||||
|
NFS_Unstable: 0 kB
|
||||||
|
Bounce: 0 kB
|
||||||
|
WritebackTmp: 0 kB
|
||||||
|
CommitLimit: 12212792 kB
|
||||||
|
Committed_AS: 575108 kB
|
||||||
|
VmallocTotal: 34359738367 kB
|
||||||
|
VmallocUsed: 0 kB
|
||||||
|
VmallocChunk: 0 kB
|
||||||
|
HardwareCorrupted: 0 kB
|
||||||
|
AnonHugePages: 0 kB
|
||||||
|
ShmemHugePages: 0 kB
|
||||||
|
ShmemPmdMapped: 0 kB
|
||||||
|
CmaTotal: 0 kB
|
||||||
|
CmaFree: 0 kB
|
||||||
|
HugePages_Total: 0
|
||||||
|
HugePages_Free: 0
|
||||||
|
HugePages_Rsvd: 0
|
||||||
|
HugePages_Surp: 0
|
||||||
|
Hugepagesize: 2048 kB
|
||||||
|
DirectMap4k: 152880 kB
|
||||||
|
DirectMap2M: 4696064 kB
|
||||||
|
DirectMap1G: 11534336 kB";
|
Reference in New Issue
Block a user