From b39f0964c93bcf9733ae2636d8ec82cb778e9c25 Mon Sep 17 00:00:00 2001 From: Joe Ranweiler Date: Thu, 10 Jun 2021 07:40:32 -0700 Subject: [PATCH] Retry coverage recording (#978) --- .../src/tasks/coverage/generic.rs | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/src/agent/onefuzz-agent/src/tasks/coverage/generic.rs b/src/agent/onefuzz-agent/src/tasks/coverage/generic.rs index 826843f82..917170574 100644 --- a/src/agent/onefuzz-agent/src/tasks/coverage/generic.rs +++ b/src/agent/onefuzz-agent/src/tasks/coverage/generic.rs @@ -13,7 +13,7 @@ use coverage::cache::ModuleCache; use coverage::code::{CmdFilter, CmdFilterDef}; use onefuzz::expand::Expand; use onefuzz::syncdir::SyncedDir; -use onefuzz_telemetry::{Event::coverage_data, EventData}; +use onefuzz_telemetry::{warn, Event::coverage_data, EventData}; use serde::de::DeserializeOwned; use storage_queue::{Message, QueueClient}; use tokio::fs; @@ -25,6 +25,7 @@ use crate::tasks::config::CommonConfig; use crate::tasks::generic::input_poller::{CallbackImpl, InputPoller, Processor}; use crate::tasks::heartbeat::{HeartbeatSender, TaskHeartbeatClient}; +const MAX_COVERAGE_RECORDING_ATTEMPTS: usize = 2; const COVERAGE_FILE: &str = "coverage.json"; const MODULE_CACHE_FILE: &str = "module-cache.json"; @@ -188,6 +189,40 @@ impl<'a> TaskContext<'a> { } pub async fn record_input(&mut self, input: &Path) -> Result<()> { + let attempts = MAX_COVERAGE_RECORDING_ATTEMPTS; + + for attempt in 1..=attempts { + let result = self.try_record_input(input).await; + + if let Err(err) = &result { + // Recording failed, check if we can retry. + if attempt < attempts { + // We will retry, but warn to capture the error if we succeed. + warn!( + "error recording coverage for input = {}: {:?}", + input.display(), + err + ); + } else { + // Final attempt, do not retry. + return result.with_context(|| { + format_err!( + "failed to record coverage for input = {} after {} attempts", + input.display(), + attempts + ) + }); + } + } else { + // We successfully recorded the coverage for `input`, so stop. + break; + } + } + + Ok(()) + } + + async fn try_record_input(&mut self, input: &Path) -> Result<()> { let coverage = self.record_impl(input).await?; self.coverage.merge_max(&coverage);