mirror of
https://github.com/microsoft/onefuzz.git
synced 2025-06-15 19:38:11 +00:00
always retry on specific errors from azcopy (#1196)
This commit is contained in:
@ -20,6 +20,14 @@ use url::Url;
|
|||||||
const RETRY_INTERVAL: Duration = Duration::from_secs(5);
|
const RETRY_INTERVAL: Duration = Duration::from_secs(5);
|
||||||
const RETRY_COUNT: usize = 5;
|
const RETRY_COUNT: usize = 5;
|
||||||
|
|
||||||
|
/// Substrings of azcopy error output that mark a failure as always worth retrying.
///
/// There isn't an ergonomic method to sync between the OneFuzz agent and fuzzers
/// generating data. As such, azcopy commands that fail with errors caused by the
/// fuzzers writing files while a sync is occurring should always be retried.
///
/// ref: https://github.com/microsoft/onefuzz/issues/1189
const ALWAYS_RETRY_ERROR_STRINGS: &[&str] = &["source modified during transfer"];
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
#[derive(Clone, Copy)]
|
||||||
enum Mode {
|
enum Mode {
|
||||||
Copy,
|
Copy,
|
||||||
@ -106,21 +114,47 @@ async fn az_impl(mode: Mode, src: &OsStr, dst: &OsStr, args: &[&str]) -> Result<
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Work around issues where azcopy fails with an error we should consider
|
||||||
|
// "acceptable" to always retry on.
|
||||||
|
fn should_always_retry(err: &anyhow::Error) -> bool {
|
||||||
|
let as_string = format!("{:?}", err);
|
||||||
|
for value in ALWAYS_RETRY_ERROR_STRINGS {
|
||||||
|
if as_string.contains(value) {
|
||||||
|
info!(
|
||||||
|
"azcopy failed with an error that always triggers a retry: {} - {:?}",
|
||||||
|
value, err
|
||||||
|
);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
async fn retry_az_impl(mode: Mode, src: &OsStr, dst: &OsStr, args: &[&str]) -> Result<()> {
|
async fn retry_az_impl(mode: Mode, src: &OsStr, dst: &OsStr, args: &[&str]) -> Result<()> {
|
||||||
let counter = AtomicUsize::new(0);
|
let attempt_counter = AtomicUsize::new(0);
|
||||||
|
let failure_counter = AtomicUsize::new(0);
|
||||||
|
|
||||||
let operation = || async {
|
let operation = || async {
|
||||||
let attempt_count = counter.fetch_add(1, Ordering::SeqCst);
|
let attempt_count = attempt_counter.fetch_add(1, Ordering::SeqCst);
|
||||||
let result = az_impl(mode, src, dst, args)
|
let mut failure_count = failure_counter.load(Ordering::SeqCst);
|
||||||
.await
|
let result = az_impl(mode, src, dst, args).await.with_context(|| {
|
||||||
.with_context(|| format!("azcopy {} attempt {} failed", mode, attempt_count + 1));
|
format!(
|
||||||
|
"azcopy {} attempt {} failed. (failure {})",
|
||||||
|
mode,
|
||||||
|
attempt_count + 1,
|
||||||
|
failure_count + 1
|
||||||
|
)
|
||||||
|
});
|
||||||
match result {
|
match result {
|
||||||
Ok(()) => Ok(()),
|
Ok(()) => Ok(()),
|
||||||
Err(x) => {
|
Err(err) => {
|
||||||
if attempt_count >= RETRY_COUNT {
|
if !should_always_retry(&err) {
|
||||||
Err(backoff::Error::Permanent(x))
|
failure_count = failure_counter.fetch_add(1, Ordering::SeqCst);
|
||||||
|
}
|
||||||
|
if failure_count >= RETRY_COUNT {
|
||||||
|
Err(backoff::Error::Permanent(err))
|
||||||
} else {
|
} else {
|
||||||
Err(backoff::Error::Transient(x))
|
Err(backoff::Error::Transient(err))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user