combine execs/sec and iteration counts from multiple workers (#786)

2025-06-17 20:38:06 +00:00 · 2021-04-09 17:21:08 -04:00
parent 030ab4b87f
commit ba009cd30a
1 changed files with 108 additions and 23 deletions
--- a/src/agent/onefuzz-agent/src/tasks/fuzz/libfuzzer_fuzz.rs
+++ b/src/agent/onefuzz-agent/src/tasks/fuzz/libfuzzer_fuzz.rs
@ -23,7 +23,7 @@ use tokio::{
    io::{AsyncBufReadExt, BufReader},
    sync::mpsc,
    task,
-    time::{self, Duration},
+    time::{delay_for, Duration},
 };
 use uuid::Uuid;

@ -96,7 +96,7 @@ impl LibFuzzerFuzzTask {
        let new_crashes = self.config.crashes.monitor_results(new_result, true);

        let (stats_sender, stats_receiver) = mpsc::unbounded_channel();
-        let report_stats = report_runtime_stats(self.workers(), stats_receiver, hb_client);
+        let report_stats = report_runtime_stats(stats_receiver, hb_client);
        let fuzzers = self.run_fuzzers(Some(&stats_sender));
        futures::try_join!(resync, new_inputs, new_crashes, fuzzers, report_stats)?;

@ -311,7 +311,7 @@ fn try_report_iter_update(

 async fn report_fuzzer_sys_info(worker_id: usize, run_id: Uuid, fuzzer_pid: u32) -> Result<()> {
    // Allow for sampling CPU usage.
-    time::delay_for(PROC_INFO_COLLECTION_DELAY).await;
+    delay_for(PROC_INFO_COLLECTION_DELAY).await;

    loop {
        // process doesn't exist
@ -335,13 +335,13 @@ async fn report_fuzzer_sys_info(worker_id: usize, run_id: Uuid, fuzzer_pid: u32)
            break;
        }

-        time::delay_for(PROC_INFO_PERIOD).await;
+        delay_for(PROC_INFO_PERIOD).await;
    }

    Ok(())
 }

-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Debug)]
 pub struct RuntimeStats {
    worker_id: usize,
    run_id: Uuid,
@ -349,12 +349,34 @@ pub struct RuntimeStats {
    execs_sec: f64,
 }

-impl RuntimeStats {
-    pub fn report(&self) {
+#[derive(Debug, Default)]
+pub struct TotalStats {
+    worker_stats: HashMap<usize, RuntimeStats>,
+    count: u64,
+    execs_sec: f64,
+}
+
+impl TotalStats {
+    fn update(&mut self, worker_data: RuntimeStats) {
+        if let Some(current) = self.worker_stats.get(&worker_data.worker_id) {
+            // if it's the same run, only add the differences
+            if current.run_id == worker_data.run_id {
+                self.count += worker_data.count.saturating_sub(current.count);
+            } else {
+                self.count += worker_data.count;
+            }
+        } else {
+            self.count += worker_data.count;
+        }
+
+        self.worker_stats.insert(worker_data.worker_id, worker_data);
+
+        self.execs_sec = self.worker_stats.values().map(|x| x.execs_sec).sum();
+    }
+
+    fn report(&self) {
        event!(
            runtime_stats;
-            EventData::WorkerId = self.worker_id,
-            EventData::RunId = self.run_id,
            EventData::Count = self.count,
            EventData::ExecsSecond = self.execs_sec
        );
@ -374,7 +396,7 @@ impl Timer {
    }

    async fn wait(&self) {
-        time::delay_for(self.interval).await;
+        delay_for(self.interval).await;
    }
 }

@ -386,7 +408,6 @@ impl Timer {
 // running worker runs, this can result in misleading gaps and binning artifacts. In effect, we
 // are approximating nearest-neighbor interpolation on the runtime stats time series.
 async fn report_runtime_stats(
-    workers: usize,
    mut stats_channel: mpsc::UnboundedReceiver<RuntimeStats>,
    heartbeat_client: impl HeartbeatSender,
 ) -> Result<()> {
@ -394,8 +415,10 @@ async fn report_runtime_stats(
    //
    // When logging stats, the most recently reported runtime stats will be used for any
    // missing data. For time-triggered logging, it will be used for all workers.
-    let mut last_reported: Vec<Option<RuntimeStats>> =
-        std::iter::repeat(None).take(workers).collect();
+    let mut total = TotalStats::default();
+
+    // report all zeros to start
+    total.report();

    let timer = Timer::new(RUNTIME_STATS_PERIOD);

@ -403,18 +426,80 @@ async fn report_runtime_stats(
        tokio::select! {
            Some(stats) = stats_channel.next() => {
                heartbeat_client.alive();
-                stats.report();
-
-                let idx = stats.worker_id as usize;
-                last_reported[idx] = Some(stats);
+                total.update(stats);
+                total.report()
            }
            _ = timer.wait() => {
-                for stats in &last_reported {
-                    if let Some(stats) = stats {
-                        stats.report();
-                    }
-                }
+                total.report()
            }
-        };
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{RuntimeStats, TotalStats};
+    use anyhow::Result;
+    use uuid::Uuid;
+
+    #[test]
+    fn test_total_stats() -> Result<()> {
+        let mut total = TotalStats::default();
+        let mut a = RuntimeStats {
+            worker_id: 0,
+            run_id: Uuid::new_v4(),
+            count: 0,
+            execs_sec: 0.0,
+        };
+
+        total.update(a.clone());
+        assert!(total.execs_sec == 0.0);
+        assert!(total.count == 0);
+
+        // same run of existing worker, but counts & execs_sec increased.
+        a.count += 10;
+        a.execs_sec = 1.0;
+        total.update(a.clone());
+        assert!(total.execs_sec == 1.0);
+        assert!(total.count == 10);
+
+        // same run of existing worker.  counts and execs should stay the same.
+        total.update(a.clone());
+        assert!(total.count == 10);
+        assert!(total.execs_sec == 1.0);
+
+        // new run of existing worker. counts should go up, execs_sec should
+        // equal new worker value.
+        a.run_id = Uuid::new_v4();
+        a.execs_sec = 2.0;
+        total.update(a.clone());
+        assert!(total.count == 20);
+        assert!(total.execs_sec == 2.0);
+
+        // existing worker, now new data.  totals should stay the same.
+        total.update(a.clone());
+        assert!(total.count == 20);
+        assert!(total.execs_sec == 2.0);
+
+        // new worker, counts & execs_sec should go up by data from worker.
+        let mut b = RuntimeStats {
+            worker_id: 1,
+            run_id: Uuid::new_v4(),
+            count: 10,
+            execs_sec: 2.0,
+        };
+        total.update(b.clone());
+        assert!(total.count == 30);
+        assert!(total.execs_sec == 4.0);
+
+        // existing run for existing worker.
+        // count should go up by 1, execs_sec down by 1.
+        b.count += 1;
+        b.execs_sec -= 1.0;
+        total.update(b.clone());
+        assert!(total.count == 31);
+        assert!(total.execs_sec == 3.0);
+
+        Ok(())
    }
 }