Fix potential race while single stepping (#440)

When debugging multithreaded targets, there was a potential race in handling a breakpoint where thread A hits a breakpoint while thread B is single stepping to the breakpoint. The fix is, when single stepping, to resume a single thread instead of all threads that are single stepping. The step out functionality has also been removed from this layer in part to keep things simple here. Clients can implement their own step out logic if needed.
2025-06-22 22:28:50 +00:00 · 2021-02-05 14:17:50 -08:00
parent 2c3d91a3fa
commit be23e19cd6
4 changed files with 140 additions and 145 deletions
--- a/src/agent/Cargo.lock
+++ b/src/agent/Cargo.lock
@ -520,6 +520,7 @@ dependencies = [
 "iced-x86",
 "log",
 "memmap2",
 "rand",
 "serde",
 "win-util",
 "winapi 0.3.9",
--- a/src/agent/debugger/Cargo.toml
+++ b/src/agent/debugger/Cargo.toml
@ -12,6 +12,7 @@ goblin = "0.2"
 iced-x86 = "1.1"
 log = "0.4"
 memmap2 = "0.2.0"
 rand = "0.7.3"
 serde = { version = "1.0", features = ["derive"] }
 win-util = { path = "../win-util" }
--- a/src/agent/debugger/src/debugger.rs
+++ b/src/agent/debugger/src/debugger.rs
@ -50,7 +50,7 @@ pub struct BreakpointId(pub u64);
 #[derive(Copy, Clone)]
 pub(crate) enum StepState {
-    Breakpoint { pc: u64 },
+    RemoveBreakpoint { pc: u64, original_byte: u8 },
    SingleStep,
 }
@ -58,7 +58,6 @@ pub(crate) enum StepState {
 pub enum BreakpointType {
    Counter,
    OneTime,
    StepOut { rsp: u64 },
 }
 pub(crate) struct ModuleBreakpoint {
@ -371,7 +370,7 @@ impl Debugger {
        &mut self,
        callbacks: &mut impl DebugEventHandler,
        timeout_ms: DWORD,
-    ) -> Result<()> {
+    ) -> Result<bool> {
        let mut de = MaybeUninit::uninit();
        if unsafe { WaitForDebugEvent(de.as_mut_ptr(), timeout_ms) } == TRUE {
            let de = unsafe { de.assume_init() };
@ -384,6 +383,7 @@ impl Debugger {
                process_id: de.process_id(),
                thread_id: de.thread_id(),
            });
            Ok(true)
        } else {
            self.continue_args = None;
@ -393,9 +393,8 @@ impl Debugger {
            }
            trace!("timeout waiting for debug event");
            Ok(false)
        }
        Ok(())
    }
    pub fn continue_debugging(&mut self) -> Result<()> {
@ -570,14 +569,14 @@ impl Debugger {
                Ok(DBG_CONTINUE)
            }
            Some(DebuggerNotification::Clr) => Ok(DBG_CONTINUE),
-            Some(DebuggerNotification::Breakpoint(pc)) => {
+            Some(DebuggerNotification::Breakpoint { pc }) => {
                if let Some(bp_id) = self.target.handle_breakpoint(pc)? {
                    callbacks.on_breakpoint(self, bp_id);
                }
                Ok(DBG_CONTINUE)
            }
-            Some(DebuggerNotification::SingleStep(step_state)) => {
+            Some(DebuggerNotification::SingleStep { thread_id }) => {
-                self.target.handle_single_step(step_state)?;
+                self.target.complete_single_step(thread_id);
                Ok(DBG_CONTINUE)
            }
            None => {
@ -717,33 +716,14 @@ impl Debugger {
        self.continue_debugging()?;
        Ok(true)
    }
    /// Find the return address (and stack pointer to cover recursion), set a breakpoint
    /// (internal, not reported to the client) that gets cleared after stepping out.
    ///
    /// The return address and stack pointer are returned so the caller can check
    /// that the step out is complete regardless of other debug events that may happen before
    /// hitting the breakpoint at the return address.
    pub fn prepare_to_step_out(&mut self) -> Result<StackFrame> {
        let stack_frame = self.get_current_frame()?;
        let address = stack_frame.return_address();
        self.register_absolute_breakpoint(
            address,
            BreakpointType::StepOut {
                rsp: stack_frame.stack_pointer(),
            },
        )?;
        Ok(stack_frame)
    }
 }
 enum DebuggerNotification {
    Clr,
    InitialBreak,
    InitialWow64Break,
-    Breakpoint(u64),
+    Breakpoint { pc: u64 },
-    SingleStep(StepState),
+    SingleStep { thread_id: DWORD },
 }
 fn is_debugger_notification(
@ -764,7 +744,9 @@ fn is_debugger_notification(
        EXCEPTION_BREAKPOINT => {
            if target.saw_initial_bp() {
                if target.breakpoint_set_at_addr(exception_address) {
-                    Some(DebuggerNotification::Breakpoint(exception_address))
+                    Some(DebuggerNotification::Breakpoint {
                        pc: exception_address,
                    })
                } else {
                    None
                }
@ -784,8 +766,9 @@ fn is_debugger_notification(
        }
        EXCEPTION_SINGLE_STEP => {
-            if let Some(step_state) = target.single_step(target.current_thread_handle()) {
+            let thread_id = target.current_thread_id();
-                Some(DebuggerNotification::SingleStep(step_state))
+            if target.expecting_single_step(thread_id) {
                Some(DebuggerNotification::SingleStep { thread_id })
            } else {
                // Unexpected single step - could be a logic bug in the debugger or less
                // likely but possibly an intentional exception in the debug target.
--- a/src/agent/debugger/src/target.rs
+++ b/src/agent/debugger/src/target.rs
@ -11,6 +11,7 @@ use std::{
 use anyhow::Result;
 use log::{error, trace};
 use rand::{thread_rng, Rng};
 use win_util::{file, handle::Handle, last_os_error, process};
 use winapi::{
    shared::minwindef::{DWORD, LPVOID},
@ -26,6 +27,11 @@ use crate::{
    debugger::{Breakpoint, BreakpointId, BreakpointType, ModuleBreakpoint, StepState},
 };
 struct CodeByteToUpdate {
    address: u64,
    byte: u8,
 }
 struct ThreadInfo {
    id: u32,
    handle: HANDLE,
@ -168,16 +174,15 @@ pub struct Target {
    // see any the changes made in the Set call.
    current_context: Option<FrameContext>,
    // True if we need to set the thread context before resuming.
    context_is_modified: bool,
    // Key is base address (which also happens to be the HANDLE).
    modules: fnv::FnvHashMap<u64, Module>,
    breakpoints: fnv::FnvHashMap<u64, Breakpoint>,
    // Map of thread to stepping state (e.g. breakpoint address to restore breakpoints)
-    single_step: fnv::FnvHashMap<HANDLE, StepState>,
+    single_step: fnv::FnvHashMap<DWORD, StepState>,
    code_byte_to_update: Option<CodeByteToUpdate>,
 }
 impl Target {
@ -203,10 +208,10 @@ impl Target {
            exited: false,
            thread_info: thread_handles,
            current_context: None,
            context_is_modified: false,
            modules: fnv::FnvHashMap::default(),
            breakpoints: fnv::FnvHashMap::default(),
            single_step: fnv::FnvHashMap::default(),
            code_byte_to_update: None,
        }
    }
@ -287,8 +292,12 @@ impl Target {
        self.breakpoints.contains_key(&address)
    }
-    pub(crate) fn single_step(&self, thread_handle: HANDLE) -> Option<StepState> {
+    pub(crate) fn expecting_single_step(&self, thread_id: DWORD) -> bool {
-        self.single_step.get(&thread_handle).cloned()
+        self.single_step.contains_key(&thread_id)
    }
    pub(crate) fn complete_single_step(&mut self, thread_id: DWORD) {
        self.single_step.remove(&thread_id);
    }
    pub fn sym_initialize(&mut self) -> Result<()> {
@ -483,17 +492,83 @@ impl Target {
    }
    pub fn prepare_to_resume(&mut self) -> Result<()> {
-        if let Some(context) = self.current_context.take() {
+        // When resuming, we take extra care to avoid missing breakpoints. There are 2 race
-            if self.context_is_modified {
+        // conditions considered here:
-                context.set_thread_context(self.current_thread_handle)?;
+        //
        // * Threads A and B hit a breakpoint at literally the same time, so we have
        //   multiple events already queued to handle those breakpoints.
        //   Restoring the breakpoint for thread A should not interfere with restoring
        //   the original code for thread B.
        // * Thread A hit a breakpoint, thread B is 1 instruction away from hitting
        //   that breakpoint. Thread B cannot miss the breakpoint when thread A is resumed.
        //
        // To avoid these possible races, when resuming, we only let a single thread go **if**
        // we're single stepping any threads.
        //
        // First, if we last stepped because of hitting a breakpoint, we restore the breakpoint
        // so that whichever thread is resumed, it can't miss the breakpoint.
        if let Some(CodeByteToUpdate { address, byte }) = self.code_byte_to_update.take() {
            trace!("Updating breakpoint at 0x{:x}", address);
            write_instruction_byte(self.process_handle, address, byte)?;
        }
        if self.single_step.is_empty() {
            // Resume all threads if we aren't waiting for any threads to single step.
            trace!("Resuming all threads");
            for thread_info in self.thread_info.values_mut() {
                thread_info.resume_thread()?;
            }
        } else {
            // We will single step a single thread, but we must first make sure all
            // threads are suspended.
            for thread_info in self.thread_info.values_mut() {
                thread_info.suspend_thread()?;
            }
            // Now pick a random thread to resume.
            let idx = thread_rng().gen_range(0, self.single_step.len());
            let (handle, step_state) = self.single_step.iter().nth(idx).unwrap();
            let thread_info = self.thread_info.get_mut(handle).unwrap();
            thread_info.resume_thread()?;
            // We may also need to remove a breakpoint.
            if let StepState::RemoveBreakpoint { pc, original_byte } = step_state {
                trace!("Restoring original byte at 0x{:x}", *pc);
                write_instruction_byte(self.process_handle, *pc, *original_byte)?;
                // We are stepping to remove the breakpoint. After we've stepped,
                // we must restore the breakpoint (which is done on the subsequent
                // call to this function).
                self.code_byte_to_update = Some(CodeByteToUpdate {
                    address: *pc,
                    byte: 0xcc,
                });
            }
        }
-        self.context_is_modified = false;
+        // The current context will not be valid after resuming.
        self.current_context = None;
        Ok(())
    }
    pub fn prepare_to_step(&mut self) -> Result<bool> {
        // Don't change the reason we're single stepping on this thread if
        // we previously set the reason (e.g. so we would restore a breakpoint).
        self.single_step
            .entry(self.current_thread_id)
            .or_insert(StepState::SingleStep);
        let current_thread_handle = self.current_thread_handle; //borrowck
        let context = self.get_current_context_mut()?;
        context.set_single_step(true);
        context.set_thread_context(current_thread_handle)?;
        Ok(true)
    }
    fn ensure_current_context(&mut self) -> Result<()> {
        if self.current_context.is_none() {
            self.current_context = Some(dbghelp::get_thread_frame(
@ -512,10 +587,6 @@ impl Target {
    fn get_current_context_mut(&mut self) -> Result<&mut FrameContext> {
        self.ensure_current_context()?;
        // Assume the caller will modify the context. When it is modified,
        // we must set it before resuming the target.
        self.context_is_modified = true;
        Ok(self.current_context.as_mut().unwrap())
    }
@ -534,120 +605,59 @@ impl Target {
    ///
    /// Return the breakpoint id if it should be reported to the client.
    pub fn handle_breakpoint(&mut self, pc: u64) -> Result<Option<BreakpointId>> {
-        enum HandleBreakpoint {
+        let id;
-            User(BreakpointId, bool),
+        let original_byte;
-            StepOut(u64),
+        let mut restore_breakpoint = true;
-        }
+        {
        let handle_breakpoint = {
            let bp = self.breakpoints.get_mut(&pc).unwrap();
            id = bp.id();
            bp.increment_hit_count();
-            write_instruction_byte(self.process_handle, bp.ip(), bp.original_byte().unwrap())?;
+            original_byte = bp.original_byte().unwrap();
-            match bp.kind() {
+            if let BreakpointType::OneTime = bp.kind() {
-                BreakpointType::OneTime => {
+                // This assertion simplifies `prepare_to_resume` slightly.
-                    bp.set_enabled(false);
+                // If it fires, we could restore the original code byte here, otherwise we
-                    bp.set_original_byte(None);
+                // would need an additional Option to test in `prepare_to_resume`.
                assert!(self.code_byte_to_update.is_none());
-                    // We are clearing the breakpoint after hitting it, so we do not need
+                self.code_byte_to_update = Some(CodeByteToUpdate {
-                    // to single step.
+                    address: bp.ip(),
-                    HandleBreakpoint::User(bp.id(), false)
+                    byte: original_byte,
-                }
+                });
-                BreakpointType::Counter => {
+                bp.set_enabled(false);
-                    // Single step so we can restore the breakpoint after stepping.
+                bp.set_original_byte(None);
                    HandleBreakpoint::User(bp.id(), true)
                }
-                BreakpointType::StepOut { rsp } => HandleBreakpoint::StepOut(rsp),
+                restore_breakpoint = false;
            }
-        };
+        }
        let single_step = {
            let context = self.get_current_context_mut()?;
            context.set_program_counter(pc);
            // We need to single step if we need to restore the breakpoint.
            let single_step = match handle_breakpoint {
                HandleBreakpoint::User(_, single_step) => single_step,
                // Single step only when in a recursive call, which is inferred when the current
                // stack pointer (from context) is less then at the target to step out from (rsp).
                // Note this only works if the stack grows down.
                HandleBreakpoint::StepOut(rsp) => rsp > context.stack_pointer(),
            };
            if single_step {
                context.set_single_step(true);
                self.single_step
                    .insert(self.current_thread_handle, StepState::Breakpoint { pc });
            }
            single_step
        };
        let current_thread_handle = self.current_thread_handle;
        // We need to execute the instruction we overwrote with our breakpoint, so move the
        // instruction pointer back, and if we are going to restore the breakpoint, set
        // single stepping.
        let context = self.get_current_context_mut()?;
        context.set_program_counter(pc);
        if restore_breakpoint {
            context.set_single_step(true);
        }
        context.set_thread_context(current_thread_handle)?;
        self.context_is_modified = false;
-        if single_step {
+        if restore_breakpoint {
-            for thread_info in self.thread_info.values_mut() {
+            // Remember that on the current thread, we need to restore the original byte.
-                // Don't suspend any thread that we are single stepping. Typically
+            // When resuming, if we pick the current thread, we'll remove the breakpoint.
-                // this is only the current thread/breakpoint, but debug events can be hit
+            self.single_step.insert(
-                // on multiple threads at roughly the same time, so e.g. we may see a second
+                self.current_thread_id,
-                // breakpoint from the OS before our scheduled single step exception.
+                StepState::RemoveBreakpoint { pc, original_byte },
-                //
+            );
                // We must resume at least 1 thread when we are single stepping, but we can
                // safely resume all threads that are single stepping because we won't miss
                // any breakpoints as they are executing a single instruction.
                if self.single_step.contains_key(&thread_info.handle) {
                    thread_info.resume_thread()?;
                } else {
                    thread_info.suspend_thread()?;
                };
            }
        }
-        Ok(match handle_breakpoint {
+        Ok(Some(id))
            HandleBreakpoint::User(id, _) => Some(id),
            HandleBreakpoint::StepOut(_) => None,
        })
    }
    pub(crate) fn handle_single_step(&mut self, step_state: StepState) -> Result<()> {
        self.single_step.remove(&self.current_thread_handle);
        match step_state {
            StepState::Breakpoint { pc } => {
                write_instruction_byte(self.process_handle, pc, 0xcc)?;
                // Resume all threads if we aren't waiting for any threads to single step.
                if self.single_step.is_empty() {
                    for thread_info in self.thread_info.values_mut() {
                        thread_info.resume_thread()?;
                    }
                }
            }
            _ => {}
        }
        Ok(())
    }
    pub fn prepare_to_step(&mut self) -> Result<bool> {
        // Don't change the reason we're single stepping on this thread if
        // we previously set the reason (e.g. so we would restore a breakpoint).
        self.single_step
            .entry(self.current_thread_handle)
            .or_insert(StepState::SingleStep);
        let context = self.get_current_context_mut()?;
        context.set_single_step(true);
        Ok(true)
    }
    pub fn set_exited(&mut self) -> Result<()> {