diff --git a/repos/base-hw/include/spec/x86_64/cpu/vm_state_virtualization.h b/repos/base-hw/include/spec/x86_64/cpu/vm_state_virtualization.h
index 1a059ee1ac..1b72ff5833 100644
--- a/repos/base-hw/include/spec/x86_64/cpu/vm_state_virtualization.h
+++ b/repos/base-hw/include/spec/x86_64/cpu/vm_state_virtualization.h
@@ -15,18 +15,23 @@
 #define _INCLUDE__SPEC__PC__VM_STATE_H_
 
 /* x86 CPU state */
-#include
+#include
+#include
 
 namespace Genode {
 
	/**
	 * CPU context of a virtual machine
	 */
-	struct Vm_state;
-	using Vm_data = Vm_state;
+	struct Vm_data;
 }
 
-struct Genode::Vm_state : Genode::Vcpu_state
-{};
+
+struct Genode::Vm_data
+{
+	Board::Vmcb       vmcb;
+	Genode::addr_t    vmcb_phys_addr;
+	Genode::Vm_state *vm_state;
+};
 
 #endif /* _INCLUDE__SPEC__PC__VM_STATE_H_ */
diff --git a/repos/base-hw/include/spec/x86_64/virtualization/extended_vcpu_state.h b/repos/base-hw/include/spec/x86_64/virtualization/extended_vcpu_state.h
new file mode 100644
index 0000000000..db4acf13e5
--- /dev/null
+++ b/repos/base-hw/include/spec/x86_64/virtualization/extended_vcpu_state.h
@@ -0,0 +1,139 @@
+/*
+ * \brief  Extended vCPU state
+ * \author Benjamin Lamowski
+ * \date   2023-05-25
+ */
+
+/*
+ * Copyright (C) 2023 Genode Labs GmbH
+ *
+ * This file is part of the Genode OS framework, which is distributed
+ * under the terms of the GNU Affero General Public License version 3.
+ */
+
+#ifndef _INCLUDE__SPEC__X86_64_VIRTUALIZATION__EXTENDED_VCPU_STATE_H
+#define _INCLUDE__SPEC__X86_64_VIRTUALIZATION__EXTENDED_VCPU_STATE_H
+
+/* x86 CPU state */
+#include
+#include
+
+namespace Genode {
+	struct Vm_state;
+	struct Vcpu_run_state;
+}
+
+/*
+ * Run state of the vCPU, synchronized between the VMM library and the
+ * kernel.
+ */
+class Genode::Vcpu_run_state : Noncopyable
+{
+	public:
+
+		enum Value : int {
+			/*
+			 * vCPU isn't initialized yet. Needed for initialization in
+			 * Vm::exception() and to block premature pause requests.
+			 */
+			STARTUP = 0,
+
+			/*
+			 * The vCPU is runnable but not yet running. Used in pause() to
+			 * make the vCPU run once (RUN_ONCE).
+			 */
+			RUNNABLE = 1,
+
+			/*
+			 * The vCPU hasn't run yet, but a pause has been requested.
+			 * Run the vCPU once, dispatch the result and then issue a pause
+			 * request.
+			 */
+			RUN_ONCE = 2,
+
+			/*
+			 * The vCPU is running. Used in pause() to force an exit only
+			 * when the vCPU is actually running.
+			 */
+			RUNNING = 3,
+
+			/*
+			 * vCPU has exited because of an external interrupt and could run
+			 * without state syncing. Needed to skip state syncing in
+			 * Vm::proceed and to request updating the state from the vCPU in
+			 * case of a Vcpu::pause() (SYNC_FROM_VCPU).
+			 */
+			INTERRUPTIBLE = 4,
+
+			/*
+			 * vCPU is running and is being forced out by a thread on a
+			 * remote core by signaling the vCPU's handler. Causes a state
+			 * writeback and Vm::pause() after an external interrupt VM exit.
+			 */
+			EXITING = 5,
+
+			/*
+			 * A Vcpu::pause() request was issued while the vCPU was
+			 * INTERRUPTIBLE. Skips the next run in Vm::proceed() and causes
+			 * a full pause exit in the subsequent Vm::exception().
+			 */
+			SYNC_FROM_VCPU = 6,
+
+			/*
+			 * The vCPU is dispatching a signal to the handler in the VMM.
+			 * Needed to distinguish between a dispatch from the vCPU and a
+			 * dispatch from an asynchronous pause request.
+			 */
+			DISPATCHING = 7,
+
+			/*
+			 * The vCPU needs to first dispatch an exit in the VMM, and a
+			 * pause request needs to be injected right after.
+			 */
+			DISPATCHING_PAUSED = 8,
+
+			/*
+			 * An exit has been dispatched to the VMM. Needed to let an
+			 * asynchronous pause request dispatch a new signal.
+			 */
+			DISPATCHED = 9,
+
+			/*
+			 * The vCPU was RUNNABLE or DISPATCHED, but a pause has been
+			 * requested. Used to create a pause exit in the wrapper.
+			 */
+			PAUSING = 10,
+
+			/*
+			 * The vCPU handler in the VMM is dispatching and a pause
+			 * signal has been issued. Needed to skip more pause requests.
+			 * FIXME
+			 */
+			PAUSED = 11
+		};
+
+	private:
+
+		int _value { };   /* base type of Vcpu_run_state */
+
+	public:
+
+		Value value() const { return Value(_value); }
+
+		void set(Value const &value)
+		{
+			_value = value;
+		}
+
+		bool cas(Value cmp_val, Value new_val)
+		{
+			return cmpxchg(&_value, cmp_val, new_val);
+		}
+};
+
+struct Genode::Vm_state : Genode::Vcpu_state
+{
+	Vcpu_run_state run_state;
+};
+
+#endif /* _INCLUDE__SPEC__X86_64_VIRTUALIZATION__EXTENDED_VCPU_STATE_H */
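The Vcpu_run_state cell above is the heart of the synchronization between the VMM library and the kernel: both sides share it through the vCPU dataspace and only ever move it forward via set() and cas(). A minimal, portable sketch of the same contract, using std::atomic in place of Genode's cmpxchg (illustrative names, not the framework API):

    #include <atomic>

    /*
     * Sketch of what Vcpu_run_state provides, assuming cmpxchg has plain
     * compare-and-swap semantics. Every transition must be one atomic step,
     * because kernel and VMM race on the same cell.
     */
    class Run_state
    {
    	std::atomic<int> _value { 0 /* STARTUP */ };

    	public:

    		int  value() const { return _value.load(); }
    		void set(int v)    { _value.store(v); }

    		/* true if the transition cmp_val -> new_val was applied */
    		bool cas(int cmp_val, int new_val)
    		{
    			return _value.compare_exchange_strong(cmp_val, new_val);
    		}
    };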
diff --git a/repos/base-hw/lib/mk/spec/x86_64/base-hw.mk b/repos/base-hw/lib/mk/spec/x86_64/base-hw.mk
index 38ea05026d..5773071917 100644
--- a/repos/base-hw/lib/mk/spec/x86_64/base-hw.mk
+++ b/repos/base-hw/lib/mk/spec/x86_64/base-hw.mk
@@ -1 +1,3 @@
+vpath vm.cc $(REP_DIR)/src/lib/base/x86_64
+
 include $(REP_DIR)/lib/mk/base-hw.inc
diff --git a/repos/base-hw/src/core/spec/x86_64/exception_vector.s b/repos/base-hw/src/core/spec/x86_64/exception_vector.s
index e5e9deed8d..34102496d5 100644
--- a/repos/base-hw/src/core/spec/x86_64/exception_vector.s
+++ b/repos/base-hw/src/core/spec/x86_64/exception_vector.s
@@ -93,6 +93,7 @@
 	.set vec, vec + 1
 	.endr
 
+	.global _kernel_entry
 _kernel_entry:
 
 	pushq %rbp
diff --git a/repos/base-hw/src/core/spec/x86_64/virtualization/board.h b/repos/base-hw/src/core/spec/x86_64/virtualization/board.h
index 119636cb5f..8e0a47418c 100644
--- a/repos/base-hw/src/core/spec/x86_64/virtualization/board.h
+++ b/repos/base-hw/src/core/spec/x86_64/virtualization/board.h
@@ -31,11 +31,23 @@ namespace Board {
 
 	struct Vcpu_context;
 
-	using Vm_state = Genode::Vm_state;
 	using Vm_data = Genode::Vm_data;
+	using Vm_state = Genode::Vm_state;
 
 	enum {
-		VCPU_MAX = 16
+		VCPU_MAX = 16
+	};
+
+	/* FIXME move into Vcpu_context as 'enum class' when we have C++20 */
+	enum Platform_exitcodes : Genode::uint64_t {
+		EXIT_NPF     = 0xfc,
+		EXIT_STARTUP = 0xfe,
+		EXIT_PAUSED  = 0xff,
+	};
+
+	enum Custom_trapnos {
+		TRAP_VMEXIT = 256,
+		TRAP_VMSKIP = 257,
 	};
 };
 
@@ -48,11 +60,16 @@ namespace Kernel {
 
 struct Board::Vcpu_context
 {
-	Vcpu_context(Kernel::Cpu & cpu);
+	Vcpu_context(unsigned id, void *vcpu_data_ptr,
+	             Genode::addr_t context_phys_addr);
 	void initialize_svm(Kernel::Cpu &cpu, void *table);
+	void read_vcpu_state(Genode::Vcpu_state &state);
+	void write_vcpu_state(Genode::Vcpu_state &state, unsigned exit_reason);
 
-	Vmcb vmcb;
+	Vmcb &vmcb;
 	Genode::Align_at regs;
+	Genode::uint64_t tsc_aux_host = 0U;
+	Genode::uint64_t tsc_aux_guest = 0U;
 };
 
 #endif /* _CORE__SPEC__PC__VIRTUALIZATION__BOARD_H_ */
diff --git a/repos/base-hw/src/core/spec/x86_64/virtualization/hypervisor.h b/repos/base-hw/src/core/spec/x86_64/virtualization/hypervisor.h
index 4b378fbe62..d57ca8b17f 100644
--- a/repos/base-hw/src/core/spec/x86_64/virtualization/hypervisor.h
+++ b/repos/base-hw/src/core/spec/x86_64/virtualization/hypervisor.h
@@ -22,11 +22,83 @@ namespace Hypervisor {
 
 	using Call_arg = Genode::umword_t;
 	using Call_ret = Genode::umword_t;
 
-	inline void switch_world(Call_arg guest_state [[maybe_unused]],
-	                         Call_arg host_state [[maybe_unused]],
-	                         Call_arg pic_state [[maybe_unused]],
-	                         Call_arg ttbr [[maybe_unused]])
+
+	inline void restore_state_for_entry(Call_arg regs, Call_arg fpu_context)
 	{
+		asm volatile(
+		      "fxrstor (%[fpu_context]);"
+		      "mov %[regs], %%rsp;"
+		      "popq %%r8;"
+		      "popq %%r9;"
+		      "popq %%r10;"
+		      "popq %%r11;"
+		      "popq %%r12;"
+		      "popq %%r13;"
+		      "popq %%r14;"
+		      "popq %%r15;"
+		      "popq %%rax;"
+		      "popq %%rbx;"
+		      "popq %%rcx;"
+		      "popq %%rdx;"
+		      "popq %%rdi;"
+		      "popq %%rsi;"
+		      "popq %%rbp;"
+		      "sti;" /* maybe enter the kernel to handle an external
+		                interrupt that occurred ... */
+		      "nop;"
+		      "cli;" /* ... otherwise, just disable interrupts again */
+		      "jmp _kernel_entry;"
+		      :
+		      : [regs] "r"(regs), [fpu_context] "r"(fpu_context)
+		      : "memory");
+	};
+
+
+	inline void switch_world(Call_arg guest_state, Call_arg regs,
+	                         Call_arg fpu_context)
+	{
+		asm volatile(
+		      "fxrstor (%[fpu_context]);"
+		      "mov %[guest_state], %%rax;"
+		      "mov %[regs], %%rsp;"
+		      "popq %%r8;"
+		      "popq %%r9;"
+		      "popq %%r10;"
+		      "popq %%r11;"
+		      "popq %%r12;"
+		      "popq %%r13;"
+		      "popq %%r14;"
+		      "popq %%r15;"
+		      "add $8, %%rsp;" /* don't pop rax */
+		      "popq %%rbx;"
+		      "popq %%rcx;"
+		      "popq %%rdx;"
+		      "popq %%rdi;"
+		      "popq %%rsi;"
+		      "popq %%rbp;"
+		      "clgi;"
+		      "sti;"
+		      "vmload;"
+		      "vmrun;"
+		      "vmsave;"
+		      "popq %%rax;" /* get the physical address of the host VMCB
+		                       from the stack */
+		      "vmload;"
+		      "stgi;" /* maybe enter the kernel to handle an external
+		                 interrupt that occurred ... */
+		      "nop;"
+		      "cli;" /* ... otherwise, just disable interrupts again */
+		      "pushq $256;" /* make the stack point to trapno, the right
+		                       place to jump to _kernel_entry. We push 256
+		                       because this is outside of the valid range
+		                       for interrupts */
+		      "jmp _kernel_entry;" /* jump to _kernel_entry to save the
+		                              GPRs without clobbering any */
+		      :
+		      : [regs] "r"(regs), [fpu_context] "r"(fpu_context),
+		        [guest_state] "r"(guest_state)
+		      : "rax", "memory");
 	}
 }
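Both asm routines above assume a fixed layout of the register-save area: `regs` points at the saved r8, the popq sequence consumes r8..r15 followed by rax..rbp (with rax skipped in the guest path), and the stack pointer ends up at trapno, which is exactly where _kernel_entry expects it. A hedged sketch of that implied layout; the names are assumptions, the authoritative declaration is base-hw's CPU context:

    #include <cstdint>

    /* Illustrative mirror of the save area the pop sequences walk through. */
    struct Gpr_frame
    {
    	std::uint64_t r8, r9, r10, r11, r12, r13, r14, r15;
    	std::uint64_t rax, rbx, rcx, rdx, rdi, rsi, rbp;
    	std::uint64_t trapno;   /* TRAP_VMEXIT, TRAP_VMSKIP or host-VMCB address */
    };

    static_assert(sizeof(Gpr_frame) == 16 * 8,
                  "pop sequence consumes 15 GPRs plus trapno");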
diff --git a/repos/base-hw/src/core/spec/x86_64/virtualization/kernel/svm.cc b/repos/base-hw/src/core/spec/x86_64/virtualization/kernel/svm.cc
index c021de210e..38ca81fb76 100644
--- a/repos/base-hw/src/core/spec/x86_64/virtualization/kernel/svm.cc
+++ b/repos/base-hw/src/core/spec/x86_64/virtualization/kernel/svm.cc
@@ -154,3 +154,275 @@ void Board::Vcpu_context::initialize_svm(Kernel::Cpu & cpu, void * table)
 
 	vmcb.init(cpu.id(), table);
 }
+
+
+void Board::Vcpu_context::write_vcpu_state(Genode::Vcpu_state &state, unsigned exit_reason)
+{
+	typedef Genode::Vcpu_state::Range Range;
+
+	state.discharge();
+	state.exit_reason = exit_reason;
+
+	state.fpu.charge([&] (Genode::Vcpu_state::Fpu::State &fpu) {
+		memcpy(&fpu, (void *) regs->fpu_context(), sizeof(fpu));
+	});
+
+	state.ax.charge(vmcb.rax);
+	state.cx.charge(regs->rcx);
+	state.dx.charge(regs->rdx);
+	state.bx.charge(regs->rbx);
+
+	state.di.charge(regs->rdi);
+	state.si.charge(regs->rsi);
+	state.bp.charge(regs->rbp);
+
+	state.ip.charge(vmcb.rip);
+	/*
+	 * SVM doesn't use ip_len, so just leave the old value.
+	 * We still have to charge it when charging ip.
+	 */
+	state.ip_len.set_charged();
+
+	state.flags.charge(vmcb.rflags);
+	state.sp.charge(vmcb.rsp);
+
+	state.dr7.charge(vmcb.dr7);
+
+	state. r8.charge(regs->r8);
+	state. r9.charge(regs->r9);
+	state.r10.charge(regs->r10);
+	state.r11.charge(regs->r11);
+	state.r12.charge(regs->r12);
+	state.r13.charge(regs->r13);
+	state.r14.charge(regs->r14);
+	state.r15.charge(regs->r15);
+
+	state.cr0.charge(vmcb.cr0);
+	state.cr2.charge(vmcb.cr2);
+	state.cr3.charge(vmcb.cr3);
+	state.cr4.charge(vmcb.cr4);
+
+	state.cs.charge(vmcb.cs);
+	state.ss.charge(vmcb.ss);
+	state.es.charge(vmcb.es);
+	state.ds.charge(vmcb.ds);
+	state.fs.charge(vmcb.fs);
+	state.gs.charge(vmcb.gs);
+	state.tr.charge(vmcb.tr);
+	state.ldtr.charge(vmcb.ldtr);
+	state.gdtr.charge(Range { .limit = vmcb.gdtr.limit,
+	                          .base  = vmcb.gdtr.base });
+
+	state.idtr.charge(Range { .limit = vmcb.idtr.limit,
+	                          .base  = vmcb.idtr.base });
+
+	state.sysenter_cs.charge(vmcb.sysenter_cs);
+	state.sysenter_sp.charge(vmcb.sysenter_esp);
+	state.sysenter_ip.charge(vmcb.sysenter_eip);
+
+	state.qual_primary.charge(vmcb.read());
+	state.qual_secondary.charge(vmcb.read());
+
+	state.ctrl_primary.charge(vmcb.read());
+	state.ctrl_secondary.charge(vmcb.read());
+
+	state.inj_info.charge(vmcb.read() & 0xFFFFFFFF);
+	state.inj_error.charge((Genode::uint32_t)
+	                       (vmcb.read() >> 32));
+
+	/* Guest is in an interrupt shadow, see 15.21.5 */
+	state.intr_state.charge((unsigned)
+	                        vmcb.read());
+	/* Guest activity state (actv) not used by SVM */
+	state.actv_state.set_charged();
+
+	state.tsc.charge(Hw::Lapic::rdtsc());
+	state.tsc_offset.charge(vmcb.read());
+
+	tsc_aux_guest = Cpu::Ia32_tsc_aux::read();
+	state.tsc_aux.charge(tsc_aux_guest);
+	Cpu::Ia32_tsc_aux::write((Cpu::Ia32_tsc_aux::access_t) tsc_aux_host);
+
+	state.efer.charge(vmcb.efer);
+
+	/* pdpte not used by SVM */
+
+	state.star.charge(vmcb.star);
+	state.lstar.charge(vmcb.lstar);
+	state.cstar.charge(vmcb.cstar);
+	state.fmask.charge(vmcb.sfmask);
+	state.kernel_gs_base.charge(vmcb.kernel_gs_base);
+
+	/* Task Priority Register, see 15.24 */
+	state.tpr.charge((unsigned) vmcb.read());
+	/* TPR threshold not used by SVM */
+}
+
+
+void Board::Vcpu_context::read_vcpu_state(Genode::Vcpu_state &state)
+{
+	if (state.ax.charged() || state.cx.charged() ||
+	    state.dx.charged() || state.bx.charged()) {
+		vmcb.rax  = state.ax.value();
+		regs->rcx = state.cx.value();
+		regs->rdx = state.dx.value();
+		regs->rbx = state.bx.value();
+	}
+
+	if (state.bp.charged() || state.di.charged() || state.si.charged()) {
+		regs->rdi = state.di.value();
+		regs->rsi = state.si.value();
+		regs->rbp = state.bp.value();
+	}
+
+	if (state.flags.charged()) {
+		vmcb.rflags = state.flags.value();
+	}
+
+	if (state.sp.charged()) {
+		vmcb.rsp = state.sp.value();
+	}
+
+	if (state.ip.charged()) {
+		vmcb.rip = state.ip.value();
+		/* ip_len not used by SVM */
+	}
+
+	if (state.dr7.charged()) {
+		vmcb.dr7 = state.dr7.value();
+	}
+
+	if (state.r8 .charged() || state.r9 .charged() ||
+	    state.r10.charged() || state.r11.charged() ||
+	    state.r12.charged() || state.r13.charged() ||
+	    state.r14.charged() || state.r15.charged()) {
+
+		regs->r8  = state.r8.value();
+		regs->r9  = state.r9.value();
+		regs->r10 = state.r10.value();
+		regs->r11 = state.r11.value();
+		regs->r12 = state.r12.value();
+		regs->r13 = state.r13.value();
+		regs->r14 = state.r14.value();
+		regs->r15 = state.r15.value();
+	}
+
+	if (state.cr0.charged() || state.cr2.charged() ||
+	    state.cr3.charged() || state.cr4.charged()) {
+		vmcb.cr0 = state.cr0.value();
+		vmcb.cr2 = state.cr2.value();
+		vmcb.cr3 = state.cr3.value();
+		vmcb.cr4 = state.cr4.value();
+	}
+
+	if (state.cs.charged() || state.ss.charged()) {
+		vmcb.cs = state.cs.value();
+		vmcb.ss = state.ss.value();
+	}
+
+	if (state.es.charged() || state.ds.charged()) {
+		vmcb.es = state.es.value();
+		vmcb.ds = state.ds.value();
+	}
+
+	if (state.fs.charged() || state.gs.charged()) {
+		vmcb.fs = state.fs.value();
+		vmcb.gs = state.gs.value();
+	}
+
+	if (state.tr.charged()) {
+		vmcb.tr = state.tr.value();
+	}
+
+	if (state.ldtr.charged()) {
+		vmcb.ldtr = state.ldtr.value();
+	}
+
+	if (state.gdtr.charged()) {
+		vmcb.gdtr.limit = state.gdtr.value().limit;
+		vmcb.gdtr.base  = state.gdtr.value().base;
+	}
+
+	if (state.idtr.charged()) {
+		vmcb.idtr.limit = state.idtr.value().limit;
+		vmcb.idtr.base  = state.idtr.value().base;
+	}
+
+	if (state.sysenter_cs.charged() || state.sysenter_sp.charged() ||
+	    state.sysenter_ip.charged()) {
+		vmcb.sysenter_cs  = state.sysenter_cs.value();
+		vmcb.sysenter_esp = state.sysenter_sp.value();
+		vmcb.sysenter_eip = state.sysenter_ip.value();
+	}
+
+	if (state.ctrl_primary.charged() || state.ctrl_secondary.charged()) {
+		vmcb.enforce_intercepts(state.ctrl_primary.value(),
+		                        state.ctrl_secondary.value());
+	}
+
+	if (state.inj_info.charged() || state.inj_error.charged()) {
+		/* Honor special signaling bit */
+		if (state.inj_info.value() & 0x1000) {
+			vmcb.write(1);
+			vmcb.write(1);
+			vmcb.write(1);
+		} else {
+			vmcb.write(0);
+			vmcb.write(0);
+			vmcb.write(0);
+		}
+
+		vmcb.write(
+			/* Filter out special signaling bits */
+			(state.inj_info.value() &
+			 (Genode::uint32_t) ~0x3000) |
+			(((Genode::uint64_t) state.inj_error.value()) << 32)
+		);
+	}
+
+	if (state.intr_state.charged()) {
+		vmcb.write(state.intr_state.value());
+	}
+	/* Guest activity state (actv) not used by SVM */
+
+	if (state.tsc_offset.charged()) {
+		/* state.tsc not used by SVM */
+		vmcb.write(vmcb.read() +
+		           state.tsc_offset.value());
+	}
+
+	tsc_aux_host = Cpu::Ia32_tsc_aux::read();
+	if (state.tsc_aux.charged()) {
+		tsc_aux_guest = state.tsc_aux.value();
+	}
+	Cpu::Ia32_tsc_aux::write((Cpu::Ia32_tsc_aux::access_t) tsc_aux_guest);
+
+	if (state.efer.charged()) {
+		vmcb.efer = state.efer.value();
+	}
+
+	/* pdpte not used by SVM */
+
+	if (state.star.charged() || state.lstar.charged() ||
+	    state.cstar.charged() || state.fmask.charged() ||
+	    state.kernel_gs_base.charged()) {
+		vmcb.star   = state.star.value();
+		vmcb.cstar  = state.cstar.value();
+		vmcb.lstar  = state.lstar.value();
+		vmcb.sfmask = state.fmask.value();
+		vmcb.kernel_gs_base = state.kernel_gs_base.value();
+	}
+
+	if (state.tpr.charged()) {
+		vmcb.write(state.tpr.value());
+		/* TPR threshold not used on AMD */
+	}
+
+	if (state.fpu.charged()) {
+		state.fpu.with_state([&] (Genode::Vcpu_state::Fpu::State const &fpu) {
+			memcpy((void *) regs->fpu_context(), &fpu, sizeof(fpu));
+		});
+	}
+}
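write_vcpu_state() and read_vcpu_state() follow Vcpu_state's charge/discharge protocol: the kernel only publishes fields it charges, and only consumes fields the VMM charged. A self-contained miniature of that bookkeeping, with semantics simplified from what the code above uses (the real Genode::Vcpu_state tracks dozens of typed items):

    #include <cassert>
    #include <cstdint>

    /* one tracked field of the vCPU state */
    template <typename T>
    struct Item
    {
    	T    _value   { };
    	bool _charged { false };

    	void charge(T const &v) { _value = v; _charged = true; }  /* publish */
    	void set_charged()      { _charged = true; }  /* keep the old value */
    	bool charged() const    { return _charged; }
    	T    value() const      { return _value; }
    	void discharge()        { _charged = false; }
    };

    int main()
    {
    	Item<std::uint64_t> ip;

    	ip.charge(0xfff0);      /* kernel publishes guest RIP on an exit */
    	assert(ip.charged());

    	if (ip.charged()) { /* ... VMM consumes the value ... */ }

    	ip.discharge();         /* the next exit starts from a clean slate */
    	assert(!ip.charged());
    }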
diff --git a/repos/base-hw/src/core/spec/x86_64/virtualization/kernel/vm.cc b/repos/base-hw/src/core/spec/x86_64/virtualization/kernel/vm.cc
index 636aea1539..4969c30eb7 100644
--- a/repos/base-hw/src/core/spec/x86_64/virtualization/kernel/vm.cc
+++ b/repos/base-hw/src/core/spec/x86_64/virtualization/kernel/vm.cc
@@ -13,6 +13,7 @@
 
 #include
 #include
+#include
 #include
 #include
 
@@ -24,14 +25,15 @@
 #include
 #include
 
-#include
-#include
+#include
+#include
 
 #include
 
 using Genode::addr_t;
 using Kernel::Cpu;
 using Kernel::Vm;
 using Board::Vmcb;
+using Vcpu_run_state = Genode::Vcpu_run_state;
 
 
 Vm::Vm(Irq::Pool & user_irq_pool,
@@ -43,13 +45,13 @@
 	Kernel::Object { *this },
 	Cpu_job(Cpu_priority::min(), 0),
 	_user_irq_pool(user_irq_pool),
-	_state(data),
+	_state(*data.vm_state),
 	_context(context),
 	_id(id),
-	_vcpu_context(cpu)
+	_vcpu_context(id.id, &data.vmcb, data.vmcb_phys_addr)
 {
 	affinity(cpu);
-
+	_state.run_state.set(Vcpu_run_state::STARTUP);
 }
 
 
@@ -58,19 +60,165 @@ Vm::~Vm()
 {
 }
 
 
-void Vm::proceed(Cpu &)
+void Vm::proceed(Cpu & cpu)
 {
+	using namespace Board;
+	cpu.switch_to(*_vcpu_context.regs);
+
+	bool do_world_switch = false;
+
+	switch (_state.run_state.value()) {
+	case Vcpu_run_state::STARTUP:        break;
+	case Vcpu_run_state::SYNC_FROM_VCPU: break;
+	case Vcpu_run_state::PAUSING:        break;
+	case Vcpu_run_state::INTERRUPTIBLE:
+		if (_state.run_state.cas(Vcpu_run_state::INTERRUPTIBLE,
+		                         Vcpu_run_state::RUNNING))
+			do_world_switch = true;
+		break;
+	case Vcpu_run_state::RUNNABLE:
+		_state.run_state.cas(Vcpu_run_state::RUNNABLE,
+		                     Vcpu_run_state::RUNNING);
+		[[fallthrough]];
+	case Vcpu_run_state::RUN_ONCE:
+		_vcpu_context.read_vcpu_state(_state);
+		do_world_switch = true;
+		break;
+	default:
+		Genode::error("proceed: illegal state ",
+		              Genode::Hex(_state.run_state.value()));
+	}
+
+	if (do_world_switch) {
+		Cpu::Ia32_tsc_aux::write((Cpu::Ia32_tsc_aux::access_t) _vcpu_context.tsc_aux_guest);
+
+		/*
+		 * We push the host context's physical address to trapno so that
+		 * we can pop it later.
+		 */
+		_vcpu_context.regs->trapno = _vcpu_context.vmcb.root_vmcb_phys;
+		Hypervisor::switch_world(_vcpu_context.vmcb.phys_addr,
+		                         (addr_t)&_vcpu_context.regs->r8,
+		                         _vcpu_context.regs->fpu_context());
+		/*
+		 * This will fall into an interrupt or otherwise jump into
+		 * _kernel_entry.
+		 */
+	} else {
+		_vcpu_context.regs->trapno = TRAP_VMSKIP;
+		Hypervisor::restore_state_for_entry((addr_t)&_vcpu_context.regs->r8,
+		                                    _vcpu_context.regs->fpu_context());
+		/* jumps to _kernel_entry */
+	}
+}
+
+
+void Vm::exception(Cpu & cpu)
+{
+	using namespace Board;
+	using Genode::Cpu_state;
+
+	switch (_vcpu_context.regs->trapno) {
+	case Cpu_state::INTERRUPTS_START ... Cpu_state::INTERRUPTS_END:
+		_interrupt(_user_irq_pool, cpu.id());
+		break;
+	case TRAP_VMEXIT:
+		/* exception method was entered because of a VMEXIT */
+		break;
+	case TRAP_VMSKIP:
+		/* exception method was entered without exception */
+		break;
+	default:
+		Genode::error("VM: triggered unknown exception ",
+		              _vcpu_context.regs->trapno,
+		              " with error code ", _vcpu_context.regs->errcode,
+		              " at ip=", (void *)_vcpu_context.regs->ip,
+		              " sp=", (void *)_vcpu_context.regs->sp);
+		pause();
+		return;
+	};
+
+	enum Svm_exitcodes : Genode::uint64_t {
+		VMEXIT_INVALID = -1ULL,
+		VMEXIT_INTR    = 0x60,
+		VMEXIT_NPF     = 0x400,
+	};
+
+	switch (_state.run_state.value()) {
+	case Vcpu_run_state::STARTUP:
+		_vcpu_context.initialize_svm(cpu, _id.table);
+		_vcpu_context.tsc_aux_host = cpu.id();
+		_vcpu_context.write_vcpu_state(_state, EXIT_STARTUP);
+		_state.run_state.set(Vcpu_run_state::DISPATCHING);
+		pause();
+		_context.submit(1);
+		return;
+	case Vcpu_run_state::SYNC_FROM_VCPU:
+		_vcpu_context.write_vcpu_state(_state, EXIT_PAUSED);
+		_state.run_state.set(Vcpu_run_state::PAUSED);
+		pause();
+		_context.submit(1);
+		return;
+	case Vcpu_run_state::EXITING:  break;
+	case Vcpu_run_state::RUNNING:  break;
+	case Vcpu_run_state::RUN_ONCE: break;
+	case Vcpu_run_state::PAUSING:  return;
+	default:
+		Genode::error("exception: illegal state ",
+		              Genode::Hex(_state.run_state.value()));
+	}
+
+	Genode::uint64_t exitcode = _vcpu_context.vmcb.read();
+
+	switch (exitcode) {
+	case VMEXIT_INVALID:
+		Genode::error("Vm::exception: invalid SVM state!");
+		return;
+	case 0x40 ... 0x5f:
+		Genode::error("Vm::exception: unhandled SVM exception ",
+		              Genode::Hex(exitcode));
+		return;
+	case VMEXIT_INTR:
+		if (!_state.run_state.cas(Vcpu_run_state::RUNNING,
+		                          Vcpu_run_state::INTERRUPTIBLE))
+		{
+			_vcpu_context.write_vcpu_state(_state, EXIT_PAUSED);
+
+			/*
+			 * If the interruptible state couldn't be set, the state might
+			 * be EXITING and a pause() signal might have already been sent
+			 * (to cause the vCPU exit in the first place).
+			 */
+			bool submit = false;
+			/* In the RUN_ONCE case, we first need to send a signal. */
+			if (_state.run_state.value() == Vcpu_run_state::RUN_ONCE)
+				submit = true;
+
+			_state.run_state.set(Vcpu_run_state::PAUSED);
+			pause();
+
+			if (submit)
+				_context.submit(1);
+		}
+		return;
+	case VMEXIT_NPF:
+		exitcode = EXIT_NPF;
+		[[fallthrough]];
+	default:
+		_vcpu_context.write_vcpu_state(_state, (unsigned) exitcode);
+		_state.run_state.set(Vcpu_run_state::DISPATCHING);
+		pause();
+		_context.submit(1);
+		return;
+	};
 }
 
 
-void Vm::exception(Cpu &)
-{
-}
-
-
-Board::Vcpu_context::Vcpu_context(Cpu &)
+Board::Vcpu_context::Vcpu_context(unsigned id, void *vcpu_data_ptr,
+                                  Genode::addr_t context_phys_addr)
 :
-	vmcb(0),
+	vmcb(*Genode::construct_at(vcpu_data_ptr, id, context_phys_addr)),
 	regs(1)
 {
+	regs->trapno = TRAP_VMEXIT;
 }
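On the VMM side, the synthetic codes from Board::Platform_exitcodes arrive in Vcpu_state::exit_reason next to the native SVM exit codes forwarded by Vm::exception() above. A hedged sketch of the dispatch a VMM might perform (the handler body is illustrative, not part of the patch):

    #include <cstdint>
    #include <cstdio>

    /* synthetic exit codes from Board::Platform_exitcodes */
    enum : std::uint64_t { EXIT_NPF = 0xfc, EXIT_STARTUP = 0xfe, EXIT_PAUSED = 0xff };

    void handle_exit(std::uint64_t exit_reason)
    {
    	switch (exit_reason) {
    	case EXIT_STARTUP: std::puts("vCPU created, set up initial state"); break;
    	case EXIT_PAUSED:  std::puts("synthetic pause exit");               break;
    	case EXIT_NPF:     std::puts("nested page fault");                  break;
    	default:           std::printf("native SVM exit %llx\n",
    	                               (unsigned long long)exit_reason);    break;
    	}
    }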
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +using namespace Genode; + +using Exit_config = Vm_connection::Exit_config; + + +/**************************** + ** hw vCPU implementation ** + ****************************/ + +struct Hw_vcpu : Rpc_client, Noncopyable +{ + private: + + Attached_dataspace _state; + Native_capability _kernel_vcpu { }; + Vcpu_handler_base & _vcpu_handler; + Thread * _ep_handler { nullptr }; + unsigned _id { 0 }; + Vcpu_state _stashed_state { }; + bool _need_state_update { false }; + bool _extra_pause { false }; + Vcpu_handler _wrapper; + + void _wrapper_dispatch(); + void _prepare_pause_exit(); + void _update_charged_state(Vcpu_state & old_state, Vcpu_state & new_state); + Capability _create_vcpu(Vm_connection &, Vcpu_handler_base &); + + public: + + const Hw_vcpu& operator=(const Hw_vcpu &) = delete; + Hw_vcpu(const Hw_vcpu&) = delete; + + Hw_vcpu(Env &, Vm_connection &, Vcpu_handler_base &); + + void run(); + void pause(); + + Vm_state & state() { return *_state.local_addr(); } +}; + + +Hw_vcpu::Hw_vcpu(Env &env, Vm_connection &vm, Vcpu_handler_base &handler) +: + Rpc_client(_create_vcpu(vm, handler)), + _state(env.rm(), vm.with_upgrade([&] () { return call(); })), + _vcpu_handler(handler), + _wrapper(handler.ep(), *this, &Hw_vcpu::_wrapper_dispatch) +{ + static unsigned counter = 0; + call(_wrapper.signal_cap()); + _kernel_vcpu = call(); + _id = counter++; +} + + +void Hw_vcpu::_wrapper_dispatch() +{ + /* + * If this is running, the VM is not. Either it hasn't, or it has been + * forced out and has written any state back. + */ + + /* + * We run from the same EP as the orignal dispatch handler that + * will call run() from its dispatch loop, set _ep_handler. + */ + if (!_ep_handler) + _ep_handler = Thread::myself(); + + int run_state = state().run_state.value(); + + /* + * In case the VMM dispatch method waits for a pause signal, + * we need a different state to make the pause() method + * send another signal. + */ + if (run_state == Vcpu_run_state::DISPATCHING) + state().run_state.set(Vcpu_run_state::DISPATCHED); + + if (run_state == Vcpu_run_state::DISPATCHING_PAUSED) + state().run_state.set(Vcpu_run_state::PAUSING); + + /* + * Dispatch the exit originating from the vCPU + */ + if (run_state == Vcpu_run_state::DISPATCHING || + run_state == Vcpu_run_state::DISPATCHING_PAUSED || + run_state == Vcpu_run_state::PAUSED) { + /* Call the VMM's dispatch method. */ + _vcpu_handler.dispatch(1); + /* + * Dispatch will probably have called run(), but if the state is set + * to PAUSING it won't. + */ + } + + /* + * Dispatch a possible folded in pause signal. + * Note that we only the local run_state against pausing. + * If the DISPATCHED state was changed to PAUSING in between, pause() + * has sent a new signal. + */ + if (run_state == Vcpu_run_state::PAUSING || + run_state == Vcpu_run_state::DISPATCHING_PAUSED || + _extra_pause) { + Kernel::pause_vm(Capability_space::capid(_kernel_vcpu)); + _update_charged_state(_stashed_state, state()); + /* + * Tell run() to get any stashed state from the original dispatch. + * Necessary because that state is discharged when the VMM + * dispatches and would be lost otherwise. 
+ */ + _need_state_update = true; + _extra_pause = false; + _prepare_pause_exit(); + state().run_state.set(Vcpu_run_state::PAUSED); + _vcpu_handler.dispatch(1); + } +} + + +void Hw_vcpu::run() +{ + if (_need_state_update) { + _update_charged_state(state(), _stashed_state); + _stashed_state.discharge(); + _need_state_update = false; + } + + switch (state().run_state.value()) { + case Vcpu_run_state::STARTUP: + break; + case Vcpu_run_state::DISPATCHED: + if (_ep_handler != Thread::myself()) { + Genode::error("Vcpu (", _id, ") run: setting run from remote CPU unsupported"); + return; + } + + if (!state().run_state.cas(Vcpu_run_state::DISPATCHED, + Vcpu_run_state::RUNNABLE)) + return; /* state changed to PAUSING */ + break; + case Vcpu_run_state::PAUSED: + state().run_state.set(Vcpu_run_state::RUNNABLE); + /* + * It is the VMM's responsibility to be reasonable here. + * If Vcpu::run() is called assynchronously while the vCPU handler + * is still dispatching a request before pause this breaks. + */ + if (_ep_handler != Thread::myself()) + Genode::warning("Vcpu (", _id, ") run: asynchronous call of run()"); + break; + case Vcpu_run_state::PAUSING: + return; + default: + Genode::error("Vcpu (", _id, ") run: ignoring state ", + Genode::Hex(state().run_state.value())); + return; + } + + Kernel::run_vm(Capability_space::capid(_kernel_vcpu)); +} + + +void Hw_vcpu::pause() +{ + switch (state().run_state.value()) { + /* + * Ignore pause requests before the vCPU has started up. + */ + case Vcpu_run_state::STARTUP: + return; + + /* + * When a pause is requested while starting or dispatching, the original + * exit needs to be handled before a pause exit can be injected. + * In these two cases it may happen be that the pause signal would be + * folded in with the signal from the kernel, therefore we need to make + * sure that the wrapper will prepare the pause exit anyway. + */ + case Vcpu_run_state::DISPATCHING: + if (!state().run_state.cas(Vcpu_run_state::DISPATCHING, + Vcpu_run_state::DISPATCHING_PAUSED)) + pause(); /* moved on to DISPATCHED, retry */ + return; + + /* + * The vCPU could run anytime. Switch to RUN_ONCE to make the kernel + * exit and send a signal after running. + * If the state has changed, it must be to running, in that case retry + * the pause. + */ + case Vcpu_run_state::RUNNABLE: + if (!state().run_state.cas(Vcpu_run_state::RUNNABLE, + Vcpu_run_state::RUN_ONCE)) + { + pause(); + return; + } + + _extra_pause = true; + return; + + /* + * The vCPU may be running, signal that any interrupt exit is because it + * is forced out. + * + * If the CPU is already at the beginning of the exception handling, + * the handler will get two signals: whatever the normal exit would have + * been and the pause exit straight after, which is ok. + * + * If the state is written after it was already switched to + * INTERRUPTIBLE in the exit handler, we simply retry. + */ + case Vcpu_run_state::RUNNING: + if (_ep_handler == Thread::myself()) { + Genode::error("Vcpu (", _id, " ) pause: illegal state in line ", __LINE__ ); + return; + }; + + if (!state().run_state.cas(Vcpu_run_state::RUNNING, + Vcpu_run_state::EXITING)) { + pause(); + return; + } + break; + + /* + * A pause request is received when the CPU was already forced out. + * In this case we need to write the state back first and send the + * signal later. If this comes from another thread then it may be + * interrupted after reading the state, while the vCPU thread starts + * RUNNING. Therefore if the swap fails, retry the pause(). 
+ */ + case Vcpu_run_state::INTERRUPTIBLE: + if (!state().run_state.cas(Vcpu_run_state::INTERRUPTIBLE, + Vcpu_run_state::SYNC_FROM_VCPU)) + pause(); + return; + + /* + * A pause is requested while the VM has been dispatched. + * Send a new signal in case the VMM waits for a pause() exit + * before doing another run. + */ + case Vcpu_run_state::DISPATCHED: + if (!state().run_state.cas(Vcpu_run_state::DISPATCHED, + Vcpu_run_state::PAUSING)) { + pause(); + return; + } + break; + + /* + * We're already pausing or paused, ignore it. + */ + default: + return; + } + + _wrapper.local_submit(); +} + + +/* + * Prepare a pause exit to dispatch to the VMM. + * Because we don't do a round trip to the kernel we charge some state to keep + * seoul happy. + */ +void Hw_vcpu::_prepare_pause_exit() +{ + state().exit_reason = 0xFF; + state().ax.set_charged(); + state().bx.set_charged(); + state().cx.set_charged(); + state().dx.set_charged(); + + state().bp.set_charged(); + state().di.set_charged(); + state().si.set_charged(); + + state().flags.set_charged(); + + state().sp.set_charged(); + + state().ip.set_charged(); + state().ip_len.set_charged(); + + state().qual_primary.set_charged(); + state().qual_secondary.set_charged(); + + state().intr_state.set_charged(); + state().actv_state.set_charged(); + + state().inj_info.set_charged(); + state().inj_error.set_charged(); +} + + +/* + * Update fields not already charged from one Vcpu_state to the other. + */ +void Hw_vcpu::_update_charged_state(Vcpu_state & old_state, Vcpu_state & new_state) +{ + if (new_state.ax.charged() || new_state.cx.charged() || + new_state.dx.charged() || new_state.bx.charged()) { + old_state.ax.update(new_state.ax.value()); + old_state.cx.update(new_state.cx.value()); + old_state.dx.update(new_state.dx.value()); + old_state.bx.update(new_state.bx.value()); + } + if (new_state.bp.charged() || new_state.di.charged() || + new_state.si.charged()) { + old_state.bp.update(new_state.bp.value()); + old_state.si.update(new_state.si.value()); + old_state.di.update(new_state.di.value()); + } + if (new_state.sp.charged()) { + old_state.sp.update(new_state.sp.value()); + } + if (new_state.ip.charged()) { + old_state.ip.update(new_state.ip.value()); + old_state.ip_len.update(new_state.ip_len.value()); + } + if (new_state.flags.charged()) { + old_state.flags.update(new_state.flags.value()); + } + if (new_state.es.charged() || new_state.ds.charged()) { + old_state.es.update(new_state.es.value()); + old_state.ds.update(new_state.ds.value()); + } + if (new_state.fs.charged() || new_state.gs.charged()) { + old_state.fs.update(new_state.fs.value()); + old_state.gs.update(new_state.gs.value()); + } + if (new_state.cs.charged() || new_state.ss.charged()) { + old_state.cs.update(new_state.cs.value()); + old_state.ss.update(new_state.ss.value()); + } + if (new_state.tr.charged()) { + old_state.tr.update(new_state.tr.value()); + } + if (new_state.ldtr.charged()) { + old_state.ldtr.update(new_state.ldtr.value()); + } + if (new_state.gdtr.charged()) { + old_state.gdtr.update(new_state.gdtr.value()); + } + if (new_state.idtr.charged()) { + old_state.idtr.update(new_state.idtr.value()); + } + if (new_state.cr0.charged() || new_state.cr2.charged() || + new_state.cr3.charged() || new_state.cr4.charged()) { + old_state.cr0.update(new_state.cr0.value()); + old_state.cr2.update(new_state.cr2.value()); + old_state.cr3.update(new_state.cr3.value()); + old_state.cr4.update(new_state.cr4.value()); + } + if (new_state.dr7.charged()) { + 
+	if (new_state.r8 .charged() || new_state.r9 .charged() ||
+	    new_state.r10.charged() || new_state.r11.charged() ||
+	    new_state.r12.charged() || new_state.r13.charged() ||
+	    new_state.r14.charged() || new_state.r15.charged()) {
+		old_state.r8.update(new_state.r8.value());
+		old_state.r9.update(new_state.r9.value());
+		old_state.r10.update(new_state.r10.value());
+		old_state.r11.update(new_state.r11.value());
+		old_state.r12.update(new_state.r12.value());
+		old_state.r13.update(new_state.r13.value());
+		old_state.r14.update(new_state.r14.value());
+		old_state.r15.update(new_state.r15.value());
+	}
+	if (new_state.star .charged() || new_state.lstar.charged() ||
+	    new_state.cstar.charged() || new_state.fmask.charged() ||
+	    new_state.kernel_gs_base.charged()) {
+		old_state.star.update(new_state.star.value());
+		old_state.lstar.update(new_state.lstar.value());
+		old_state.cstar.update(new_state.cstar.value());
+		old_state.fmask.update(new_state.fmask.value());
+		old_state.kernel_gs_base.update(new_state.kernel_gs_base.value());
+	}
+	if (new_state.tpr.charged() || new_state.tpr_threshold.charged()) {
+		old_state.tpr.update(new_state.tpr.value());
+		old_state.tpr_threshold.update(new_state.tpr_threshold.value());
+	}
+}
+
+
+Capability Hw_vcpu::_create_vcpu(Vm_connection &vm,
+                                 Vcpu_handler_base &handler)
+{
+	Thread &tep { *reinterpret_cast(&handler.rpc_ep()) };
+
+	return vm.with_upgrade([&] () {
+		return vm.call(tep.cap()); });
+}
+
+
+/**************
+ ** vCPU API **
+ **************/
+
+void         Vm_connection::Vcpu::run()   { static_cast(_native_vcpu).run(); }
+void         Vm_connection::Vcpu::pause() { static_cast(_native_vcpu).pause(); }
+Vcpu_state & Vm_connection::Vcpu::state() { return static_cast(_native_vcpu).state(); }
+
+
+Vm_connection::Vcpu::Vcpu(Vm_connection &vm, Allocator &alloc,
+                          Vcpu_handler_base &handler, Exit_config const &)
+:
+	_native_vcpu(*new (alloc) Hw_vcpu(vm._env, vm, handler))
+{ }
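For context, the contract these entry points establish for a VMM: every exit is pushed to the registered handler, which inspects state() and resumes the guest with run(); pause() eventually yields exactly one synthetic 0xff exit, after which the vCPU stays stopped until run() is called again. A toy model of that control flow (stand-in types only, not the Genode API):

    #include <cstdio>
    #include <functional>

    struct Vcpu_stub
    {
    	std::function<void(unsigned)> handler;   /* exit dispatch */
    	void run()   { std::puts("resume guest"); }
    	void pause() { handler(0xff); }          /* synthetic pause exit */
    };

    int main()
    {
    	Vcpu_stub vcpu;
    	vcpu.handler = [&] (unsigned exit_reason) {
    		std::printf("exit %x\n", exit_reason);
    		if (exit_reason != 0xff)  /* paused vCPUs wait for an explicit run */
    			vcpu.run();
    	};

    	vcpu.handler(0xfe);  /* startup exit: handler resumes the guest   */
    	vcpu.pause();        /* pause exit:   handler does not resume it  */
    }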