base-hw: virt: implement support for SVM

Ref #4826
Benjamin Lamowski 2023-05-16 14:57:16 +02:00 committed by Christian Helmuth
parent b277b84e19
commit e882ca748d
11 changed files with 1176 additions and 28 deletions

View File

@@ -15,18 +15,23 @@
#define _INCLUDE__SPEC__PC__VM_STATE_H_
/* x86 CPU state */
#include <cpu/vcpu_state.h>
#include <virtualization/extended_vcpu_state.h>
#include <virtualization/svm.h>
namespace Genode {
/**
* CPU context of a virtual machine
*/
struct Vm_state;
using Vm_data = Vm_state;
struct Vm_data;
}
struct Genode::Vm_state : Genode::Vcpu_state
{};
struct Genode::Vm_data
{
Board::Vmcb vmcb;
Genode::addr_t vmcb_phys_addr;
Genode::Vm_state * vm_state;
};
#endif /* _INCLUDE__SPEC__PC__VM_STATE_H_ */

View File

@@ -0,0 +1,139 @@
/*
* \brief Extended vCPU state
* \author Benjamin Lamowski
* \date 2023-05-25
*/
/*
* Copyright (C) 2023 Genode Labs GmbH
*
* This file is part of the Genode OS framework, which is distributed
* under the terms of the GNU Affero General Public License version 3.
*/
#ifndef _INCLUDE__SPEC__X86_64_VIRTUALIZATION__EXTENDED_VCPU_STATE_H
#define _INCLUDE__SPEC__X86_64_VIRTUALIZATION__EXTENDED_VCPU_STATE_H
/* x86 CPU state */
#include <cpu/vcpu_state.h>
#include <cpu/atomic.h>
namespace Genode {
struct Vm_state;
struct Vcpu_run_state;
}
/*
* Run state of the vCPU, synchronized between the VMM library and the
* kernel.
*/
class Genode::Vcpu_run_state : Noncopyable
{
public:
enum Value : int {
/*
* vCPU isn't initialized yet. Needed for initialization in
* Vm::exception() and to block premature pause requests.
*/
STARTUP = 0,
/*
* The vCPU is runnable but not yet running. Used in pause() to
* make the vCPU run once (RUN_ONCE).
*/
RUNNABLE = 1,
/*
* The vCPU hasn't run yet, but a pause has been requested.
* Run the vCPU once, dispatch the result and then issue a pause
* request.
*/
RUN_ONCE = 2,
/*
* The vCPU is running. Used in pause() to force an exit only when the
* vCPU is actually running.
*/
RUNNING = 3,
/*
* vCPU has exited because of an external interrupt and could run
* without state syncing. Needed to skip state syncing in Vm::proceed
* and to request updating the state from the vCPU in case of a
* Vcpu::pause() (SYNC_FROM_VCPU)
*/
INTERRUPTIBLE = 4,
/*
* vCPU is running and is being forced out by a thread on a remote core
* by signaling the vCPU's handler. Causes a state writeback and
* Vm::pause() after an external interrupt VM exit.
*/
EXITING = 5,
/*
* A Vcpu::pause() request was issued while the vCPU was INTERRUPTIBLE.
* Skips the next run in Vm::proceed() and causes a full pause exit in
* the subsequent Vm::exception().
*/
SYNC_FROM_VCPU = 6,
/*
* The vCPU is dispatching a signal to the handler in the VMM. Needed to
* distinguish between a dispatch from the vCPU and a dispatch from an
* asynchronous pause request.
*/
DISPATCHING = 7,
/*
* The vCPU needs to first dispatch an exit in the VMM, and a pause
* request needs to be injected right after.
*/
DISPATCHING_PAUSED = 8,
/*
* An exit has been dispatched to the VMM. Needed to let
* an asynchronous pause request dispatch a new signal.
*/
DISPATCHED = 9,
/*
* The vCPU was RUNNABLE or DISPATCHED, but a pause has been requested.
* Used to create a pause exit in the wrapper.
*/
PAUSING = 10,
/*
* The vCPU handler in the VMM is dispatching and a pause
* signal has been issued. Needed to skip more pause requests.
* FIXME
*/
PAUSED = 11
};
private:
int _value { }; /* base type of Vcpu_run_state */
public:
Value value() const { return Value(_value); }
void set(Value const &value)
{
_value = value;
}
bool cas(Value cmp_val, Value new_val)
{
return cmpxchg(&_value, cmp_val, new_val);
}
};
struct Genode::Vm_state : Genode::Vcpu_state
{
Vcpu_run_state run_state;
};
#endif /* _INCLUDE__SPEC__X86_64_VIRTUALIZATION__EXTENDED_VCPU_STATE_H */
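The run state above is shared between the VMM-side wrapper and the kernel and is only ever advanced through the cmpxchg-based cas(). A minimal self-contained model of that discipline, using std::atomic as a stand-in for Genode's cmpxchg (the class name and the use of std::atomic are illustrative assumptions, not part of this commit):

#include <atomic>

/* illustrative stand-in for Genode::Vcpu_run_state, not the real class */
class Run_state_model
{
    std::atomic<int> _value { 0 /* STARTUP */ };

    public:

        int value() const { return _value.load(); }

        void set(int v) { _value.store(v); }

        /* mirrors cas(): succeeds only if no other party raced us */
        bool cas(int cmp_val, int new_val)
        {
            return _value.compare_exchange_strong(cmp_val, new_val);
        }
};

Every transition in Vm::proceed(), Vm::exception(), and Hw_vcpu::pause() below follows this pattern: attempt the cas() and, on failure, re-inspect the state instead of assuming the transition took place.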

View File

@@ -1 +1,3 @@
vpath vm.cc $(REP_DIR)/src/lib/base/x86_64
include $(REP_DIR)/lib/mk/base-hw.inc

View File

@@ -93,6 +93,7 @@
.set vec, vec + 1
.endr
.global _kernel_entry
_kernel_entry:
pushq %rbp

View File

@@ -31,11 +31,23 @@ namespace Board {
struct Vcpu_context;
using Vm_state = Genode::Vm_state;
using Vm_data = Genode::Vm_data;
using Vm_state = Genode::Vm_state;
enum {
VCPU_MAX = 16
VCPU_MAX = 16
};
/* FIXME move into Vcpu_context as 'enum class' when we have C++20 */
enum Platform_exitcodes : Genode::uint64_t {
EXIT_NPF = 0xfc,
EXIT_STARTUP = 0xfe,
EXIT_PAUSED = 0xff,
};
enum Custom_trapnos {
TRAP_VMEXIT = 256,
TRAP_VMSKIP = 257,
};
};
@@ -48,11 +60,16 @@ namespace Kernel {
struct Board::Vcpu_context
{
Vcpu_context(Kernel::Cpu & cpu);
Vcpu_context(unsigned id, void *vcpu_data_ptr,
Genode::addr_t context_phys_addr);
void initialize_svm(Kernel::Cpu &cpu, void *table);
void read_vcpu_state(Genode::Vcpu_state &state);
void write_vcpu_state(Genode::Vcpu_state &state, unsigned exit_reason);
Vmcb vmcb;
Vmcb &vmcb;
Genode::Align_at<Core::Cpu::Context> regs;
Genode::uint64_t tsc_aux_host = 0U;
Genode::uint64_t tsc_aux_guest = 0U;
};
#endif /* _CORE__SPEC__PC__VIRTUALIZATION__BOARD_H_ */

View File

@@ -22,11 +22,83 @@ namespace Hypervisor {
using Call_arg = Genode::umword_t;
using Call_ret = Genode::umword_t;
inline void switch_world(Call_arg guest_state [[maybe_unused]],
Call_arg host_state [[maybe_unused]],
Call_arg pic_state [[maybe_unused]],
Call_arg ttbr [[maybe_unused]])
inline void restore_state_for_entry(Call_arg regs, Call_arg fpu_context)
{
asm volatile(
"fxrstor (%[fpu_context]);"
"mov %[regs], %%rsp;"
"popq %%r8;"
"popq %%r9;"
"popq %%r10;"
"popq %%r11;"
"popq %%r12;"
"popq %%r13;"
"popq %%r14;"
"popq %%r15;"
"popq %%rax;"
"popq %%rbx;"
"popq %%rcx;"
"popq %%rdx;"
"popq %%rdi;"
"popq %%rsi;"
"popq %%rbp;"
"sti;" /* maybe enter the kernel to handle an external
interrupt that occurred ... */
"nop;"
"cli;" /* ... otherwise, just disable interrupts again */
"jmp _kernel_entry;"
:
: [regs] "r"(regs), [fpu_context] "r"(fpu_context)
: "memory");
};
inline void switch_world(Call_arg guest_state, Call_arg regs,
Call_arg fpu_context)
{
asm volatile(
"fxrstor (%[fpu_context]);"
"mov %[guest_state], %%rax;"
"mov %[regs], %%rsp;"
"popq %%r8;"
"popq %%r9;"
"popq %%r10;"
"popq %%r11;"
"popq %%r12;"
"popq %%r13;"
"popq %%r14;"
"popq %%r15;"
"add $8, %%rsp;" /* don't pop rax */
"popq %%rbx;"
"popq %%rcx;"
"popq %%rdx;"
"popq %%rdi;"
"popq %%rsi;"
"popq %%rbp;"
"clgi;"
"sti;"
"vmload;"
"vmrun;"
"vmsave;"
"popq %%rax;" /* get the physical address of the host VMCB from
the stack */
"vmload;"
"stgi;" /* maybe enter the kernel to handle an external interrupt
that occurred ... */
"nop;"
"cli;" /* ... otherwise, just disable interrupts again */
"pushq $256;" /* make the stack point to trapno, the right place
to jump to _kernel_entry. We push 256 because
this is outside of the valid range for interrupts
*/
"jmp _kernel_entry;" /* jump to _kernel_entry to save the
GPRs without clobbering any */
:
: [regs] "r"(regs), [fpu_context] "r"(fpu_context),
[guest_state] "r"(guest_state)
: "rax", "memory");
}
}
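Both asm routines consume a fixed quadword layout starting at regs->r8, and the slot that follows the GPRs (trapno) carries the host VMCB's physical address into switch_world; the pushq $256 re-creates that slot before the jump to _kernel_entry. A sketch of the layout implied by the pop sequence (the struct name and the idea of mirroring it as a C++ type are assumptions; the field order is inferred from the asm above):

#include <cstdint>

/* quadword layout consumed by the pop sequence, starting at &regs->r8 */
struct Gpr_frame
{
    std::uint64_t r8, r9, r10, r11, r12, r13, r14, r15;
    std::uint64_t rax;    /* skipped via 'add $8, %rsp' in switch_world */
    std::uint64_t rbx, rcx, rdx, rdi, rsi, rbp;
    std::uint64_t trapno; /* holds the host-VMCB phys addr before vmrun */
};

static_assert(sizeof(Gpr_frame) == 16*8,
              "the asm pops or skips exactly 16 quadwords");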

View File

@@ -154,3 +154,275 @@ void Board::Vcpu_context::initialize_svm(Kernel::Cpu & cpu, void * table)
vmcb.init(cpu.id(), table);
}
void Board::Vcpu_context::write_vcpu_state(Genode::Vcpu_state &state, unsigned exit_reason)
{
typedef Genode::Vcpu_state::Range Range;
state.discharge();
state.exit_reason = exit_reason;
state.fpu.charge([&] (Genode::Vcpu_state::Fpu::State &fpu) {
memcpy(&fpu, (void *) regs->fpu_context(), sizeof(fpu));
});
state.ax.charge(vmcb.rax);
state.cx.charge(regs->rcx);
state.dx.charge(regs->rdx);
state.bx.charge(regs->rbx);
state.di.charge(regs->rdi);
state.si.charge(regs->rsi);
state.bp.charge(regs->rbp);
state.ip.charge(vmcb.rip);
/*
* SVM doesn't use ip_len, so just leave the old value.
* We still have to charge it when charging ip.
*/
state.ip_len.set_charged();
state.flags.charge(vmcb.rflags);
state.sp.charge(vmcb.rsp);
state.dr7.charge(vmcb.dr7);
state. r8.charge(regs->r8);
state. r9.charge(regs->r9);
state.r10.charge(regs->r10);
state.r11.charge(regs->r11);
state.r12.charge(regs->r12);
state.r13.charge(regs->r13);
state.r14.charge(regs->r14);
state.r15.charge(regs->r15);
state.cr0.charge(vmcb.cr0);
state.cr2.charge(vmcb.cr2);
state.cr3.charge(vmcb.cr3);
state.cr4.charge(vmcb.cr4);
state.cs.charge(vmcb.cs);
state.ss.charge(vmcb.ss);
state.es.charge(vmcb.es);
state.ds.charge(vmcb.ds);
state.fs.charge(vmcb.fs);
state.gs.charge(vmcb.gs);
state.tr.charge(vmcb.tr);
state.ldtr.charge(vmcb.ldtr);
state.gdtr.charge(Range { .limit = vmcb.gdtr.limit,
.base = vmcb.gdtr.base });
state.idtr.charge(Range { .limit = vmcb.idtr.limit,
.base = vmcb.idtr.base });
state.sysenter_cs.charge(vmcb.sysenter_cs);
state.sysenter_sp.charge(vmcb.sysenter_esp);
state.sysenter_ip.charge(vmcb.sysenter_eip);
state.qual_primary.charge(vmcb.read<Vmcb::Exitinfo1>());
state.qual_secondary.charge(vmcb.read<Vmcb::Exitinfo2>());
state.ctrl_primary.charge(vmcb.read<Vmcb::Intercept_misc1>());
state.ctrl_secondary.charge(vmcb.read<Vmcb::Intercept_misc2>());
state.inj_info.charge(vmcb.read<Vmcb::Exitintinfo>() & 0xFFFFFFFF);
state.inj_error.charge((Genode::uint32_t)
(vmcb.read<Vmcb::Exitintinfo>() >> 32));
/* Guest is in an interrupt shadow, see 15.21.5 */
state.intr_state.charge((unsigned)
vmcb.read<Vmcb::Int_control_ext::Int_shadow>());
/* Guest activity state (actv) not used by SVM */
state.actv_state.set_charged();
state.tsc.charge(Hw::Lapic::rdtsc());
state.tsc_offset.charge(vmcb.read<Vmcb::Tsc_offset>());
tsc_aux_guest = Cpu::Ia32_tsc_aux::read();
state.tsc_aux.charge(tsc_aux_guest);
Cpu::Ia32_tsc_aux::write((Cpu::Ia32_tsc_aux::access_t) tsc_aux_host);
state.efer.charge(vmcb.efer);
/* pdpte not used by SVM */
state.star.charge(vmcb.star);
state.lstar.charge(vmcb.lstar);
state.cstar.charge(vmcb.cstar);
state.fmask.charge(vmcb.sfmask);
state.kernel_gs_base.charge(vmcb.kernel_gs_base);
/* Task Priority Register, see 15.24 */
state.tpr.charge((unsigned) vmcb.read<Vmcb::Int_control::V_tpr>());
/* TPR threshold not used by SVM */
}
void Board::Vcpu_context::read_vcpu_state(Genode::Vcpu_state &state)
{
if (state.ax.charged() || state.cx.charged() ||
state.dx.charged() || state.bx.charged()) {
vmcb.rax = state.ax.value();
regs->rcx = state.cx.value();
regs->rdx = state.dx.value();
regs->rbx = state.bx.value();
}
if (state.bp.charged() || state.di.charged() || state.si.charged()) {
regs->rdi = state.di.value();
regs->rsi = state.si.value();
regs->rbp = state.bp.value();
}
if (state.flags.charged()) {
vmcb.rflags = state.flags.value();
}
if (state.sp.charged()) {
vmcb.rsp = state.sp.value();
}
if (state.ip.charged()) {
vmcb.rip = state.ip.value();
/* ip_len not used by SVM */
}
if (state.dr7.charged()) {
vmcb.dr7 = state.dr7.value();
}
if (state.r8 .charged() || state.r9 .charged() ||
state.r10.charged() || state.r11.charged() ||
state.r12.charged() || state.r13.charged() ||
state.r14.charged() || state.r15.charged()) {
regs->r8 = state.r8.value();
regs->r9 = state.r9.value();
regs->r10 = state.r10.value();
regs->r11 = state.r11.value();
regs->r12 = state.r12.value();
regs->r13 = state.r13.value();
regs->r14 = state.r14.value();
regs->r15 = state.r15.value();
}
if (state.cr0.charged() || state.cr2.charged() ||
state.cr3.charged() || state.cr4.charged()) {
vmcb.cr0 = state.cr0.value();
vmcb.cr2 = state.cr2.value();
vmcb.cr3 = state.cr3.value();
vmcb.cr4 = state.cr4.value();
}
if (state.cs.charged() || state.ss.charged()) {
vmcb.cs = state.cs.value();
vmcb.ss = state.ss.value();
}
if (state.es.charged() || state.ds.charged()) {
vmcb.es = state.es.value();
vmcb.ds = state.ds.value();
}
if (state.fs.charged() || state.gs.charged()) {
vmcb.fs = state.fs.value();
vmcb.gs = state.gs.value();
}
if (state.tr.charged()) {
vmcb.tr = state.tr.value();
}
if (state.ldtr.charged()) {
vmcb.ldtr = state.ldtr.value();
}
if (state.gdtr.charged()) {
vmcb.gdtr.limit = state.gdtr.value().limit;
vmcb.gdtr.base = state.gdtr.value().base;
}
if (state.idtr.charged()) {
vmcb.idtr.limit = state.idtr.value().limit;
vmcb.idtr.base = state.idtr.value().base;
}
if (state.sysenter_cs.charged() || state.sysenter_sp.charged() ||
state.sysenter_ip.charged()) {
vmcb.sysenter_cs = state.sysenter_cs.value();
vmcb.sysenter_esp = state.sysenter_sp.value();
vmcb.sysenter_eip = state.sysenter_ip.value();
}
if (state.ctrl_primary.charged() || state.ctrl_secondary.charged()) {
vmcb.enforce_intercepts(state.ctrl_primary.value(),
state.ctrl_secondary.value());
}
if (state.inj_info.charged() || state.inj_error.charged()) {
/* Honor special signaling bit */
if (state.inj_info.value() & 0x1000) {
vmcb.write<Vmcb::Int_control::V_irq>(1);
vmcb.write<Vmcb::Int_control::V_ign_tpr>(1);
vmcb.write<Vmcb::Intercept_misc1::Vintr>(1);
} else {
vmcb.write<Vmcb::Int_control::V_irq>(0);
vmcb.write<Vmcb::Int_control::V_ign_tpr>(0);
vmcb.write<Vmcb::Intercept_misc1::Vintr>(0);
}
vmcb.write<Vmcb::Eventinj>(
/* Filter out special signaling bits */
(state.inj_info.value() &
(Genode::uint32_t) ~0x3000) |
(((Genode::uint64_t) state.inj_error.value()) << 32)
);
}
if (state.intr_state.charged()) {
vmcb.write<Vmcb::Int_control_ext::Int_shadow>(state.intr_state.value());
}
/* Guest activity state (actv) not used by SVM */
if (state.tsc_offset.charged()) {
/* state.tsc not used by SVM */
vmcb.write<Vmcb::Tsc_offset>(vmcb.read<Vmcb::Tsc_offset>() +
state.tsc_offset.value());
}
tsc_aux_host = Cpu::Ia32_tsc_aux::read();
if (state.tsc_aux.charged()) {
tsc_aux_guest = state.tsc_aux.value();
}
Cpu::Ia32_tsc_aux::write((Cpu::Ia32_tsc_aux::access_t) tsc_aux_guest);
if (state.efer.charged()) {
vmcb.efer = state.efer.value();
}
/* pdpte not used by SVM */
if (state.star.charged() || state.lstar.charged() ||
state.cstar.charged() || state.fmask.charged() ||
state.kernel_gs_base.charged()) {
vmcb.star = state.star.value();
vmcb.cstar = state.cstar.value();
vmcb.lstar = state.lstar.value();
vmcb.sfmask = state.fmask.value();
vmcb.kernel_gs_base = state.kernel_gs_base.value();
}
if (state.tpr.charged()) {
vmcb.write<Vmcb::Int_control::V_tpr>(state.tpr.value());
/* TPR threshold not used on AMD */
}
if (state.fpu.charged()) {
state.fpu.with_state([&] (Genode::Vcpu_state::Fpu::State const &fpu) {
memcpy((void *) regs->fpu_context(), &fpu, sizeof(fpu));
});
}
}
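Both methods rely on the charge/discharge protocol of Genode::Vcpu_state: write_vcpu_state() charges every item it refreshes from the VMCB, and read_vcpu_state() only writes back items that the VMM charged. A reduced model of one state item (simplified from the real Vcpu_state, so take names and semantics as an approximation):

/* simplified model of a single Vcpu_state item */
template <typename T>
class Item
{
    T    _value   { };
    bool _charged { false };

    public:

        void charge(T const &v) { _value = v; _charged = true; }
        void set_charged()      { _charged = true; }  /* mark without a new value */
        T    value()   const    { return _value; }
        bool charged() const    { return _charged; }
        void discharge()        { _charged = false; } /* reset before a new exit */
};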

View File

@@ -13,6 +13,7 @@
#include <base/log.h>
#include <cpu/vm_state_virtualization.h>
#include <util/construct_at.h>
#include <util/mmio.h>
#include <cpu/string.h>
@@ -24,14 +25,15 @@
#include <kernel/vm.h>
#include <kernel/main.h>
#include <spec/x86_64/virtualization/hypervisor.h>
#include <spec/x86_64/virtualization/svm.h>
#include <virtualization/hypervisor.h>
#include <virtualization/svm.h>
#include <hw/spec/x86_64/x86_64.h>
using Genode::addr_t;
using Kernel::Cpu;
using Kernel::Vm;
using Board::Vmcb;
using Vcpu_run_state = Genode::Vcpu_run_state;
Vm::Vm(Irq::Pool & user_irq_pool,
@@ -43,13 +45,13 @@ Vm::Vm(Irq::Pool & user_irq_pool,
Kernel::Object { *this },
Cpu_job(Cpu_priority::min(), 0),
_user_irq_pool(user_irq_pool),
_state(data),
_state(*data.vm_state),
_context(context),
_id(id),
_vcpu_context(cpu)
_vcpu_context(id.id, &data.vmcb, data.vmcb_phys_addr)
{
affinity(cpu);
_state.run_state.set(Vcpu_run_state::STARTUP);
}
@@ -58,19 +60,165 @@ Vm::~Vm()
}
void Vm::proceed(Cpu &)
void Vm::proceed(Cpu & cpu)
{
using namespace Board;
cpu.switch_to(*_vcpu_context.regs);
bool do_world_switch = false;
switch (_state.run_state.value()) {
case Vcpu_run_state::STARTUP: break;
case Vcpu_run_state::SYNC_FROM_VCPU: break;
case Vcpu_run_state::PAUSING: break;
case Vcpu_run_state::INTERRUPTIBLE:
if (_state.run_state.cas(Vcpu_run_state::INTERRUPTIBLE,
Vcpu_run_state::RUNNING))
do_world_switch = true;
break;
case Vcpu_run_state::RUNNABLE:
_state.run_state.cas(Vcpu_run_state::RUNNABLE,
Vcpu_run_state::RUNNING);
[[fallthrough]];
case Vcpu_run_state::RUN_ONCE:
_vcpu_context.read_vcpu_state(_state);
do_world_switch = true;
break;
default:
Genode::error("proceed: illegal state ",
Genode::Hex(_state.run_state.value()));
}
if (do_world_switch) {
Cpu::Ia32_tsc_aux::write((Cpu::Ia32_tsc_aux::access_t) _vcpu_context.tsc_aux_guest);
/*
* We push the host context's physical address to trapno so that
* we can pop it later
*/
_vcpu_context.regs->trapno = _vcpu_context.vmcb.root_vmcb_phys;
Hypervisor::switch_world(_vcpu_context.vmcb.phys_addr,
(addr_t)&_vcpu_context.regs->r8,
_vcpu_context.regs->fpu_context());
/*
* This will fall into an interrupt or otherwise jump into
* _kernel_entry
*/
} else {
_vcpu_context.regs->trapno = TRAP_VMSKIP;
Hypervisor::restore_state_for_entry((addr_t)&_vcpu_context.regs->r8,
_vcpu_context.regs->fpu_context());
/* jumps to _kernel_entry */
}
}
void Vm::exception(Cpu & cpu)
{
using namespace Board;
using Genode::Cpu_state;
switch (_vcpu_context.regs->trapno) {
case Cpu_state::INTERRUPTS_START ... Cpu_state::INTERRUPTS_END:
_interrupt(_user_irq_pool, cpu.id());
break;
case TRAP_VMEXIT:
/* exception method was entered because of a VMEXIT */
break;
case TRAP_VMSKIP:
/* exception method was entered without exception */
break;
default:
Genode::error("VM: triggered unknown exception ",
_vcpu_context.regs->trapno,
" with error code ", _vcpu_context.regs->errcode,
" at ip=",
(void *)_vcpu_context.regs->ip, " sp=",
(void *)_vcpu_context.regs->sp);
pause();
return;
};
enum Svm_exitcodes : Genode::uint64_t {
VMEXIT_INVALID = -1ULL,
VMEXIT_INTR = 0x60,
VMEXIT_NPF = 0x400,
};
switch (_state.run_state.value()) {
case Vcpu_run_state::STARTUP:
_vcpu_context.initialize_svm(cpu, _id.table);
_vcpu_context.tsc_aux_host = cpu.id();
_vcpu_context.write_vcpu_state(_state, EXIT_STARTUP);
_state.run_state.set(Vcpu_run_state::DISPATCHING);
pause();
_context.submit(1);
return;
case Vcpu_run_state::SYNC_FROM_VCPU:
_vcpu_context.write_vcpu_state(_state, EXIT_PAUSED);
_state.run_state.set(Vcpu_run_state::PAUSED);
pause();
_context.submit(1);
return;
case Vcpu_run_state::EXITING: break;
case Vcpu_run_state::RUNNING: break;
case Vcpu_run_state::RUN_ONCE: break;
case Vcpu_run_state::PAUSING: return;
default:
Genode::error("exception: illegal state ",
Genode::Hex(_state.run_state.value()));
}
Genode::uint64_t exitcode = _vcpu_context.vmcb.read<Vmcb::Exitcode>();
switch (exitcode) {
case VMEXIT_INVALID:
Genode::error("Vm::exception: invalid SVM state!");
return;
case 0x40 ... 0x5f:
Genode::error("Vm::exception: unhandled SVM exception ",
Genode::Hex(exitcode));
return;
case VMEXIT_INTR:
if (!_state.run_state.cas(Vcpu_run_state::RUNNING,
Vcpu_run_state::INTERRUPTIBLE))
{
_vcpu_context.write_vcpu_state(_state, EXIT_PAUSED);
/*
* If the interruptible state couldn't be set, the state might
* be EXITING and a pause() signal might have already been sent
* (to cause the vCPU exit in the first place).
*/
bool submit = false;
/* In the RUN_ONCE case, we first need to send a signal. */
if (_state.run_state.value() == Vcpu_run_state::RUN_ONCE)
submit = true;
_state.run_state.set(Vcpu_run_state::PAUSED);
pause();
if (submit)
_context.submit(1);
}
return;
case VMEXIT_NPF:
exitcode = EXIT_NPF;
[[fallthrough]];
default:
_vcpu_context.write_vcpu_state(_state, (unsigned) exitcode);
_state.run_state.set(Vcpu_run_state::DISPATCHING);
pause();
_context.submit(1);
return;
};
}
void Vm::exception(Cpu &)
{
}
Board::Vcpu_context::Vcpu_context(Cpu &)
Board::Vcpu_context::Vcpu_context(unsigned id, void *vcpu_data_ptr,
Genode::addr_t context_phys_addr)
:
vmcb(0),
vmcb(*Genode::construct_at<Vmcb>(vcpu_data_ptr, id, context_phys_addr)),
regs(1)
{
regs->trapno = TRAP_VMEXIT;
}

View File

@@ -103,7 +103,21 @@ static Vmid_allocator &alloc()
Genode::addr_t Vm_session_component::_alloc_vm_data(Genode::addr_t ds_addr)
{
return ds_addr;
void * vm_data_ptr = cma()
.alloc_aligned(sizeof(Board::Vm_data), 12)
.convert<void *>(
[&](void *ptr) { return ptr; },
[&](Range_allocator::Alloc_error) -> void * {
/* XXX handle individual error conditions */
error("failed to allocate kernel object");
throw Insufficient_ram_quota();
}
);
Genode::Vm_data* vm_data = (Genode::Vm_data *) vm_data_ptr;
vm_data->vm_state = (Genode::Vm_state *) ds_addr;
vm_data->vmcb_phys_addr = (addr_t)cma().phys_addr(&vm_data->vmcb);
return (Genode::addr_t) vm_data_ptr;
}
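AMD SVM requires the VMCB's physical address to be 4-KiB-aligned, which is why the allocation uses a log2 alignment of 12 and why vmcb is the first member of Vm_data. A small sanity check one could apply to the returned address (a hypothetical helper, not part of the commit):

#include <cstdint>

/* hypothetical check: the VMCB sits at offset 0 of the 4-KiB-aligned Vm_data */
inline bool vmcb_aligned(std::uintptr_t vm_data_addr)
{
    constexpr std::uintptr_t vmcb_align = 1u << 12; /* alloc_aligned(..., 12) */
    return (vm_data_addr & (vmcb_align - 1)) == 0;
}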

View File

@@ -65,7 +65,7 @@ Capability<Vm_session::Native_vcpu> Vm_session_component::create_vcpu(Thread_cap
try {
vcpu.ds_cap = _constrained_md_ram_alloc.alloc(_ds_size(), Cache::UNCACHED);
vcpu.ds_addr = _region_map.attach(vcpu.ds_cap);
vcpu.ds_addr = _alloc_vm_data(_region_map.attach(vcpu.ds_cap));
} catch (...) {
if (vcpu.ds_cap.valid())
_constrained_md_ram_alloc.free(vcpu.ds_cap);

View File

@@ -0,0 +1,478 @@
/*
* \brief Client-side VM session interface
* \author Alexander Boettcher
* \author Benjamin Lamowski
* \date 2018-08-27
*/
/*
* Copyright (C) 2018-2023 Genode Labs GmbH
*
* This file is part of the Genode OS framework, which is distributed
* under the terms of the GNU Affero General Public License version 3.
*/
#include <base/allocator.h>
#include <base/attached_dataspace.h>
#include <base/env.h>
#include <base/registry.h>
#include <base/signal.h>
#include <base/internal/capability_space.h>
#include <virtualization/extended_vcpu_state.h>
#include <kernel/interface.h>
#include <vm_session/connection.h>
#include <vm_session/handler.h>
#include <hw_native_vcpu/hw_native_vcpu.h>
using namespace Genode;
using Exit_config = Vm_connection::Exit_config;
/****************************
** hw vCPU implementation **
****************************/
struct Hw_vcpu : Rpc_client<Vm_session::Native_vcpu>, Noncopyable
{
private:
Attached_dataspace _state;
Native_capability _kernel_vcpu { };
Vcpu_handler_base & _vcpu_handler;
Thread * _ep_handler { nullptr };
unsigned _id { 0 };
Vcpu_state _stashed_state { };
bool _need_state_update { false };
bool _extra_pause { false };
Vcpu_handler<Hw_vcpu> _wrapper;
void _wrapper_dispatch();
void _prepare_pause_exit();
void _update_charged_state(Vcpu_state & old_state, Vcpu_state & new_state);
Capability<Native_vcpu> _create_vcpu(Vm_connection &, Vcpu_handler_base &);
public:
const Hw_vcpu& operator=(const Hw_vcpu &) = delete;
Hw_vcpu(const Hw_vcpu&) = delete;
Hw_vcpu(Env &, Vm_connection &, Vcpu_handler_base &);
void run();
void pause();
Vm_state & state() { return *_state.local_addr<Vm_state>(); }
};
Hw_vcpu::Hw_vcpu(Env &env, Vm_connection &vm, Vcpu_handler_base &handler)
:
Rpc_client<Native_vcpu>(_create_vcpu(vm, handler)),
_state(env.rm(), vm.with_upgrade([&] () { return call<Rpc_state>(); })),
_vcpu_handler(handler),
_wrapper(handler.ep(), *this, &Hw_vcpu::_wrapper_dispatch)
{
static unsigned counter = 0;
call<Rpc_exception_handler>(_wrapper.signal_cap());
_kernel_vcpu = call<Rpc_native_vcpu>();
_id = counter++;
}
void Hw_vcpu::_wrapper_dispatch()
{
/*
* If this is running, the VM is not. Either it hasn't run yet, or it has
* been forced out and has written its state back.
*/
/*
* We run on the same EP as the original dispatch handler, which
* will call run() from its dispatch loop, so set _ep_handler here.
*/
if (!_ep_handler)
_ep_handler = Thread::myself();
int run_state = state().run_state.value();
/*
* In case the VMM dispatch method waits for a pause signal,
* we need a different state to make the pause() method
* send another signal.
*/
if (run_state == Vcpu_run_state::DISPATCHING)
state().run_state.set(Vcpu_run_state::DISPATCHED);
if (run_state == Vcpu_run_state::DISPATCHING_PAUSED)
state().run_state.set(Vcpu_run_state::PAUSING);
/*
* Dispatch the exit originating from the vCPU
*/
if (run_state == Vcpu_run_state::DISPATCHING ||
run_state == Vcpu_run_state::DISPATCHING_PAUSED ||
run_state == Vcpu_run_state::PAUSED) {
/* Call the VMM's dispatch method. */
_vcpu_handler.dispatch(1);
/*
* Dispatch will probably have called run(), but if the state was set
* to PAUSING it won't have.
*/
}
/*
* Dispatch a possibly folded-in pause signal.
* Note that we only check the local run_state against PAUSING.
* If the DISPATCHED state was changed to PAUSING in between, pause()
* has sent a new signal.
*/
if (run_state == Vcpu_run_state::PAUSING ||
run_state == Vcpu_run_state::DISPATCHING_PAUSED ||
_extra_pause) {
Kernel::pause_vm(Capability_space::capid(_kernel_vcpu));
_update_charged_state(_stashed_state, state());
/*
* Tell run() to get any stashed state from the original dispatch.
* Necessary because that state is discharged when the VMM
* dispatches and would be lost otherwise.
*/
_need_state_update = true;
_extra_pause = false;
_prepare_pause_exit();
state().run_state.set(Vcpu_run_state::PAUSED);
_vcpu_handler.dispatch(1);
}
}
void Hw_vcpu::run()
{
if (_need_state_update) {
_update_charged_state(state(), _stashed_state);
_stashed_state.discharge();
_need_state_update = false;
}
switch (state().run_state.value()) {
case Vcpu_run_state::STARTUP:
break;
case Vcpu_run_state::DISPATCHED:
if (_ep_handler != Thread::myself()) {
Genode::error("Vcpu (", _id, ") run: setting run from remote CPU unsupported");
return;
}
if (!state().run_state.cas(Vcpu_run_state::DISPATCHED,
Vcpu_run_state::RUNNABLE))
return; /* state changed to PAUSING */
break;
case Vcpu_run_state::PAUSED:
state().run_state.set(Vcpu_run_state::RUNNABLE);
/*
* It is the VMM's responsibility to be reasonable here.
* If Vcpu::run() is called asynchronously while the vCPU handler
* is still dispatching a request before a pause, this breaks.
*/
if (_ep_handler != Thread::myself())
Genode::warning("Vcpu (", _id, ") run: asynchronous call of run()");
break;
case Vcpu_run_state::PAUSING:
return;
default:
Genode::error("Vcpu (", _id, ") run: ignoring state ",
Genode::Hex(state().run_state.value()));
return;
}
Kernel::run_vm(Capability_space::capid(_kernel_vcpu));
}
void Hw_vcpu::pause()
{
switch (state().run_state.value()) {
/*
* Ignore pause requests before the vCPU has started up.
*/
case Vcpu_run_state::STARTUP:
return;
/*
* When a pause is requested while starting or dispatching, the original
* exit needs to be handled before a pause exit can be injected.
* In these two cases it may happen that the pause signal is folded in
* with the signal from the kernel; therefore we need to make sure that
* the wrapper prepares the pause exit anyway.
*/
case Vcpu_run_state::DISPATCHING:
if (!state().run_state.cas(Vcpu_run_state::DISPATCHING,
Vcpu_run_state::DISPATCHING_PAUSED))
pause(); /* moved on to DISPATCHED, retry */
return;
/*
* The vCPU could run anytime. Switch to RUN_ONCE to make the kernel
* exit and send a signal after running.
* If the state has changed, it must be to RUNNING; in that case retry
* the pause.
*/
case Vcpu_run_state::RUNNABLE:
if (!state().run_state.cas(Vcpu_run_state::RUNNABLE,
Vcpu_run_state::RUN_ONCE))
{
pause();
return;
}
_extra_pause = true;
return;
/*
* The vCPU may be running; signal that any interrupt exit is because it
* is forced out.
*
* If the CPU is already at the beginning of the exception handling,
* the handler will get two signals: whatever the normal exit would have
* been and the pause exit straight after, which is ok.
*
* If the state is written after it was already switched to
* INTERRUPTIBLE in the exit handler, we simply retry.
*/
case Vcpu_run_state::RUNNING:
if (_ep_handler == Thread::myself()) {
Genode::error("Vcpu (", _id, " ) pause: illegal state in line ", __LINE__ );
return;
};
if (!state().run_state.cas(Vcpu_run_state::RUNNING,
Vcpu_run_state::EXITING)) {
pause();
return;
}
break;
/*
* A pause request is received after the vCPU has already been forced out.
* In this case we need to write the state back first and send the
* signal later. If this comes from another thread then it may be
* interrupted after reading the state, while the vCPU thread starts
* RUNNING. Therefore if the swap fails, retry the pause().
*/
case Vcpu_run_state::INTERRUPTIBLE:
if (!state().run_state.cas(Vcpu_run_state::INTERRUPTIBLE,
Vcpu_run_state::SYNC_FROM_VCPU))
pause();
return;
/*
* A pause is requested while the VM has been dispatched.
* Send a new signal in case the VMM waits for a pause() exit
* before doing another run.
*/
case Vcpu_run_state::DISPATCHED:
if (!state().run_state.cas(Vcpu_run_state::DISPATCHED,
Vcpu_run_state::PAUSING)) {
pause();
return;
}
break;
/*
* We're already pausing or paused, ignore it.
*/
default:
return;
}
_wrapper.local_submit();
}
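Each contended case above follows the same cas-or-retry discipline: attempt the transition and, if another thread moved the state first, re-run pause() against the new state. A generic sketch of that pattern (the helper and its use of std::atomic are hypothetical, not code from this commit):

#include <atomic>

/* hypothetical helper modelling the cas-or-retry pattern used in pause() */
template <typename Retry>
bool transition_or_retry(std::atomic<int> &state, int from, int to, Retry &&retry)
{
    int expected = from;
    if (state.compare_exchange_strong(expected, to))
        return true; /* we won the race, the transition was applied */

    retry();         /* the state moved on, e.g. call pause() again */
    return false;
}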
/*
* Prepare a pause exit to dispatch to the VMM.
* Because we don't do a round trip to the kernel, we charge some state to
* keep Seoul happy.
*/
void Hw_vcpu::_prepare_pause_exit()
{
state().exit_reason = 0xFF;
state().ax.set_charged();
state().bx.set_charged();
state().cx.set_charged();
state().dx.set_charged();
state().bp.set_charged();
state().di.set_charged();
state().si.set_charged();
state().flags.set_charged();
state().sp.set_charged();
state().ip.set_charged();
state().ip_len.set_charged();
state().qual_primary.set_charged();
state().qual_secondary.set_charged();
state().intr_state.set_charged();
state().actv_state.set_charged();
state().inj_info.set_charged();
state().inj_error.set_charged();
}
/*
* Transfer charged fields from new_state to old_state, without touching
* fields that are already charged in old_state.
*/
void Hw_vcpu::_update_charged_state(Vcpu_state & old_state, Vcpu_state & new_state)
{
if (new_state.ax.charged() || new_state.cx.charged() ||
new_state.dx.charged() || new_state.bx.charged()) {
old_state.ax.update(new_state.ax.value());
old_state.cx.update(new_state.cx.value());
old_state.dx.update(new_state.dx.value());
old_state.bx.update(new_state.bx.value());
}
if (new_state.bp.charged() || new_state.di.charged() ||
new_state.si.charged()) {
old_state.bp.update(new_state.bp.value());
old_state.si.update(new_state.si.value());
old_state.di.update(new_state.di.value());
}
if (new_state.sp.charged()) {
old_state.sp.update(new_state.sp.value());
}
if (new_state.ip.charged()) {
old_state.ip.update(new_state.ip.value());
old_state.ip_len.update(new_state.ip_len.value());
}
if (new_state.flags.charged()) {
old_state.flags.update(new_state.flags.value());
}
if (new_state.es.charged() || new_state.ds.charged()) {
old_state.es.update(new_state.es.value());
old_state.ds.update(new_state.ds.value());
}
if (new_state.fs.charged() || new_state.gs.charged()) {
old_state.fs.update(new_state.fs.value());
old_state.gs.update(new_state.gs.value());
}
if (new_state.cs.charged() || new_state.ss.charged()) {
old_state.cs.update(new_state.cs.value());
old_state.ss.update(new_state.ss.value());
}
if (new_state.tr.charged()) {
old_state.tr.update(new_state.tr.value());
}
if (new_state.ldtr.charged()) {
old_state.ldtr.update(new_state.ldtr.value());
}
if (new_state.gdtr.charged()) {
old_state.gdtr.update(new_state.gdtr.value());
}
if (new_state.idtr.charged()) {
old_state.idtr.update(new_state.idtr.value());
}
if (new_state.cr0.charged() || new_state.cr2.charged() ||
new_state.cr3.charged() || new_state.cr4.charged()) {
old_state.cr0.update(new_state.cr0.value());
old_state.cr2.update(new_state.cr2.value());
old_state.cr3.update(new_state.cr3.value());
old_state.cr4.update(new_state.cr4.value());
}
if (new_state.dr7.charged()) {
old_state.dr7.update(new_state.dr7.value());
}
if (new_state.sysenter_cs.charged() || new_state.sysenter_sp.charged() ||
new_state.sysenter_ip.charged()) {
old_state.sysenter_ip.update(new_state.sysenter_ip.value());
old_state.sysenter_sp.update(new_state.sysenter_sp.value());
old_state.sysenter_cs.update(new_state.sysenter_cs.value());
}
if (new_state.ctrl_primary.charged() ||
new_state.ctrl_secondary.charged()) {
old_state.ctrl_primary.update(new_state.ctrl_primary.value());
old_state.ctrl_secondary.update(new_state.ctrl_secondary.value());
}
if (new_state.inj_info.charged() || new_state.inj_error.charged()) {
old_state.inj_info.update(new_state.inj_info.value());
old_state.inj_error.update(new_state.inj_error.value());
}
if (new_state.intr_state.charged() || new_state.actv_state.charged()) {
old_state.intr_state.update(new_state.intr_state.value());
old_state.actv_state.update(new_state.actv_state.value());
}
if (new_state.tsc_offset.charged()) {
old_state.tsc.update(new_state.tsc.value());
old_state.tsc_offset.update(new_state.tsc_offset.value());
old_state.tsc_aux.update(new_state.tsc_aux.value());
}
if (new_state.efer.charged()) {
old_state.efer.update(new_state.efer.value());
}
if (new_state.pdpte_0.charged() || new_state.pdpte_1.charged() ||
new_state.pdpte_2.charged() || new_state.pdpte_3.charged()) {
old_state.pdpte_0.update(new_state.pdpte_0.value());
old_state.pdpte_1.update(new_state.pdpte_1.value());
old_state.pdpte_2.update(new_state.pdpte_2.value());
old_state.pdpte_3.update(new_state.pdpte_3.value());
}
if (new_state.r8 .charged() || new_state.r9 .charged() ||
new_state.r10.charged() || new_state.r11.charged() ||
new_state.r12.charged() || new_state.r13.charged() ||
new_state.r14.charged() || new_state.r15.charged()) {
old_state.r8.update(new_state.r8.value());
old_state.r9.update(new_state.r9.value());
old_state.r10.update(new_state.r10.value());
old_state.r11.update(new_state.r11.value());
old_state.r12.update(new_state.r12.value());
old_state.r13.update(new_state.r13.value());
old_state.r14.update(new_state.r14.value());
old_state.r15.update(new_state.r15.value());
}
if (new_state.star .charged() || new_state.lstar.charged() ||
new_state.cstar.charged() || new_state.fmask.charged() ||
new_state.kernel_gs_base.charged()) {
old_state.star.update(new_state.star.value());
old_state.lstar.update(new_state.lstar.value());
old_state.cstar.update(new_state.cstar.value());
old_state.fmask.update(new_state.fmask.value());
old_state.kernel_gs_base.update(new_state.kernel_gs_base.value());
}
if (new_state.tpr.charged() || new_state.tpr_threshold.charged()) {
old_state.tpr.update(new_state.tpr.value());
old_state.tpr_threshold.update(new_state.tpr_threshold.value());
}
}
Capability<Vm_session::Native_vcpu> Hw_vcpu::_create_vcpu(Vm_connection &vm,
Vcpu_handler_base &handler)
{
Thread &tep { *reinterpret_cast<Thread *>(&handler.rpc_ep()) };
return vm.with_upgrade([&] () {
return vm.call<Vm_session::Rpc_create_vcpu>(tep.cap()); });
}
/**************
** vCPU API **
**************/
void Vm_connection::Vcpu::run() { static_cast<Hw_vcpu &>(_native_vcpu).run(); }
void Vm_connection::Vcpu::pause() { static_cast<Hw_vcpu &>(_native_vcpu).pause(); }
Vcpu_state & Vm_connection::Vcpu::state() { return static_cast<Hw_vcpu &>(_native_vcpu).state(); }
Vm_connection::Vcpu::Vcpu(Vm_connection &vm, Allocator &alloc,
Vcpu_handler_base &handler, Exit_config const &)
:
_native_vcpu(*new (alloc) Hw_vcpu(vm._env, vm, handler))
{ }
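For context, a hedged sketch of how a VMM would drive this vCPU API, based only on the constructor signatures visible above (the My_vmm class, its member layout, and the empty Exit_config are assumptions, not code from this commit):

#include <base/env.h>
#include <base/heap.h>
#include <vm_session/connection.h>
#include <vm_session/handler.h>

struct My_vmm
{
    Genode::Env                  &env;
    Genode::Heap                  heap    { env.ram(), env.rm() };
    Genode::Vm_connection         vm      { env };
    Genode::Vcpu_handler<My_vmm>  handler { env.ep(), *this, &My_vmm::_handle };
    Genode::Vm_connection::Vcpu   vcpu    { vm, heap, handler, { } };

    /* invoked for the startup exit, guest exits, and injected pause exits */
    void _handle()
    {
        /* inspect vcpu.state(), charge any modified items, then resume */
        vcpu.run();
    }

    My_vmm(Genode::Env &env) : env(env) { }
};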