diff --git a/.linkcheck-ignore.yml b/.linkcheck-ignore.yml index 2bb72c4fe..6235362d0 100644 --- a/.linkcheck-ignore.yml +++ b/.linkcheck-ignore.yml @@ -1,2 +1,3 @@ urls: - "https://developer.arm.com/downloads/*" + - "https://cdrdv2.intel.com/*" diff --git a/docs/manual.md b/docs/manual.md index 5b9c3f1cd..8227c7cb6 100644 --- a/docs/manual.md +++ b/docs/manual.md @@ -637,7 +637,10 @@ as to what fault occurred. The `reply_msginfo` argument is given by libmicrokit and can be used to reply to the fault. The returned `seL4_Bool` is whether or not to reply to the fault with the message `reply_msginfo`. -Returning `seL4_True` will reply to the fault. Returning `seL4_False` will not reply to the fault. +Returning `seL4_True` will reply to the fault and resume the child PD or vCPU at the fault restart program counter. +For more details, please see the "Faults" section of the [seL4 Manual](https://sel4.systems/Info/Docs/seL4-manual-latest.pdf). + +Returning `seL4_False` will not reply to the fault and causes the child PD or vCPU to no longer run. You can use `microkit_msginfo_get_label` on `msginfo` to deduce what kind of fault happened (for example, whether it was a user exception or a virtual memory fault). @@ -649,6 +652,43 @@ To find the full list of possible faults that could occur and details regarding kind of fault, please see the 'Faults' section of the [seL4 reference manual](https://sel4.systems/Info/Docs/seL4-manual-latest.pdf). +### x86 VCPU fault +Please see the 'VMX BASIC EXIT REASONS' section of the +[Intel® 64 and IA-32 Architectures Software Developer’s Manual Combined Volumes: 1, 2A, 2B, 2C, 2D, 3A, 3B, 3C, 3D, and 4] +(https://cdrdv2.intel.com/v1/dl/getContent/671200) for a list of possible VM Exit reasons. + +These message registers contain data relating to the VM Exit: +- `SEL4_VMENTER_CALL_EIP_MR`: Instruction Pointer, +- `SEL4_VMENTER_CALL_CONTROL_PPC_MR`: Primary Processor Based VM Execution Controls, +- `SEL4_VMENTER_CALL_INTERRUPT_INFO_MR`: VM Entry Interruption-Information, +- `SEL4_VMENTER_FAULT_REASON_MR`: VM Exit reason, +- `SEL4_VMENTER_FAULT_QUALIFICATION_MR`: VM Exit qualification, +- `SEL4_VMENTER_FAULT_INSTRUCTION_LEN_MR`: Length of instruction that caused the VM Exit, +- `SEL4_VMENTER_FAULT_GUEST_PHYSICAL_MR`: Guest Physical Address of the VM Exit, +- `SEL4_VMENTER_FAULT_RFLAGS_MR`: Guest FLAGS register, +- `SEL4_VMENTER_FAULT_GUEST_INT_MR`: Guest interruptability, +- `SEL4_VMENTER_FAULT_CR3_MR`: Guest CR3. + +Some of these message registers may not contain valid data depending on the VM Exit reason, +please consult the Intel SDM for more details. + +These message registers contain the guest general purpose registers at the time of VM Exit: +- `SEL4_VMENTER_FAULT_EAX` +- `SEL4_VMENTER_FAULT_EBX` +- `SEL4_VMENTER_FAULT_ECX` +- `SEL4_VMENTER_FAULT_EDX` +- `SEL4_VMENTER_FAULT_ESI` +- `SEL4_VMENTER_FAULT_EDI` +- `SEL4_VMENTER_FAULT_EBP` +- `SEL4_VMENTER_FAULT_R8` +- `SEL4_VMENTER_FAULT_R9` +- `SEL4_VMENTER_FAULT_R10` +- `SEL4_VMENTER_FAULT_R11` +- `SEL4_VMENTER_FAULT_R12` +- `SEL4_VMENTER_FAULT_R13` +- `SEL4_VMENTER_FAULT_R14` +- `SEL4_VMENTER_FAULT_R15` + ## `microkit_msginfo microkit_ppcall(microkit_channel ch, microkit_msginfo msginfo)` Performs a call to a protected procedure in a different PD. @@ -826,6 +866,22 @@ virtual CPU with ID `vcpu`. Write the registers of a given virtual CPU with ID `vcpu`. The `regs` argument is the pointer to the struct of registers `seL4_VCPUContext` that are written from. +## `void microkit_vcpu_x86_on(void)` + +Start running the vCPU bound to this PD. + +The vCPU runs after every return from a Microkit entrypoint; it does not run concurrently with the native thread. + +The kernel performs minimal setup of the vCPU's architectural state, configuring only the hardware-mandated +fixed bits and setting VMCS fields required to safely context-switch between VMX root and non-root operations. +The caller is responsible for initialising all other architectural state such as the instruction pointer using +`microkit_vcpu_x86_write_vmcs()`, in accordance with the +[Intel SDM](https://cdrdv2.intel.com/v1/dl/getContent/671200). + +## `void microkit_vcpu_x86_off(void)` + +Stop the PD from switching to guest execution mode when a Microkit entrypoint returns. + ## `seL4_CPtr microkit_cspace_root_slot_to_cptr(seL4_Word slot)` {#libmicrokit_cspace_root_slot_to_cptr} Converts the slot identifier of the ``'s capability element into an @@ -948,6 +1004,7 @@ The `virtual_machine` element has the following attributes: Additionally, it supports the following child elements: * `vcpu`: (one or more) Describes the virtual CPU that will be tied to the virtual machine. + * On x86-64, there is a limit of one VCPU per PD. * `map`: (zero or more) Describes mapping of memory regions into the virtual machine. The `vcpu` element has the following attributes: @@ -1027,6 +1084,7 @@ The `end` element has the following attributes: * `id`: Channel identifier in the context of the named protection domain. Must be at least 0 and less than 63. * `pp`: (optional) Indicates that the protection domain for this end can perform a protected procedure call to the other end; defaults to false. Protected procedure calls can only be to PDs of strictly higher priority. + On x86-64, PDs with virtual machines cannot receive protected procedure calls. * `notify`: (optional) Indicates that the protection domain for this end can send a notification to the other end; defaults to true. * `setvar_id`: (optional) Specifies a symbol in the program image. This symbol will be rewritten with the channel identifier. diff --git a/libmicrokit/include/microkit.h b/libmicrokit/include/microkit.h index 5c6b76e0b..c25550f76 100644 --- a/libmicrokit/include/microkit.h +++ b/libmicrokit/include/microkit.h @@ -9,7 +9,11 @@ #pragma once +#include #include +#ifdef CONFIG_VTX +#include +#endif /* CONFIG_VTX */ typedef unsigned int microkit_channel; typedef unsigned int microkit_child; @@ -47,6 +51,15 @@ extern char microkit_name[MICROKIT_PD_NAME_LENGTH]; extern seL4_Bool microkit_have_signal; extern seL4_CPtr microkit_signal_cap; extern seL4_MessageInfo_t microkit_signal_msg; +#if defined(CONFIG_VTX) +struct microkit_x86_vcpu_state { + seL4_Bool is_on; + seL4_Word rip; + seL4_Word prim_proc_ctl; + seL4_Word irq_info; +}; +extern struct microkit_x86_vcpu_state microkit_x86_vcpu_state; +#endif /* Symbols for error checking libmicrokit API calls. Patched by the Microkit tool * to set bits corresponding to valid channels for this PD. */ @@ -190,13 +203,8 @@ static inline void microkit_vcpu_restart(microkit_child vcpu, seL4_Word entry_po { seL4_Error err; seL4_UserContext ctxt = {0}; -#if defined(CONFIG_ARCH_AARCH64) ctxt.pc = entry_point; -#elif defined(CONFIG_ARCH_X86_64) - ctxt.rip = entry_point; -#else -#error "unknown architecture for 'microkit_vcpu_restart'" -#endif + err = seL4_TCB_WriteRegisters( BASE_VM_TCB_CAP + vcpu, seL4_True, @@ -220,9 +228,7 @@ static inline void microkit_vcpu_stop(microkit_child vcpu) microkit_internal_crash(err); } } -#endif -#if defined(CONFIG_ARM_HYPERVISOR_SUPPORT) static inline void microkit_vcpu_arm_inject_irq(microkit_child vcpu, seL4_Uint16 irq, seL4_Uint8 priority, seL4_Uint8 group, seL4_Uint8 index) { @@ -265,7 +271,7 @@ static inline void microkit_vcpu_arm_write_reg(microkit_child vcpu, seL4_Word re microkit_internal_crash(err); } } -#endif +#endif /* CONFIG_ARM_HYPERVISOR_SUPPORT */ #if defined(CONFIG_ALLOW_SMC_CALLS) static inline void microkit_arm_smc_call(seL4_ARM_SMCContext *args, seL4_ARM_SMCContext *response) @@ -277,7 +283,7 @@ static inline void microkit_arm_smc_call(seL4_ARM_SMCContext *args, seL4_ARM_SMC microkit_internal_crash(err); } } -#endif +#endif /* CONFIG_ALLOW_SMC_CALLS */ #if defined(CONFIG_ARCH_X86_64) static inline void microkit_x86_ioport_write_8(microkit_ioport ioport_id, seL4_Word port_addr, seL4_Word data) @@ -393,28 +399,86 @@ static inline seL4_Uint32 microkit_x86_ioport_read_32(microkit_ioport ioport_id, return ret.result; } -#endif -#if defined(CONFIG_ARCH_X86_64) && defined(CONFIG_VTX) +#if defined(CONFIG_VTX) +/* Architecturally defined identifiers for a x86 VCPU's VMCS fields, + * see seL4 source: `include/arch/x86/arch/object/vcpu.h` + * or Intel® 64 and IA-32 Architectures Software Developer’s Manual + * Combined Volumes: 1, 2A, 2B, 2C, 2D, 3A, 3B, 3C, 3D, and 4 + * Order Number: 325462-084US June 2024 + * "Table B-8. Encodings for 32-Bit Control Fields (0100_00xx_xxxx_xxx0B)" and + * "Table B-14. Encodings for Natural-Width Guest-State Fields (0110_10xx_xxxx_xxx0B) (Contd.)". + * */ +#define VMX_GUEST_RIP 0x0000681E +#define VMX_CONTROL_PRIMARY_PROCESSOR_CONTROLS 0x00004002 +#define VMX_CONTROL_ENTRY_INTERRUPTION_INFO 0x00004016 + static inline seL4_Word microkit_vcpu_x86_read_vmcs(microkit_child vcpu, seL4_Word field) { - seL4_X86_VCPU_ReadVMCS_t ret; - ret = seL4_X86_VCPU_ReadVMCS(BASE_VCPU_CAP + vcpu, field); - if (ret.error != seL4_NoError) { - microkit_dbg_puts("microkit_x86_read_vmcs: error reading data\n"); - microkit_internal_crash(ret.error); + /* This function assumes that a PD would only have access to 1 VCPU object. + * For these fields: + * - Guest RIP, + * - Primary Processor Based VM Execution Controls, and + * - VM Entry Interruption-Information + * they will be read from local variables that were written + * to by `microkit_vcpu_x86_write_vmcs()`. The kernel will service + * reading from other fields. + */ + + seL4_Word value; + switch (field) { + case VMX_GUEST_RIP: + value = microkit_x86_vcpu_state.rip; + break; + case VMX_CONTROL_PRIMARY_PROCESSOR_CONTROLS: + value = microkit_x86_vcpu_state.prim_proc_ctl; + break; + case VMX_CONTROL_ENTRY_INTERRUPTION_INFO: + value = microkit_x86_vcpu_state.irq_info; + break; + default: { + seL4_X86_VCPU_ReadVMCS_t ret = seL4_X86_VCPU_ReadVMCS(BASE_VCPU_CAP + vcpu, field); + if (ret.error != seL4_NoError) { + microkit_dbg_puts("microkit_x86_read_vmcs: error reading data\n"); + microkit_internal_crash(ret.error); + } + value = ret.value; + break; + } } - return ret.value; + return value; } static inline void microkit_vcpu_x86_write_vmcs(microkit_child vcpu, seL4_Word field, seL4_Word value) { - seL4_X86_VCPU_WriteVMCS_t ret; - ret = seL4_X86_VCPU_WriteVMCS(BASE_VCPU_CAP + vcpu, field, value); - if (ret.error != seL4_NoError) { - microkit_dbg_puts("microkit_x86_write_vmcs: error writing data\n"); - microkit_internal_crash(ret.error); + /* Assumes that a PD would only have access to 1 VCPU object. + * These fields will be written to local variables rather than the actual VMCS: + * - RIP, + * - Primary Processor Based VM Execution Controls, and + * - VM Entry Interruption-Information. + * Because they will be passed to the kernel on `seL4_VMEnter()`, + * then the kernel will write them to the VMCS for us. + * Writing other VMCS fields will go to the real VMCS immediately via a syscall. + */ + switch (field) { + case VMX_GUEST_RIP: + microkit_x86_vcpu_state.rip = value; + break; + case VMX_CONTROL_PRIMARY_PROCESSOR_CONTROLS: + microkit_x86_vcpu_state.prim_proc_ctl = value; + break; + case VMX_CONTROL_ENTRY_INTERRUPTION_INFO: + microkit_x86_vcpu_state.irq_info = value; + break; + default: { + seL4_X86_VCPU_WriteVMCS_t ret = seL4_X86_VCPU_WriteVMCS(BASE_VCPU_CAP + vcpu, field, value); + if (ret.error != seL4_NoError) { + microkit_dbg_puts("microkit_x86_write_vmcs: error writing data\n"); + microkit_internal_crash(ret.error); + } + break; + } } } @@ -479,7 +543,20 @@ static inline void microkit_vcpu_x86_write_regs(microkit_child vcpu, seL4_VCPUCo } } -#endif +static inline void microkit_vcpu_x86_on(void) +{ + /* On x86, a TCB can only have one bound VCPU at any given time. + * So we don't take a `microkit_child vcpu` here. */ + microkit_x86_vcpu_state.is_on = true; +} + +static inline void microkit_vcpu_x86_off(void) +{ + microkit_x86_vcpu_state.is_on = false; +} + +#endif /* CONFIG_VTX */ +#endif /* CONFIG_ARCH_X86_64 */ static inline void microkit_deferred_notify(microkit_channel ch) { diff --git a/libmicrokit/src/main.c b/libmicrokit/src/main.c index 347ab0e23..55d6c38b6 100644 --- a/libmicrokit/src/main.c +++ b/libmicrokit/src/main.c @@ -17,6 +17,9 @@ #define PD_MASK 0xff #define CHANNEL_MASK 0x3f +#define BADGE_FAULT_BIT 62 +#define BADGE_ENDPOINT_BIT 63 + /* All globals are prefixed with microkit_* to avoid clashes with user defined globals. */ bool microkit_passive; @@ -27,6 +30,10 @@ seL4_Bool microkit_have_signal = seL4_False; seL4_CPtr microkit_signal_cap; seL4_MessageInfo_t microkit_signal_msg; +#if defined(CONFIG_VTX) +struct microkit_x86_vcpu_state microkit_x86_vcpu_state; +#endif /* CONFIG_VTX */ + seL4_Word microkit_irqs; seL4_Word microkit_notifications; seL4_Word microkit_pps; @@ -71,10 +78,46 @@ static void deferred_flush(void) } } +#if defined(CONFIG_VTX) +static seL4_MessageInfo_t x86_vcpu_resume(seL4_Word *badge) +{ + /* There is no seL4 invocation which combines a VMEnter and a non-blocking send. + * Thus, we must perform any deferred signals from `microkit_deferred_notify()` / + * `microkit_deferred_irq_ack()` before invoking VMEnter. */ + deferred_flush(); + + seL4_Word is_fault, fault_reason; + + x64_sys_send_recv(seL4_SysVMEnter, 0, badge, 0, &is_fault, + µkit_x86_vcpu_state.rip, µkit_x86_vcpu_state.prim_proc_ctl, µkit_x86_vcpu_state.irq_info, &fault_reason, + 0); + /* We want to follow the documented kernel behaviour where these 4 values are always populated in the message registers. + * However, due to `setMR()`'s behaviour in include/object/tcb.h of the kernel, these 4 values will only be set in + * CPU registers. So we need to copy them to the IPC buffer to conform to the documented kernel behaviour. + * + * The other reason why we want to manage these values is so that the user does not have to worry about saving/updating them + * when servicing a notification. */ + microkit_mr_set(SEL4_VMENTER_CALL_EIP_MR, microkit_x86_vcpu_state.rip); + microkit_mr_set(SEL4_VMENTER_CALL_CONTROL_PPC_MR, microkit_x86_vcpu_state.prim_proc_ctl); + microkit_mr_set(SEL4_VMENTER_CALL_INTERRUPT_INFO_MR, microkit_x86_vcpu_state.irq_info); + /* The 4th value (the exit reason) is only valid when fault is true. */ + + /* Create a dummy msgInfo so that we can call `fault()`, as on x86 a VMExit is not an IPC as on other architectures. */ + if (is_fault) { + microkit_mr_set(SEL4_VMENTER_FAULT_REASON_MR, fault_reason); + *badge |= 1ull << BADGE_FAULT_BIT; + return seL4_MessageInfo_new(0, 0, 0, SEL4_VMENTER_NUM_FAULT_MSGS); + } else { + /* VCPU got interrupted due to a notification, no msgInfo. */ + return seL4_MessageInfo_new(0, 0, 0, 0); + } +} +#endif + static void handler_loop(void) { bool have_reply = false; - seL4_MessageInfo_t reply_tag; + seL4_MessageInfo_t reply_tag = seL4_MessageInfo_new(0, 0, 0, 0); /** * Because of https://github.com/seL4/seL4/issues/1536 @@ -97,7 +140,16 @@ static void handler_loop(void) seL4_Word badge; seL4_MessageInfo_t tag; +#if defined(CONFIG_VTX) + if (microkit_x86_vcpu_state.is_on) { + /* We should never have a reply message from the `protected()` endpoint, + * as on x86 a PD with a bound vCPU cannot receive PPCs.*/ + // assert(!have_reply); + tag = x86_vcpu_resume(&badge); + } else if (have_reply) { +#else if (have_reply) { +#endif deferred_flush(); tag = seL4_ReplyRecv(INPUT_CAP, reply_tag, &badge, REPLY_CAP); } else if (microkit_have_signal) { @@ -107,13 +159,21 @@ static void handler_loop(void) tag = seL4_Recv(INPUT_CAP, &badge, REPLY_CAP); } - uint64_t is_endpoint = badge >> 63; - uint64_t is_fault = (badge >> 62) & 1; + uint64_t is_endpoint = badge >> BADGE_ENDPOINT_BIT; + uint64_t is_fault = (badge >> BADGE_FAULT_BIT) & 1; have_reply = false; if (is_fault) { seL4_Bool reply_to_fault = fault(badge & PD_MASK, tag, &reply_tag); +#if defined(CONFIG_VTX) + /* If fault() returns false then we shouldn't resume the VCPU. */ + if (!reply_to_fault) { + microkit_x86_vcpu_state.is_on = false; + } + /* There won't be anything to reply to for a VCPU fault. */ + reply_to_fault = seL4_False; +#endif if (reply_to_fault) { have_reply = true; } diff --git a/tool/microkit/src/sdf.rs b/tool/microkit/src/sdf.rs index 365650100..d6e25af05 100644 --- a/tool/microkit/src/sdf.rs +++ b/tool/microkit/src/sdf.rs @@ -2071,6 +2071,18 @@ pub fn parse( } vms.push(&vm.name); } + + if pd.has_children { + // When seL4_VMEnter() is called, the kernel only checks the VMM's bound + // notification for pending signals. Because the endpoint object isn't passed + // or checked for pending messages, Child PDs won't work while the VCPU is on. + // Technically, Child PDs could still work when the VCPU is off, but we shouldn't + // expose this footgun to users. + return Err(format!( + "Error: It is not possible for PD '{}' with a bound vCPU to have children on x86_64: {}", + pd.name(), + loc_string(&xml_sdf, pd.text_pos.unwrap()))); + } } // Ensure no duplicate IRQs @@ -2151,6 +2163,21 @@ pub fn parse( )); } + if config.arch == Arch::X86_64 + && ((ch.end_a.pp && pd_b.virtual_machine.is_some()) + || (ch.end_b.pp && pd_a.virtual_machine.is_some())) + { + // Same cause as child PD above + return Err(format!( + "Error: It is not possible to PPC to PD '{}' with a bound vCPU from PD '{}' on x86_64 @ {}:{}:{}", + if ch.end_a.pp {&pd_b.name} else {&pd_a.name}, + if ch.end_a.pp {&pd_a.name} else {&pd_b.name}, + filename.display(), + pd_a.text_pos.unwrap().row, + pd_a.text_pos.unwrap().col + )); + } + ch_ids[ch.end_a.pd].push(ch.end_a.id); ch_ids[ch.end_b.pd].push(ch.end_b.id); } diff --git a/tool/microkit/tests/sdf/vm_x86_ppc_invalid_1.system b/tool/microkit/tests/sdf/vm_x86_ppc_invalid_1.system new file mode 100644 index 000000000..f1fab020a --- /dev/null +++ b/tool/microkit/tests/sdf/vm_x86_ppc_invalid_1.system @@ -0,0 +1,39 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tool/microkit/tests/sdf/vm_x86_ppc_invalid_2.system b/tool/microkit/tests/sdf/vm_x86_ppc_invalid_2.system new file mode 100644 index 000000000..db763e43e --- /dev/null +++ b/tool/microkit/tests/sdf/vm_x86_ppc_invalid_2.system @@ -0,0 +1,39 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tool/microkit/tests/sdf/vm_x86_ppc_valid.system b/tool/microkit/tests/sdf/vm_x86_ppc_valid.system new file mode 100644 index 000000000..66d0faa1f --- /dev/null +++ b/tool/microkit/tests/sdf/vm_x86_ppc_valid.system @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tool/microkit/tests/sdf/vm_x86_vm_with_child_invalid.system b/tool/microkit/tests/sdf/vm_x86_vm_with_child_invalid.system new file mode 100644 index 000000000..682f1ef58 --- /dev/null +++ b/tool/microkit/tests/sdf/vm_x86_vm_with_child_invalid.system @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tool/microkit/tests/test.rs b/tool/microkit/tests/test.rs index 4a25465b3..1c5046ebb 100644 --- a/tool/microkit/tests/test.rs +++ b/tool/microkit/tests/test.rs @@ -761,6 +761,38 @@ mod virtual_machine { "Error: invalid memory region name 'mr1' on 'map' @", ) } + + #[test] + fn test_x86_ppc_invalid_1() { + check_error( + &DEFAULT_X86_64_KERNEL_CONFIG, + "vm_x86_ppc_invalid_1.system", + "Error: It is not possible to PPC to PD 'VMM' with a bound vCPU from PD 'some_service' on x86_64", + ) + } + + #[test] + fn test_x86_ppc_invalid_2() { + check_error( + &DEFAULT_X86_64_KERNEL_CONFIG, + "vm_x86_ppc_invalid_2.system", + "Error: It is not possible to PPC to PD 'VMM' with a bound vCPU from PD 'some_service' on x86_64", + ) + } + + #[test] + fn test_x86_ppc_valid() { + check_success(&DEFAULT_X86_64_KERNEL_CONFIG, "vm_x86_ppc_valid.system") + } + + #[test] + fn test_vm_x86_vm_with_child_invalid() { + check_error( + &DEFAULT_X86_64_KERNEL_CONFIG, + "vm_x86_vm_with_child_invalid.system", + "Error: It is not possible for PD 'VMM' with a bound vCPU to have children on x86_64", + ) + } } #[cfg(test)]