hyperv-next for v6.19
-----BEGIN PGP SIGNATURE-----
iQFHBAABCgAxFiEEIbPD0id6easf0xsudhRwX5BBoF4FAmk2b0ITHHdlaS5saXVA
a2VybmVsLm9yZwAKCRB2FHBfkEGgXkefCACpUWTK0U0i47hXT+s4aA0T3sq6V3/T
+su9WnT3GPQ3BuRCRk51w6u9ADYt1EXtu8gRwq/wZiES9PJtz+9DmNuLT8nkkHXH
exbaRIBAiwLGg6QFC2VpbQzeHLp7qeko0MsLWyMiVPkw+lw9QPqcLKVEWuzPZfOn
UCkPB+XpzZg9Ft4vKRjXLyUMpwKzkqJw/aiXMfwonuaelcrzLw0hkzO3/I+eKRHv
JKxaHCwLgrPZyGCJpWtwiLxgu0DKLeDDhj0WSqDz/kUNhjo/GEshLA25UQJUdzI0
O+tFN9my7SZSYtq7fGoyfo16mAsLaXh0oYuwP8UnR4CDm4UF4JB4QTsM
=laZR
-----END PGP SIGNATURE-----
Merge tag 'hyperv-next-signed-20251207' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux
Pull hyperv updates from Wei Liu:
- Enhancements to Linux as the root partition for Microsoft Hypervisor:
- Support a new mode called L1VH, which allows Linux to drive the
hypervisor running the Azure Host directly
- Support for MSHV crash dump collection
- Allow Linux's memory management subsystem to better manage guest
memory regions
- Fix issues that prevented a clean shutdown of the whole system on
bare metal and nested configurations
- ARM64 support for the MSHV driver
- Various other bug fixes and cleanups
- Add support for Confidential VMBus for Linux guest on Hyper-V
- Secure AVIC support for Linux guests on Hyper-V
- Add the mshv_vtl driver to allow Linux to run as the secure kernel in
a higher virtual trust level for Hyper-V
* tag 'hyperv-next-signed-20251207' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (58 commits)
mshv: Cleanly shutdown root partition with MSHV
mshv: Use reboot notifier to configure sleep state
mshv: Add definitions for MSHV sleep state configuration
mshv: Add support for movable memory regions
mshv: Add refcount and locking to mem regions
mshv: Fix huge page handling in memory region traversal
mshv: Move region management to mshv_regions.c
mshv: Centralize guest memory region destruction
mshv: Refactor and rename memory region handling functions
mshv: adjust interrupt control structure for ARM64
Drivers: hv: use kmalloc_array() instead of kmalloc()
mshv: Add ioctl for self targeted passthrough hvcalls
Drivers: hv: Introduce mshv_vtl driver
Drivers: hv: Export some symbols for mshv_vtl
static_call: allow using STATIC_CALL_TRAMP_STR() from assembly
mshv: Extend create partition ioctl to support cpu features
mshv: Allow mappings that overlap in uaddr
mshv: Fix create memory region overlap check
mshv: add WQ_PERCPU to alloc_workqueue users
Drivers: hv: Use kmalloc_array() instead of kmalloc()
...
@@ -178,7 +178,7 @@ These Hyper-V and VMBus memory pages are marked as decrypted:

* VMBus monitor pages

-* Synthetic interrupt controller (synic) related pages (unless supplied by
+* Synthetic interrupt controller (SynIC) related pages (unless supplied by
  the paravisor)

* Per-cpu hypercall input and output pages (unless running with a paravisor)

@@ -232,6 +232,143 @@ with arguments explicitly describing the access. See
_hv_pcifront_read_config() and _hv_pcifront_write_config() and the
"use_calls" flag indicating to use hypercalls.

Confidential VMBus
------------------
Confidential VMBus enables the confidential guest to avoid interacting with
the untrusted host partition and the untrusted hypervisor. Instead, the guest
relies on the trusted paravisor to communicate with the devices processing
sensitive data. The hardware (SNP or TDX) encrypts the guest memory and the
register state while measuring the paravisor image using the platform security
processor to ensure trusted and confidential computing.

Confidential VMBus provides a secure communication channel between the guest
and the paravisor, ensuring that sensitive data is protected from hypervisor-
level access through memory encryption and register state isolation.

Confidential VMBus is an extension of Confidential Computing (CoCo) VMs
(a.k.a. "Isolated" VMs in Hyper-V terminology). Without Confidential VMBus,
guest VMBus device drivers (the "VSC"s in VMBus terminology) communicate
with VMBus servers (the VSPs) running on the Hyper-V host. The
communication must be through memory that has been decrypted so the
host can access it. With Confidential VMBus, one or more of the VSPs reside
in the trusted paravisor layer in the guest VM. Since the paravisor layer also
operates in encrypted memory, the memory used for communication with
such VSPs does not need to be decrypted and thereby exposed to the
Hyper-V host. The paravisor is responsible for communicating securely
with the Hyper-V host as necessary.

The data is transferred directly between the VM and a vPCI device (a.k.a.
a PCI pass-thru device, see :doc:`vpci`) that is directly assigned to VTL2
and that supports encrypted memory. In such a case, neither the host partition
nor the hypervisor has any access to the data. The guest needs to establish
a VMBus connection only with the paravisor for the channels that process
sensitive data, and the paravisor abstracts away the details of communicating
with the specific devices, providing the guest with the well-established
VSP (Virtual Service Provider) interface that has had support in the Hyper-V
drivers for a decade.

If the device does not support encrypted memory, the paravisor provides
bounce buffering, and although the data is not encrypted, the backing
pages aren't mapped into the host partition through SLAT. While not impossible,
it becomes much more difficult for the host partition to exfiltrate the data
than it would be with a conventional VMBus connection where the host partition
has direct access to the memory used for communication.

Here is the data flow for a conventional VMBus connection (`C` stands for the
client or VSC, `S` for the server or VSP, and the `DEVICE` is a physical
device, possibly with multiple virtual functions)::

   +---- GUEST ----+      +----- DEVICE ----+      +----- HOST -----+
   |               |      |                 |      |                |
   |               |      |                 |      |                |
   |               |      |           ====================          |
   |               |      |                 |      |                |
   |               |      |                 |      |                |
   |               |      |                 |      |                |
   +----- C -------+      +-----------------+      +------- S ------+
          ||                                                ||
          ||                                                ||
   +------||-------------------- VMBus ---------------------||------+
   |                        Interrupts, MMIO                        |
   +----------------------------------------------------------------+

and the Confidential VMBus connection::

   +---- GUEST --------------- VTL0 ------+        +-- DEVICE --+
   |                                       |        |            |
   |  +- PARAVISOR --------- VTL2 -----+  |        |            |
   |  |     +-- VMBus Relay ------+ ====+================       |
   |  |     |  Interrupts, MMIO   |    |  |        |            |
   |  |     +-------- S ----------+    |  |        +------------+
   |  |               ||               |  |
   |  +---------+     ||               |  |
   |  |  Linux  |     || OpenHCL      |  |
   |  |  kernel |     ||               |  |
   |  +---- C --+-----||---------------+  |
   |       ||         ||                  |
   +-------++-------- C ------------------+        +------------+
           ||                                      |    HOST    |
           ||                                      +---- S -----+
   +-------||------------------ VMBus -------------------||------------+
   |                         Interrupts, MMIO                          |
   +-------------------------------------------------------------------+

An implementation of the VMBus relay that offers the Confidential VMBus
channels is available in the OpenVMM project as a part of the OpenHCL
paravisor. Please refer to

* https://openvmm.dev/, and
* https://github.com/microsoft/openvmm

for more information about the OpenHCL paravisor.

A guest that is running with a paravisor must determine at runtime if
Confidential VMBus is supported by the current paravisor. The x86_64-specific
approach relies on the CPUID Virtualization Stack leaf; the ARM64
implementation is expected to support Confidential VMBus unconditionally when
running ARM CCA guests.

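On x86_64 this probe reduces to reading the virtualization-stack properties
CPUID leaf; the check added to ms_hyperv_init_platform() later in this series
is essentially::

	/*
	 * These leaves are handled by the Windows virtualization stack or
	 * the paravisor via intercepts, not by the hypervisor itself.
	 */
	eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
	ms_hyperv.confidential_vmbus_available =
		eax & HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE;
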
Confidential VMBus is a characteristic of the VMBus connection as a whole,
and of each VMBus channel that is created. When a Confidential VMBus
connection is established, the paravisor provides the guest the message-passing
path that is used for VMBus device creation and deletion, and it provides a
per-CPU synthetic interrupt controller (SynIC) just like the SynIC that is
offered by the Hyper-V host. Each VMBus device that is offered to the guest
indicates the degree to which it participates in Confidential VMBus. The offer
indicates if the device uses encrypted ring buffers, and if the device uses
encrypted memory for DMA that is done outside the ring buffer. These settings
may be different for different devices using the same Confidential VMBus
connection.

Although these settings are separate, in practice a device either uses only an
encrypted ring buffer, or an encrypted ring buffer together with encrypted
external data. If a channel is offered by the paravisor with Confidential
VMBus, the ring buffer can always be encrypted since it is strictly for
communication between the VTL2 paravisor and the VTL0 guest. However, other
memory regions are often used for, e.g., DMA, so they need to be accessible by
the underlying hardware and must be unencrypted (unless the device supports
encrypted memory). Currently there are no VSPs in OpenHCL that support
encrypted external memory, but future versions are expected to enable this
capability.

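The practical consequence for a guest driver can be sketched as follows. The
per-channel flags and the helper are purely illustrative (they are not the
actual VMBus offer fields); only set_memory_decrypted() is a real kernel
interface::

	/* Hypothetical per-channel properties; real offer fields may differ. */
	struct chan_coco_props {
		bool encrypted_ring;		/* ring buffer stays encrypted */
		bool encrypted_external;	/* external/DMA pages stay encrypted */
	};

	static int chan_setup_memory(const struct chan_coco_props *p,
				     void *ring_va, int ring_pages,
				     void *ext_va, int ext_pages)
	{
		int ret = 0;

		/* Paravisor-backed channels can keep the ring buffer encrypted. */
		if (!p->encrypted_ring)
			ret = set_memory_decrypted((unsigned long)ring_va,
						   ring_pages);
		if (ret)
			return ret;

		/*
		 * External buffers must be decrypted unless the device
		 * supports encrypted memory.
		 */
		if (!p->encrypted_external)
			ret = set_memory_decrypted((unsigned long)ext_va,
						   ext_pages);
		return ret;
	}
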
Because some devices on a Confidential VMBus may require decrypted ring buffers
and DMA transfers, the guest must interact with two SynICs -- the one provided
by the paravisor and the one provided by the Hyper-V host when Confidential
VMBus is not offered. Interrupts are always signaled by the paravisor SynIC,
but the guest must check for messages and for channel interrupts on both SynICs.

In the case of a Confidential VMBus, regular SynIC access by the guest is
intercepted by the paravisor (this includes various MSRs such as the SIMP and
SIEFP, as well as hypercalls like HvPostMessage and HvSignalEvent). If the
guest actually wants to communicate with the hypervisor, it has to use special
mechanisms (GHCB page on SNP, or tdcall on TDX). Messages can be of either
kind: with Confidential VMBus, messages use the paravisor SynIC, and if the
guest chooses to communicate directly with the hypervisor, they use the
hypervisor SynIC. For interrupt signaling, some channels may be running on the
host (non-confidential, using the VMBus relay) and use the hypervisor SynIC,
and some on the paravisor and use its SynIC. The RelIDs are coordinated by the
OpenHCL VMBus server and are guaranteed to be unique regardless of whether
the channel originated on the host or the paravisor.

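The split is visible in the x86 MSR plumbing touched later in this pull: a
plain wrmsr on a SynIC MSR is intercepted and handled by the paravisor, while
hv_ivm_msr_write() takes the GHCB/tdcall path so the hypervisor sees the
write. Condensed from the hv_set_non_nested_msr() hunk::

	if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) {
		/* GHCB page on SNP / tdcall on TDX: the hypervisor gets this. */
		hv_ivm_msr_write(reg, value);

		/* Plain wrmsr: intercepted and handled by the paravisor. */
		if (hv_is_sint_msr(reg)) {
			union hv_synic_sint sint = { .as_uint64 = value };

			sint.proxy = hv_para_sint_proxy;
			native_wrmsrq(reg, sint.as_uint64);
		}
	} else {
		native_wrmsrq(reg, value);
	}
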
load_unaligned_zeropad()
------------------------
When transitioning memory between encrypted and decrypted, the caller of
@ -11705,6 +11705,7 @@ M: "K. Y. Srinivasan" <kys@microsoft.com>
|
|||
M: Haiyang Zhang <haiyangz@microsoft.com>
|
||||
M: Wei Liu <wei.liu@kernel.org>
|
||||
M: Dexuan Cui <decui@microsoft.com>
|
||||
M: Long Li <longli@microsoft.com>
|
||||
L: linux-hyperv@vger.kernel.org
|
||||
S: Supported
|
||||
T: git git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git
|
||||
|
|
@ -11722,6 +11723,7 @@ F: arch/x86/kernel/cpu/mshyperv.c
|
|||
F: drivers/clocksource/hyperv_timer.c
|
||||
F: drivers/hid/hid-hyperv.c
|
||||
F: drivers/hv/
|
||||
F: drivers/infiniband/hw/mana/
|
||||
F: drivers/input/serio/hyperv-keyboard.c
|
||||
F: drivers/iommu/hyperv-iommu.c
|
||||
F: drivers/net/ethernet/microsoft/
|
||||
|
|
@ -11740,6 +11742,7 @@ F: include/hyperv/hvhdk_mini.h
|
|||
F: include/linux/hyperv.h
|
||||
F: include/net/mana
|
||||
F: include/uapi/linux/hyperv.h
|
||||
F: include/uapi/rdma/mana-abi.h
|
||||
F: net/vmw_vsock/hyperv_transport.c
|
||||
F: tools/hv/
|
||||
|
||||
|
|
|
|||
|
|
@ -1,8 +1,22 @@
|
|||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o
|
||||
obj-$(CONFIG_X86_64) += hv_apic.o
|
||||
obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o
|
||||
obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o mshv_vtl_asm.o
|
||||
|
||||
$(obj)/mshv_vtl_asm.o: $(obj)/mshv-asm-offsets.h
|
||||
|
||||
$(obj)/mshv-asm-offsets.h: $(obj)/mshv-asm-offsets.s FORCE
|
||||
$(call filechk,offsets,__MSHV_ASM_OFFSETS_H__)
|
||||
|
||||
ifdef CONFIG_X86_64
|
||||
obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o
|
||||
|
||||
ifdef CONFIG_MSHV_ROOT
|
||||
CFLAGS_REMOVE_hv_trampoline.o += -pg
|
||||
CFLAGS_hv_trampoline.o += -fno-stack-protector
|
||||
obj-$(CONFIG_CRASH_DUMP) += hv_crash.o hv_trampoline.o
|
||||
endif
|
||||
endif
|
||||
|
||||
targets += mshv-asm-offsets.s
|
||||
clean-files += mshv-asm-offsets.h
|
||||
|
|
|
|||
|
|
@ -53,6 +53,11 @@ static void hv_apic_icr_write(u32 low, u32 id)
|
|||
wrmsrq(HV_X64_MSR_ICR, reg_val);
|
||||
}
|
||||
|
||||
void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set)
|
||||
{
|
||||
apic_update_vector(cpu, vector, set);
|
||||
}
|
||||
|
||||
static u32 hv_apic_read(u32 reg)
|
||||
{
|
||||
u32 reg_val, hi;
|
||||
|
|
@ -293,6 +298,9 @@ static void hv_send_ipi_self(int vector)
|
|||
|
||||
void __init hv_apic_init(void)
|
||||
{
|
||||
if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
|
||||
return;
|
||||
|
||||
if (ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) {
|
||||
pr_info("Hyper-V: Using IPI hypercalls\n");
|
||||
/*
|
||||
|
|
|
|||
|
|
@ -0,0 +1,642 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* X86 specific Hyper-V root partition kdump/crash support module
|
||||
*
|
||||
* Copyright (C) 2025, Microsoft, Inc.
|
||||
*
|
||||
* This module implements hypervisor RAM collection into vmcore for both
|
||||
* cases of the hypervisor crash and Linux root crash. Hyper-V implements
|
||||
* a disable hypercall with a 32bit protected mode ABI callback. This
|
||||
* mechanism must be used to unlock hypervisor RAM. Since the hypervisor RAM
|
||||
* is already mapped in Linux, it is automatically collected into Linux vmcore,
|
||||
* and can be examined by the crash command (raw RAM dump) or windbg.
|
||||
*
|
||||
* At a high level:
|
||||
*
|
||||
* Hypervisor Crash:
|
||||
* Upon crash, hypervisor goes into an emergency minimal dispatch loop, a
|
||||
* restrictive mode with very limited hypercall and MSR support. Each cpu
|
||||
* then injects NMIs into root vcpus. A shared page is used to check
|
||||
* by Linux in the NMI handler if the hypervisor has crashed. This shared
|
||||
* page is setup in hv_root_crash_init during boot.
|
||||
*
|
||||
* Linux Crash:
|
||||
* In case of Linux crash, the callback hv_crash_stop_other_cpus will send
|
||||
* NMIs to all cpus, then proceed to the crash_nmi_callback where it waits
|
||||
* for all cpus to be in NMI.
|
||||
*
|
||||
* NMI Handler (upon quorum):
|
||||
* Eventually, in both cases, all cpus will end up in the NMI handler.
|
||||
* Hyper-V requires the disable hypervisor must be done from the BSP. So
|
||||
* the BSP NMI handler saves current context, does some fixups and makes
|
||||
* the hypercall to disable the hypervisor, ie, devirtualize. Hypervisor
|
||||
* at that point will suspend all vcpus (except the BSP), unlock all its
|
||||
* RAM, and return to Linux at the 32bit mode entry RIP.
|
||||
*
|
||||
* Linux 32bit entry trampoline will then restore long mode and call C
|
||||
* function here to restore context and continue execution to crash kexec.
|
||||
*/
|
||||
|
||||
#include <linux/delay.h>
|
||||
#include <linux/kexec.h>
|
||||
#include <linux/crash_dump.h>
|
||||
#include <linux/panic.h>
|
||||
#include <asm/apic.h>
|
||||
#include <asm/desc.h>
|
||||
#include <asm/page.h>
|
||||
#include <asm/pgalloc.h>
|
||||
#include <asm/mshyperv.h>
|
||||
#include <asm/nmi.h>
|
||||
#include <asm/idtentry.h>
|
||||
#include <asm/reboot.h>
|
||||
#include <asm/intel_pt.h>
|
||||
|
||||
bool hv_crash_enabled;
|
||||
EXPORT_SYMBOL_GPL(hv_crash_enabled);
|
||||
|
||||
struct hv_crash_ctxt {
|
||||
ulong rsp;
|
||||
ulong cr0;
|
||||
ulong cr2;
|
||||
ulong cr4;
|
||||
ulong cr8;
|
||||
|
||||
u16 cs;
|
||||
u16 ss;
|
||||
u16 ds;
|
||||
u16 es;
|
||||
u16 fs;
|
||||
u16 gs;
|
||||
|
||||
u16 gdt_fill;
|
||||
struct desc_ptr gdtr;
|
||||
char idt_fill[6];
|
||||
struct desc_ptr idtr;
|
||||
|
||||
u64 gsbase;
|
||||
u64 efer;
|
||||
u64 pat;
|
||||
};
|
||||
static struct hv_crash_ctxt hv_crash_ctxt;
|
||||
|
||||
/* Shared hypervisor page that contains crash dump area we peek into.
|
||||
* NB: windbg looks for "hv_cda" symbol so don't change it.
|
||||
*/
|
||||
static struct hv_crashdump_area *hv_cda;
|
||||
|
||||
static u32 trampoline_pa, devirt_arg;
|
||||
static atomic_t crash_cpus_wait;
|
||||
static void *hv_crash_ptpgs[4];
|
||||
static bool hv_has_crashed, lx_has_crashed;
|
||||
|
||||
static void __noreturn hv_panic_timeout_reboot(void)
|
||||
{
|
||||
#define PANIC_TIMER_STEP 100
|
||||
|
||||
if (panic_timeout > 0) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP)
|
||||
mdelay(PANIC_TIMER_STEP);
|
||||
}
|
||||
|
||||
if (panic_timeout)
|
||||
native_wrmsrq(HV_X64_MSR_RESET, 1); /* get hyp to reboot */
|
||||
|
||||
for (;;)
|
||||
cpu_relax();
|
||||
}
|
||||
|
||||
/* This cannot be inlined as it needs stack */
|
||||
static noinline __noclone void hv_crash_restore_tss(void)
|
||||
{
|
||||
load_TR_desc();
|
||||
}
|
||||
|
||||
/* This cannot be inlined as it needs stack */
|
||||
static noinline void hv_crash_clear_kernpt(void)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
p4d_t *p4d;
|
||||
|
||||
/* Clear entry so it's not confusing to someone looking at the core */
|
||||
pgd = pgd_offset_k(trampoline_pa);
|
||||
p4d = p4d_offset(pgd, trampoline_pa);
|
||||
native_p4d_clear(p4d);
|
||||
}
|
||||
|
||||
/*
|
||||
* This is the C entry point from the asm glue code after the disable hypercall.
|
||||
* We enter here in IA32-e long mode, ie, full 64bit mode running on kernel
|
||||
* page tables with our below 4G page identity mapped, but using a temporary
|
||||
* GDT. ds/fs/gs/es are null. ss is not usable. bp is null. stack is not
|
||||
* available. We restore kernel GDT, and rest of the context, and continue
|
||||
* to kexec.
|
||||
*/
|
||||
static asmlinkage void __noreturn hv_crash_c_entry(void)
|
||||
{
|
||||
struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
|
||||
|
||||
/* first thing, restore kernel gdt */
|
||||
native_load_gdt(&ctxt->gdtr);
|
||||
|
||||
asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
|
||||
asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
|
||||
|
||||
asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
|
||||
asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
|
||||
asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
|
||||
asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
|
||||
|
||||
native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
|
||||
asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
|
||||
|
||||
asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
|
||||
asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
|
||||
asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr4));
|
||||
|
||||
native_load_idt(&ctxt->idtr);
|
||||
native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
|
||||
native_wrmsrq(MSR_EFER, ctxt->efer);
|
||||
|
||||
/* restore the original kernel CS now via far return */
|
||||
asm volatile("movzwq %0, %%rax\n\t"
|
||||
"pushq %%rax\n\t"
|
||||
"pushq $1f\n\t"
|
||||
"lretq\n\t"
|
||||
"1:nop\n\t" : : "m"(ctxt->cs) : "rax");
|
||||
|
||||
/* We are in asmlinkage without stack frame, hence make C function
|
||||
* calls which will buy stack frames.
|
||||
*/
|
||||
hv_crash_restore_tss();
|
||||
hv_crash_clear_kernpt();
|
||||
|
||||
/* we are now fully in devirtualized normal kernel mode */
|
||||
__crash_kexec(NULL);
|
||||
|
||||
hv_panic_timeout_reboot();
|
||||
}
|
||||
/* Tell gcc we are using lretq long jump in the above function intentionally */
|
||||
STACK_FRAME_NON_STANDARD(hv_crash_c_entry);
|
||||
|
||||
static void hv_mark_tss_not_busy(void)
|
||||
{
|
||||
struct desc_struct *desc = get_current_gdt_rw();
|
||||
tss_desc tss;
|
||||
|
||||
memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
|
||||
tss.type = 0x9; /* available 64-bit TSS. 0xB is busy TSS */
|
||||
write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS);
|
||||
}
|
||||
|
||||
/* Save essential context */
|
||||
static void hv_hvcrash_ctxt_save(void)
|
||||
{
|
||||
struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
|
||||
|
||||
asm volatile("movq %%rsp,%0" : "=m"(ctxt->rsp));
|
||||
|
||||
ctxt->cr0 = native_read_cr0();
|
||||
ctxt->cr4 = native_read_cr4();
|
||||
|
||||
asm volatile("movq %%cr2, %0" : "=a"(ctxt->cr2));
|
||||
asm volatile("movq %%cr8, %0" : "=a"(ctxt->cr8));
|
||||
|
||||
asm volatile("movl %%cs, %%eax" : "=a"(ctxt->cs));
|
||||
asm volatile("movl %%ss, %%eax" : "=a"(ctxt->ss));
|
||||
asm volatile("movl %%ds, %%eax" : "=a"(ctxt->ds));
|
||||
asm volatile("movl %%es, %%eax" : "=a"(ctxt->es));
|
||||
asm volatile("movl %%fs, %%eax" : "=a"(ctxt->fs));
|
||||
asm volatile("movl %%gs, %%eax" : "=a"(ctxt->gs));
|
||||
|
||||
native_store_gdt(&ctxt->gdtr);
|
||||
store_idt(&ctxt->idtr);
|
||||
|
||||
ctxt->gsbase = __rdmsr(MSR_GS_BASE);
|
||||
ctxt->efer = __rdmsr(MSR_EFER);
|
||||
ctxt->pat = __rdmsr(MSR_IA32_CR_PAT);
|
||||
}
|
||||
|
||||
/* Add trampoline page to the kernel pagetable for transition to kernel PT */
|
||||
static void hv_crash_fixup_kernpt(void)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
p4d_t *p4d;
|
||||
|
||||
pgd = pgd_offset_k(trampoline_pa);
|
||||
p4d = p4d_offset(pgd, trampoline_pa);
|
||||
|
||||
/* trampoline_pa is below 4G, so no pre-existing entry to clobber */
|
||||
p4d_populate(&init_mm, p4d, (pud_t *)hv_crash_ptpgs[1]);
|
||||
p4d->p4d = p4d->p4d & ~(_PAGE_NX); /* enable execute */
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify the hyp that Linux has crashed. This will cause the hyp to quiesce
|
||||
* and suspend all guest VPs.
|
||||
*/
|
||||
static void hv_notify_prepare_hyp(void)
|
||||
{
|
||||
u64 status;
|
||||
struct hv_input_notify_partition_event *input;
|
||||
struct hv_partition_event_root_crashdump_input *cda;
|
||||
|
||||
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
|
||||
cda = &input->input.crashdump_input;
|
||||
memset(input, 0, sizeof(*input));
|
||||
input->event = HV_PARTITION_EVENT_ROOT_CRASHDUMP;
|
||||
|
||||
cda->crashdump_action = HV_CRASHDUMP_ENTRY;
|
||||
status = hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL);
|
||||
if (!hv_result_success(status))
|
||||
return;
|
||||
|
||||
cda->crashdump_action = HV_CRASHDUMP_SUSPEND_ALL_VPS;
|
||||
hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Common function for all cpus before devirtualization.
|
||||
*
|
||||
* Hypervisor crash: all cpus get here in NMI context.
|
||||
* Linux crash: the panicing cpu gets here at base level, all others in NMI
|
||||
* context. Note, panicing cpu may not be the BSP.
|
||||
*
|
||||
* The function is not inlined so it will show on the stack. It is named so
|
||||
* because the crash cmd looks for certain well known function names on the
|
||||
* stack before looking into the cpu saved note in the elf section, and
|
||||
* that work is currently incomplete.
|
||||
*
|
||||
* Notes:
|
||||
* Hypervisor crash:
|
||||
* - the hypervisor is in a very restrictive mode at this point and any
|
||||
* vmexit it cannot handle would result in reboot. So, no mumbo jumbo,
|
||||
* just get to kexec as quickly as possible.
|
||||
*
|
||||
* Devirtualization is supported from the BSP only at present.
|
||||
*/
|
||||
static noinline __noclone void crash_nmi_callback(struct pt_regs *regs)
|
||||
{
|
||||
struct hv_input_disable_hyp_ex *input;
|
||||
u64 status;
|
||||
int msecs = 1000, ccpu = smp_processor_id();
|
||||
|
||||
if (ccpu == 0) {
|
||||
/* crash_save_cpu() will be done in the kexec path */
|
||||
cpu_emergency_stop_pt(); /* disable performance trace */
|
||||
atomic_inc(&crash_cpus_wait);
|
||||
} else {
|
||||
crash_save_cpu(regs, ccpu);
|
||||
cpu_emergency_stop_pt(); /* disable performance trace */
|
||||
atomic_inc(&crash_cpus_wait);
|
||||
for (;;)
|
||||
cpu_relax();
|
||||
}
|
||||
|
||||
while (atomic_read(&crash_cpus_wait) < num_online_cpus() && msecs--)
|
||||
mdelay(1);
|
||||
|
||||
stop_nmi();
|
||||
if (!hv_has_crashed)
|
||||
hv_notify_prepare_hyp();
|
||||
|
||||
if (crashing_cpu == -1)
|
||||
crashing_cpu = ccpu; /* crash cmd uses this */
|
||||
|
||||
hv_hvcrash_ctxt_save();
|
||||
hv_mark_tss_not_busy();
|
||||
hv_crash_fixup_kernpt();
|
||||
|
||||
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
|
||||
memset(input, 0, sizeof(*input));
|
||||
input->rip = trampoline_pa;
|
||||
input->arg = devirt_arg;
|
||||
|
||||
status = hv_do_hypercall(HVCALL_DISABLE_HYP_EX, input, NULL);
|
||||
|
||||
hv_panic_timeout_reboot();
|
||||
}
|
||||
|
||||
|
||||
static DEFINE_SPINLOCK(hv_crash_reboot_lk);
|
||||
|
||||
/*
|
||||
* Generic NMI callback handler: could be called without any crash also.
|
||||
* hv crash: hypervisor injects NMI's into all cpus
|
||||
* lx crash: panicing cpu sends NMI to all but self via crash_stop_other_cpus
|
||||
*/
|
||||
static int hv_crash_nmi_local(unsigned int cmd, struct pt_regs *regs)
|
||||
{
|
||||
if (!hv_has_crashed && hv_cda && hv_cda->cda_valid)
|
||||
hv_has_crashed = true;
|
||||
|
||||
if (!hv_has_crashed && !lx_has_crashed)
|
||||
return NMI_DONE; /* ignore the NMI */
|
||||
|
||||
if (hv_has_crashed && !kexec_crash_loaded()) {
|
||||
if (spin_trylock(&hv_crash_reboot_lk))
|
||||
hv_panic_timeout_reboot();
|
||||
else
|
||||
for (;;)
|
||||
cpu_relax();
|
||||
}
|
||||
|
||||
crash_nmi_callback(regs);
|
||||
|
||||
return NMI_DONE;
|
||||
}
|
||||
|
||||
/*
|
||||
* hv_crash_stop_other_cpus() == smp_ops.crash_stop_other_cpus
|
||||
*
|
||||
* On normal Linux panic, this is called twice: first from panic and then again
|
||||
* from native_machine_crash_shutdown.
|
||||
*
|
||||
* In case of hyperv, 3 ways to get here:
|
||||
* 1. hv crash (only BSP will get here):
|
||||
* BSP : NMI callback -> DisableHv -> hv_crash_asm32 -> hv_crash_c_entry
|
||||
* -> __crash_kexec -> native_machine_crash_shutdown
|
||||
* -> crash_smp_send_stop -> smp_ops.crash_stop_other_cpus
|
||||
* Linux panic:
|
||||
* 2. panic cpu x: panic() -> crash_smp_send_stop
|
||||
* -> smp_ops.crash_stop_other_cpus
|
||||
* 3. BSP: native_machine_crash_shutdown -> crash_smp_send_stop
|
||||
*
|
||||
* NB: noclone and non standard stack because of call to crash_setup_regs().
|
||||
*/
|
||||
static void __noclone hv_crash_stop_other_cpus(void)
|
||||
{
|
||||
static bool crash_stop_done;
|
||||
struct pt_regs lregs;
|
||||
int ccpu = smp_processor_id();
|
||||
|
||||
if (hv_has_crashed)
|
||||
return; /* all cpus already in NMI handler path */
|
||||
|
||||
if (!kexec_crash_loaded()) {
|
||||
hv_notify_prepare_hyp();
|
||||
hv_panic_timeout_reboot(); /* no return */
|
||||
}
|
||||
|
||||
/* If the hv crashes also, we could come here again before cpus_stopped
|
||||
* is set in crash_smp_send_stop(). So use our own check.
|
||||
*/
|
||||
if (crash_stop_done)
|
||||
return;
|
||||
crash_stop_done = true;
|
||||
|
||||
/* Linux has crashed: hv is healthy, we can IPI safely */
|
||||
lx_has_crashed = true;
|
||||
wmb(); /* NMI handlers look at lx_has_crashed */
|
||||
|
||||
apic->send_IPI_allbutself(NMI_VECTOR);
|
||||
|
||||
if (crashing_cpu == -1)
|
||||
crashing_cpu = ccpu; /* crash cmd uses this */
|
||||
|
||||
/* crash_setup_regs() happens in kexec also, but for the kexec cpu which
|
||||
* is the BSP. We could be here on non-BSP cpu, collect regs if so.
|
||||
*/
|
||||
if (ccpu)
|
||||
crash_setup_regs(&lregs, NULL);
|
||||
|
||||
crash_nmi_callback(&lregs);
|
||||
}
|
||||
STACK_FRAME_NON_STANDARD(hv_crash_stop_other_cpus);
|
||||
|
||||
/* This GDT is accessed in IA32-e compat mode which uses 32bits addresses */
|
||||
struct hv_gdtreg_32 {
|
||||
u16 fill;
|
||||
u16 limit;
|
||||
u32 address;
|
||||
} __packed;
|
||||
|
||||
/* We need a CS with L bit to goto IA32-e long mode from 32bit compat mode */
|
||||
struct hv_crash_tramp_gdt {
|
||||
u64 null; /* index 0, selector 0, null selector */
|
||||
u64 cs64; /* index 1, selector 8, cs64 selector */
|
||||
} __packed;
|
||||
|
||||
/* No stack, so jump via far ptr in memory to load the 64bit CS */
|
||||
struct hv_cs_jmptgt {
|
||||
u32 address;
|
||||
u16 csval;
|
||||
u16 fill;
|
||||
} __packed;
|
||||
|
||||
/* Linux use only, hypervisor doesn't look at this struct */
|
||||
struct hv_crash_tramp_data {
|
||||
u64 tramp32_cr3;
|
||||
u64 kernel_cr3;
|
||||
struct hv_gdtreg_32 gdtr32;
|
||||
struct hv_crash_tramp_gdt tramp_gdt;
|
||||
struct hv_cs_jmptgt cs_jmptgt;
|
||||
u64 c_entry_addr;
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* Setup a temporary gdt to allow the asm code to switch to the long mode.
|
||||
* Since the asm code is relocated/copied to a below 4G page, it cannot use rip
|
||||
* relative addressing, hence we must use trampoline_pa here. Also, save other
|
||||
* info like jmp and C entry targets for same reasons.
|
||||
*
|
||||
* Returns: 0 on success, -1 on error
|
||||
*/
|
||||
static int hv_crash_setup_trampdata(u64 trampoline_va)
|
||||
{
|
||||
int size, offs;
|
||||
void *dest;
|
||||
struct hv_crash_tramp_data *tramp;
|
||||
|
||||
/* These must match exactly the ones in the corresponding asm file */
|
||||
BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, tramp32_cr3) != 0);
|
||||
BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, kernel_cr3) != 8);
|
||||
BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, gdtr32.limit) != 18);
|
||||
BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data,
|
||||
cs_jmptgt.address) != 40);
|
||||
BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, c_entry_addr) != 48);
|
||||
|
||||
/* hv_crash_asm_end is beyond last byte by 1 */
|
||||
size = &hv_crash_asm_end - &hv_crash_asm32;
|
||||
if (size + sizeof(struct hv_crash_tramp_data) > PAGE_SIZE) {
|
||||
pr_err("%s: trampoline page overflow\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
dest = (void *)trampoline_va;
|
||||
memcpy(dest, &hv_crash_asm32, size);
|
||||
|
||||
dest += size;
|
||||
dest = (void *)round_up((ulong)dest, 16);
|
||||
tramp = (struct hv_crash_tramp_data *)dest;
|
||||
|
||||
/* see MAX_ASID_AVAILABLE in tlb.c: "PCID 0 is reserved for use by
|
||||
* non-PCID-aware users". Build cr3 with pcid 0
|
||||
*/
|
||||
tramp->tramp32_cr3 = __sme_pa(hv_crash_ptpgs[0]);
|
||||
|
||||
/* Note, when restoring X86_CR4_PCIDE, cr3[11:0] must be zero */
|
||||
tramp->kernel_cr3 = __sme_pa(init_mm.pgd);
|
||||
|
||||
tramp->gdtr32.limit = sizeof(struct hv_crash_tramp_gdt);
|
||||
tramp->gdtr32.address = trampoline_pa +
|
||||
(ulong)&tramp->tramp_gdt - trampoline_va;
|
||||
|
||||
/* base:0 limit:0xfffff type:b dpl:0 P:1 L:1 D:0 avl:0 G:1 */
|
||||
tramp->tramp_gdt.cs64 = 0x00af9a000000ffff;
|
||||
|
||||
tramp->cs_jmptgt.csval = 0x8;
|
||||
offs = (ulong)&hv_crash_asm64 - (ulong)&hv_crash_asm32;
|
||||
tramp->cs_jmptgt.address = trampoline_pa + offs;
|
||||
|
||||
tramp->c_entry_addr = (u64)&hv_crash_c_entry;
|
||||
|
||||
devirt_arg = trampoline_pa + (ulong)dest - trampoline_va;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Build 32bit trampoline page table for transition from protected mode
|
||||
* non-paging to long-mode paging. This transition needs pagetables below 4G.
|
||||
*/
|
||||
static void hv_crash_build_tramp_pt(void)
|
||||
{
|
||||
p4d_t *p4d;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte;
|
||||
u64 pa, addr = trampoline_pa;
|
||||
|
||||
p4d = hv_crash_ptpgs[0] + pgd_index(addr) * sizeof(p4d);
|
||||
pa = virt_to_phys(hv_crash_ptpgs[1]);
|
||||
set_p4d(p4d, __p4d(_PAGE_TABLE | pa));
|
||||
p4d->p4d &= ~(_PAGE_NX); /* enable execute */
|
||||
|
||||
pud = hv_crash_ptpgs[1] + pud_index(addr) * sizeof(pud);
|
||||
pa = virt_to_phys(hv_crash_ptpgs[2]);
|
||||
set_pud(pud, __pud(_PAGE_TABLE | pa));
|
||||
|
||||
pmd = hv_crash_ptpgs[2] + pmd_index(addr) * sizeof(pmd);
|
||||
pa = virt_to_phys(hv_crash_ptpgs[3]);
|
||||
set_pmd(pmd, __pmd(_PAGE_TABLE | pa));
|
||||
|
||||
pte = hv_crash_ptpgs[3] + pte_index(addr) * sizeof(pte);
|
||||
set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
|
||||
}
|
||||
|
||||
/*
|
||||
* Setup trampoline for devirtualization:
|
||||
* - a page below 4G, ie 32bit addr containing asm glue code that hyp jmps to
|
||||
* in protected mode.
|
||||
* - 4 pages for a temporary page table that asm code uses to turn paging on
|
||||
* - a temporary gdt to use in the compat mode.
|
||||
*
|
||||
* Returns: 0 on success
|
||||
*/
|
||||
static int hv_crash_trampoline_setup(void)
|
||||
{
|
||||
int i, rc, order;
|
||||
struct page *page;
|
||||
u64 trampoline_va;
|
||||
gfp_t flags32 = GFP_KERNEL | GFP_DMA32 | __GFP_ZERO;
|
||||
|
||||
/* page for 32bit trampoline assembly code + hv_crash_tramp_data */
|
||||
page = alloc_page(flags32);
|
||||
if (page == NULL) {
|
||||
pr_err("%s: failed to alloc asm stub page\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
trampoline_va = (u64)page_to_virt(page);
|
||||
trampoline_pa = (u32)page_to_phys(page);
|
||||
|
||||
order = 2; /* alloc 2^2 pages */
|
||||
page = alloc_pages(flags32, order);
|
||||
if (page == NULL) {
|
||||
pr_err("%s: failed to alloc pt pages\n", __func__);
|
||||
free_page(trampoline_va);
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (i = 0; i < 4; i++, page++)
|
||||
hv_crash_ptpgs[i] = page_to_virt(page);
|
||||
|
||||
hv_crash_build_tramp_pt();
|
||||
|
||||
rc = hv_crash_setup_trampdata(trampoline_va);
|
||||
if (rc)
|
||||
goto errout;
|
||||
|
||||
return 0;
|
||||
|
||||
errout:
|
||||
free_page(trampoline_va);
|
||||
free_pages((ulong)hv_crash_ptpgs[0], order);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* Setup for kdump kexec to collect hypervisor RAM when running as root */
|
||||
void hv_root_crash_init(void)
|
||||
{
|
||||
int rc;
|
||||
struct hv_input_get_system_property *input;
|
||||
struct hv_output_get_system_property *output;
|
||||
unsigned long flags;
|
||||
u64 status;
|
||||
union hv_pfn_range cda_info;
|
||||
|
||||
if (pgtable_l5_enabled()) {
|
||||
pr_err("Hyper-V: crash dump not yet supported on 5level PTs\n");
|
||||
return;
|
||||
}
|
||||
|
||||
rc = register_nmi_handler(NMI_LOCAL, hv_crash_nmi_local, NMI_FLAG_FIRST,
|
||||
"hv_crash_nmi");
|
||||
if (rc) {
|
||||
pr_err("Hyper-V: failed to register crash nmi handler\n");
|
||||
return;
|
||||
}
|
||||
|
||||
local_irq_save(flags);
|
||||
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
|
||||
output = *this_cpu_ptr(hyperv_pcpu_output_arg);
|
||||
|
||||
memset(input, 0, sizeof(*input));
|
||||
input->property_id = HV_SYSTEM_PROPERTY_CRASHDUMPAREA;
|
||||
|
||||
status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
|
||||
cda_info.as_uint64 = output->hv_cda_info.as_uint64;
|
||||
local_irq_restore(flags);
|
||||
|
||||
if (!hv_result_success(status)) {
|
||||
pr_err("Hyper-V: %s: property:%d %s\n", __func__,
|
||||
input->property_id, hv_result_to_string(status));
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
if (cda_info.base_pfn == 0) {
|
||||
pr_err("Hyper-V: hypervisor crash dump area pfn is 0\n");
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
hv_cda = phys_to_virt(cda_info.base_pfn << HV_HYP_PAGE_SHIFT);
|
||||
|
||||
rc = hv_crash_trampoline_setup();
|
||||
if (rc)
|
||||
goto err_out;
|
||||
|
||||
smp_ops.crash_stop_other_cpus = hv_crash_stop_other_cpus;
|
||||
|
||||
crash_kexec_post_notifiers = true;
|
||||
hv_crash_enabled = true;
|
||||
pr_info("Hyper-V: both linux and hypervisor kdump support enabled\n");
|
||||
|
||||
return;
|
||||
|
||||
err_out:
|
||||
unregister_nmi_handler(NMI_LOCAL, "hv_crash_nmi");
|
||||
pr_err("Hyper-V: only linux root kdump support enabled\n");
|
||||
}
|
||||
|
|
@ -170,6 +170,10 @@ static int hv_cpu_init(unsigned int cpu)
|
|||
wrmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
|
||||
}
|
||||
|
||||
/* Allow Hyper-V stimer vector to be injected from Hypervisor. */
|
||||
if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
|
||||
apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, true);
|
||||
|
||||
return hyperv_init_ghcb();
|
||||
}
|
||||
|
||||
|
|
@ -277,6 +281,9 @@ static int hv_cpu_die(unsigned int cpu)
|
|||
*ghcb_va = NULL;
|
||||
}
|
||||
|
||||
if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
|
||||
apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, false);
|
||||
|
||||
hv_common_cpu_die(cpu);
|
||||
|
||||
if (hv_vp_assist_page && hv_vp_assist_page[cpu]) {
|
||||
|
|
@ -551,6 +558,8 @@ void __init hyperv_init(void)
|
|||
memunmap(src);
|
||||
|
||||
hv_remap_tsc_clocksource();
|
||||
hv_root_crash_init();
|
||||
hv_sleep_notifiers_register();
|
||||
} else {
|
||||
hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
|
||||
wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,101 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* X86 specific Hyper-V kdump/crash related code.
|
||||
*
|
||||
* Copyright (C) 2025, Microsoft, Inc.
|
||||
*
|
||||
*/
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/alternative.h>
|
||||
#include <asm/msr.h>
|
||||
#include <asm/processor-flags.h>
|
||||
#include <asm/nospec-branch.h>
|
||||
|
||||
/*
|
||||
* void noreturn hv_crash_asm32(arg1)
|
||||
* arg1 == edi == 32bit PA of struct hv_crash_tramp_data
|
||||
*
|
||||
* The hypervisor jumps here upon devirtualization in protected mode. This
|
||||
* code gets copied to a page in the low 4G ie, 32bit space so it can run
|
||||
* in the protected mode. Hence we cannot use any compile/link time offsets or
|
||||
* addresses. It restores long mode via temporary gdt and page tables and
|
||||
* eventually jumps to kernel code entry at HV_CRASHDATA_OFFS_C_entry.
|
||||
*
|
||||
* PreCondition (ie, Hypervisor call back ABI):
|
||||
* o CR0 is set to 0x0021: PE(prot mode) and NE are set, paging is disabled
|
||||
* o CR4 is set to 0x0
|
||||
* o IA32_EFER is set to 0x901 (SCE and NXE are set)
|
||||
* o EDI is set to the Arg passed to HVCALL_DISABLE_HYP_EX.
|
||||
* o CS, DS, ES, FS, GS are all initialized with a base of 0 and limit 0xFFFF
|
||||
* o IDTR, TR and GDTR are initialized with a base of 0 and limit of 0xFFFF
|
||||
* o LDTR is initialized as invalid (limit of 0)
|
||||
* o MSR PAT is power on default.
|
||||
* o Other state/registers are cleared. All TLBs flushed.
|
||||
*/
|
||||
|
||||
#define HV_CRASHDATA_OFFS_TRAMPCR3 0x0 /* 0 */
|
||||
#define HV_CRASHDATA_OFFS_KERNCR3 0x8 /* 8 */
|
||||
#define HV_CRASHDATA_OFFS_GDTRLIMIT 0x12 /* 18 */
|
||||
#define HV_CRASHDATA_OFFS_CS_JMPTGT 0x28 /* 40 */
|
||||
#define HV_CRASHDATA_OFFS_C_entry 0x30 /* 48 */
|
||||
|
||||
.text
|
||||
.code32
|
||||
|
||||
SYM_CODE_START(hv_crash_asm32)
|
||||
UNWIND_HINT_UNDEFINED
|
||||
ENDBR
|
||||
movl $X86_CR4_PAE, %ecx
|
||||
movl %ecx, %cr4
|
||||
|
||||
movl %edi, %ebx
|
||||
add $HV_CRASHDATA_OFFS_TRAMPCR3, %ebx
|
||||
movl %cs:(%ebx), %eax
|
||||
movl %eax, %cr3
|
||||
|
||||
/* Setup EFER for long mode now */
|
||||
movl $MSR_EFER, %ecx
|
||||
rdmsr
|
||||
btsl $_EFER_LME, %eax
|
||||
wrmsr
|
||||
|
||||
/* Turn paging on using the temp 32bit trampoline page table */
|
||||
movl %cr0, %eax
|
||||
orl $(X86_CR0_PG), %eax
|
||||
movl %eax, %cr0
|
||||
|
||||
/* since kernel cr3 could be above 4G, we need to be in the long mode
|
||||
* before we can load 64bits of the kernel cr3. We use a temp gdt for
|
||||
* that with CS.L=1 and CS.D=0 */
|
||||
mov %edi, %eax
|
||||
add $HV_CRASHDATA_OFFS_GDTRLIMIT, %eax
|
||||
lgdtl %cs:(%eax)
|
||||
|
||||
/* not done yet, restore CS now to switch to CS.L=1 */
|
||||
mov %edi, %eax
|
||||
add $HV_CRASHDATA_OFFS_CS_JMPTGT, %eax
|
||||
ljmp %cs:*(%eax)
|
||||
SYM_CODE_END(hv_crash_asm32)
|
||||
|
||||
/* we now run in full 64bit IA32-e long mode, CS.L=1 and CS.D=0 */
|
||||
.code64
|
||||
.balign 8
|
||||
SYM_CODE_START(hv_crash_asm64)
|
||||
UNWIND_HINT_UNDEFINED
|
||||
ENDBR
|
||||
/* restore kernel page tables so we can jump to kernel code */
|
||||
mov %edi, %eax
|
||||
add $HV_CRASHDATA_OFFS_KERNCR3, %eax
|
||||
movq %cs:(%eax), %rbx
|
||||
movq %rbx, %cr3
|
||||
|
||||
mov %edi, %eax
|
||||
add $HV_CRASHDATA_OFFS_C_entry, %eax
|
||||
movq %cs:(%eax), %rbx
|
||||
ANNOTATE_RETPOLINE_SAFE
|
||||
jmp *%rbx
|
||||
|
||||
int $3
|
||||
|
||||
SYM_INNER_LABEL(hv_crash_asm_end, SYM_L_GLOBAL)
|
||||
SYM_CODE_END(hv_crash_asm64)
|
||||
|
|
@ -9,12 +9,17 @@
|
|||
#include <asm/apic.h>
|
||||
#include <asm/boot.h>
|
||||
#include <asm/desc.h>
|
||||
#include <asm/fpu/api.h>
|
||||
#include <asm/fpu/types.h>
|
||||
#include <asm/i8259.h>
|
||||
#include <asm/mshyperv.h>
|
||||
#include <asm/msr.h>
|
||||
#include <asm/realmode.h>
|
||||
#include <asm/reboot.h>
|
||||
#include <asm/smap.h>
|
||||
#include <linux/export.h>
|
||||
#include <../kernel/smpboot.h>
|
||||
#include "../../kernel/fpu/legacy.h"
|
||||
|
||||
extern struct boot_params boot_params;
|
||||
static struct real_mode_header hv_vtl_real_mode_header;
|
||||
|
|
@ -249,3 +254,28 @@ int __init hv_vtl_early_init(void)
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
DEFINE_STATIC_CALL_NULL(__mshv_vtl_return_hypercall, void (*)(void));
|
||||
|
||||
void mshv_vtl_return_call_init(u64 vtl_return_offset)
|
||||
{
|
||||
static_call_update(__mshv_vtl_return_hypercall,
|
||||
(void *)((u8 *)hv_hypercall_pg + vtl_return_offset));
|
||||
}
|
||||
EXPORT_SYMBOL(mshv_vtl_return_call_init);
|
||||
|
||||
void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0)
|
||||
{
|
||||
struct hv_vp_assist_page *hvp;
|
||||
|
||||
hvp = hv_vp_assist_page[smp_processor_id()];
|
||||
hvp->vtl_ret_x64rax = vtl0->rax;
|
||||
hvp->vtl_ret_x64rcx = vtl0->rcx;
|
||||
|
||||
kernel_fpu_begin_mask(0);
|
||||
fxrstor(&vtl0->fx_state);
|
||||
__mshv_vtl_return_call(vtl0);
|
||||
fxsave(&vtl0->fx_state);
|
||||
kernel_fpu_end();
|
||||
}
|
||||
EXPORT_SYMBOL(mshv_vtl_return_call);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,37 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Generate definitions needed by assembly language modules.
|
||||
* This code generates raw asm output which is post-processed to extract
|
||||
* and format the required data.
|
||||
*
|
||||
* Copyright (c) 2025, Microsoft Corporation.
|
||||
*
|
||||
* Author:
|
||||
* Naman Jain <namjain@microsoft.com>
|
||||
*/
|
||||
#define COMPILE_OFFSETS
|
||||
|
||||
#include <linux/kbuild.h>
|
||||
#include <asm/mshyperv.h>
|
||||
|
||||
static void __used common(void)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_HYPERV_VTL_MODE)) {
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_rax, mshv_vtl_cpu_context, rax);
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_rcx, mshv_vtl_cpu_context, rcx);
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_rdx, mshv_vtl_cpu_context, rdx);
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_rbx, mshv_vtl_cpu_context, rbx);
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_rbp, mshv_vtl_cpu_context, rbp);
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_rsi, mshv_vtl_cpu_context, rsi);
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_rdi, mshv_vtl_cpu_context, rdi);
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_r8, mshv_vtl_cpu_context, r8);
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_r9, mshv_vtl_cpu_context, r9);
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_r10, mshv_vtl_cpu_context, r10);
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_r11, mshv_vtl_cpu_context, r11);
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_r12, mshv_vtl_cpu_context, r12);
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_r13, mshv_vtl_cpu_context, r13);
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_r14, mshv_vtl_cpu_context, r14);
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_r15, mshv_vtl_cpu_context, r15);
|
||||
OFFSET(MSHV_VTL_CPU_CONTEXT_cr2, mshv_vtl_cpu_context, cr2);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,116 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0
|
||||
*
|
||||
* Assembly level code for mshv_vtl VTL transition
|
||||
*
|
||||
* Copyright (c) 2025, Microsoft Corporation.
|
||||
*
|
||||
* Author:
|
||||
* Naman Jain <namjain@microsoft.com>
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <linux/static_call_types.h>
|
||||
#include <asm/asm.h>
|
||||
#include <asm/asm-offsets.h>
|
||||
#include <asm/frame.h>
|
||||
#include "mshv-asm-offsets.h"
|
||||
|
||||
.text
|
||||
.section .noinstr.text, "ax"
|
||||
/*
|
||||
* void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0)
|
||||
*
|
||||
* This function is used to context switch between different Virtual Trust Levels.
|
||||
* It is marked as 'noinstr' to prevent against instrumentation and debugging facilities.
|
||||
* NMIs aren't a problem because the NMI handler saves/restores CR2 specifically to guard
|
||||
* against #PFs in NMI context clobbering the guest state.
|
||||
*/
|
||||
SYM_FUNC_START(__mshv_vtl_return_call)
|
||||
/* Push callee save registers */
|
||||
pushq %rbp
|
||||
mov %rsp, %rbp
|
||||
pushq %r12
|
||||
pushq %r13
|
||||
pushq %r14
|
||||
pushq %r15
|
||||
pushq %rbx
|
||||
|
||||
/* register switch to VTL0 clobbers all registers except rax/rcx */
|
||||
mov %_ASM_ARG1, %rax
|
||||
|
||||
/* grab rbx/rbp/rsi/rdi/r8-r15 */
|
||||
mov MSHV_VTL_CPU_CONTEXT_rbx(%rax), %rbx
|
||||
mov MSHV_VTL_CPU_CONTEXT_rbp(%rax), %rbp
|
||||
mov MSHV_VTL_CPU_CONTEXT_rsi(%rax), %rsi
|
||||
mov MSHV_VTL_CPU_CONTEXT_rdi(%rax), %rdi
|
||||
mov MSHV_VTL_CPU_CONTEXT_r8(%rax), %r8
|
||||
mov MSHV_VTL_CPU_CONTEXT_r9(%rax), %r9
|
||||
mov MSHV_VTL_CPU_CONTEXT_r10(%rax), %r10
|
||||
mov MSHV_VTL_CPU_CONTEXT_r11(%rax), %r11
|
||||
mov MSHV_VTL_CPU_CONTEXT_r12(%rax), %r12
|
||||
mov MSHV_VTL_CPU_CONTEXT_r13(%rax), %r13
|
||||
mov MSHV_VTL_CPU_CONTEXT_r14(%rax), %r14
|
||||
mov MSHV_VTL_CPU_CONTEXT_r15(%rax), %r15
|
||||
|
||||
mov MSHV_VTL_CPU_CONTEXT_cr2(%rax), %rdx
|
||||
mov %rdx, %cr2
|
||||
mov MSHV_VTL_CPU_CONTEXT_rdx(%rax), %rdx
|
||||
|
||||
/* stash host registers on stack */
|
||||
pushq %rax
|
||||
pushq %rcx
|
||||
|
||||
xor %ecx, %ecx
|
||||
|
||||
/* make a hypercall to switch VTL */
|
||||
call STATIC_CALL_TRAMP_STR(__mshv_vtl_return_hypercall)
|
||||
|
||||
/* stash guest registers on stack, restore saved host copies */
|
||||
pushq %rax
|
||||
pushq %rcx
|
||||
mov 16(%rsp), %rcx
|
||||
mov 24(%rsp), %rax
|
||||
|
||||
mov %rdx, MSHV_VTL_CPU_CONTEXT_rdx(%rax)
|
||||
mov %cr2, %rdx
|
||||
mov %rdx, MSHV_VTL_CPU_CONTEXT_cr2(%rax)
|
||||
pop MSHV_VTL_CPU_CONTEXT_rcx(%rax)
|
||||
pop MSHV_VTL_CPU_CONTEXT_rax(%rax)
|
||||
add $16, %rsp
|
||||
|
||||
/* save rbx/rbp/rsi/rdi/r8-r15 */
|
||||
mov %rbx, MSHV_VTL_CPU_CONTEXT_rbx(%rax)
|
||||
mov %rbp, MSHV_VTL_CPU_CONTEXT_rbp(%rax)
|
||||
mov %rsi, MSHV_VTL_CPU_CONTEXT_rsi(%rax)
|
||||
mov %rdi, MSHV_VTL_CPU_CONTEXT_rdi(%rax)
|
||||
mov %r8, MSHV_VTL_CPU_CONTEXT_r8(%rax)
|
||||
mov %r9, MSHV_VTL_CPU_CONTEXT_r9(%rax)
|
||||
mov %r10, MSHV_VTL_CPU_CONTEXT_r10(%rax)
|
||||
mov %r11, MSHV_VTL_CPU_CONTEXT_r11(%rax)
|
||||
mov %r12, MSHV_VTL_CPU_CONTEXT_r12(%rax)
|
||||
mov %r13, MSHV_VTL_CPU_CONTEXT_r13(%rax)
|
||||
mov %r14, MSHV_VTL_CPU_CONTEXT_r14(%rax)
|
||||
mov %r15, MSHV_VTL_CPU_CONTEXT_r15(%rax)
|
||||
|
||||
/* pop callee-save registers r12-r15, rbx */
|
||||
pop %rbx
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
|
||||
pop %rbp
|
||||
RET
|
||||
SYM_FUNC_END(__mshv_vtl_return_call)
|
||||
/*
|
||||
* Make sure that static_call_key symbol: __SCK____mshv_vtl_return_hypercall is accessible here.
|
||||
* Below code is inspired from __ADDRESSABLE(sym) macro. Symbol name is kept simple, to avoid
|
||||
* naming it something like "__UNIQUE_ID_addressable___SCK____mshv_vtl_return_hypercall_662.0"
|
||||
* which would otherwise have been generated by the macro.
|
||||
*/
|
||||
.section .discard.addressable,"aw"
|
||||
.align 8
|
||||
.type mshv_vtl_return_sym, @object
|
||||
.size mshv_vtl_return_sym, 8
|
||||
mshv_vtl_return_sym:
|
||||
.quad __SCK____mshv_vtl_return_hypercall
|
||||
|
|
@ -11,6 +11,7 @@
|
|||
#include <asm/paravirt.h>
|
||||
#include <asm/msr.h>
|
||||
#include <hyperv/hvhdk.h>
|
||||
#include <asm/fpu/types.h>
|
||||
|
||||
/*
|
||||
* Hyper-V always provides a single IO-APIC at this MMIO address.
|
||||
|
|
@ -176,6 +177,8 @@ int hyperv_flush_guest_mapping_range(u64 as,
|
|||
int hyperv_fill_flush_guest_mapping_list(
|
||||
struct hv_guest_mapping_flush_list *flush,
|
||||
u64 start_gfn, u64 end_gfn);
|
||||
void hv_sleep_notifiers_register(void);
|
||||
void hv_machine_power_off(void);
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
void hv_apic_init(void);
|
||||
|
|
@ -237,6 +240,15 @@ static __always_inline u64 hv_raw_get_msr(unsigned int reg)
|
|||
}
|
||||
int hv_apicid_to_vp_index(u32 apic_id);
|
||||
|
||||
#if IS_ENABLED(CONFIG_MSHV_ROOT) && IS_ENABLED(CONFIG_CRASH_DUMP)
|
||||
void hv_root_crash_init(void);
|
||||
void hv_crash_asm32(void);
|
||||
void hv_crash_asm64(void);
|
||||
void hv_crash_asm_end(void);
|
||||
#else /* CONFIG_MSHV_ROOT && CONFIG_CRASH_DUMP */
|
||||
static inline void hv_root_crash_init(void) {}
|
||||
#endif /* CONFIG_MSHV_ROOT && CONFIG_CRASH_DUMP */
|
||||
|
||||
#else /* CONFIG_HYPERV */
|
||||
static inline void hyperv_init(void) {}
|
||||
static inline void hyperv_setup_mmu_ops(void) {}
|
||||
|
|
@ -260,13 +272,46 @@ static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; }
|
|||
static inline int hv_apicid_to_vp_index(u32 apic_id) { return -EINVAL; }
|
||||
#endif /* CONFIG_HYPERV */
|
||||
|
||||
struct mshv_vtl_cpu_context {
|
||||
union {
|
||||
struct {
|
||||
u64 rax;
|
||||
u64 rcx;
|
||||
u64 rdx;
|
||||
u64 rbx;
|
||||
u64 cr2;
|
||||
u64 rbp;
|
||||
u64 rsi;
|
||||
u64 rdi;
|
||||
u64 r8;
|
||||
u64 r9;
|
||||
u64 r10;
|
||||
u64 r11;
|
||||
u64 r12;
|
||||
u64 r13;
|
||||
u64 r14;
|
||||
u64 r15;
|
||||
};
|
||||
u64 gp_regs[16];
|
||||
};
|
||||
|
||||
struct fxregs_state fx_state;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_HYPERV_VTL_MODE
|
||||
void __init hv_vtl_init_platform(void);
|
||||
int __init hv_vtl_early_init(void);
|
||||
void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0);
|
||||
void mshv_vtl_return_call_init(u64 vtl_return_offset);
|
||||
void mshv_vtl_return_hypercall(void);
|
||||
void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0);
|
||||
#else
|
||||
static inline void __init hv_vtl_init_platform(void) {}
|
||||
static inline int __init hv_vtl_early_init(void) { return 0; }
|
||||
static inline void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {}
|
||||
static inline void mshv_vtl_return_call_init(u64 vtl_return_offset) {}
|
||||
static inline void mshv_vtl_return_hypercall(void) {}
|
||||
static inline void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {}
|
||||
#endif
|
||||
|
||||
#include <asm-generic/mshyperv.h>
|
||||
|
|
|
|||
|
|
@ -28,9 +28,9 @@
|
|||
#include <asm/apic.h>
|
||||
#include <asm/timer.h>
|
||||
#include <asm/reboot.h>
|
||||
#include <asm/msr.h>
|
||||
#include <asm/nmi.h>
|
||||
#include <clocksource/hyperv_timer.h>
|
||||
#include <asm/msr.h>
|
||||
#include <asm/numa.h>
|
||||
#include <asm/svm.h>
|
||||
|
||||
|
|
@ -39,6 +39,12 @@ bool hv_nested;
|
|||
struct ms_hyperv_info ms_hyperv;
|
||||
|
||||
#if IS_ENABLED(CONFIG_HYPERV)
|
||||
/*
|
||||
* When running with the paravisor, controls proxying the synthetic interrupts
|
||||
* from the host
|
||||
*/
|
||||
static bool hv_para_sint_proxy;
|
||||
|
||||
static inline unsigned int hv_get_nested_msr(unsigned int reg)
|
||||
{
|
||||
if (hv_is_sint_msr(reg))
|
||||
|
|
@ -75,17 +81,51 @@ EXPORT_SYMBOL_GPL(hv_get_non_nested_msr);
|
|||
void hv_set_non_nested_msr(unsigned int reg, u64 value)
|
||||
{
|
||||
if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) {
|
||||
/* The hypervisor will get the intercept. */
|
||||
hv_ivm_msr_write(reg, value);
|
||||
|
||||
/* Write proxy bit via wrmsl instruction */
|
||||
if (hv_is_sint_msr(reg))
|
||||
wrmsrq(reg, value | 1 << 20);
|
||||
/* Using wrmsrq so the following goes to the paravisor. */
|
||||
if (hv_is_sint_msr(reg)) {
|
||||
union hv_synic_sint sint = { .as_uint64 = value };
|
||||
|
||||
sint.proxy = hv_para_sint_proxy;
|
||||
native_wrmsrq(reg, sint.as_uint64);
|
||||
}
|
||||
} else {
|
||||
wrmsrq(reg, value);
|
||||
native_wrmsrq(reg, value);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(hv_set_non_nested_msr);
|
||||
|
||||
/*
|
||||
* Enable or disable proxying synthetic interrupts
|
||||
* to the paravisor.
|
||||
*/
|
||||
void hv_para_set_sint_proxy(bool enable)
|
||||
{
|
||||
hv_para_sint_proxy = enable;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the SynIC register value from the paravisor.
|
||||
*/
|
||||
u64 hv_para_get_synic_register(unsigned int reg)
|
||||
{
|
||||
if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
|
||||
return ~0ULL;
|
||||
return native_read_msr(reg);
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the SynIC register value with the paravisor.
|
||||
*/
|
||||
void hv_para_set_synic_register(unsigned int reg, u64 val)
|
||||
{
|
||||
if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
|
||||
return;
|
||||
native_write_msr(reg, val);
|
||||
}
|
||||
|
||||
u64 hv_get_msr(unsigned int reg)
|
||||
{
|
||||
if (hv_nested)
|
||||
|
|
@ -215,7 +255,7 @@ static void hv_machine_shutdown(void)
|
|||
#endif /* CONFIG_KEXEC_CORE */
|
||||
|
||||
#ifdef CONFIG_CRASH_DUMP
|
||||
static void hv_machine_crash_shutdown(struct pt_regs *regs)
|
||||
static void hv_guest_crash_shutdown(struct pt_regs *regs)
|
||||
{
|
||||
if (hv_crash_handler)
|
||||
hv_crash_handler(regs);
|
||||
|
|
@ -440,7 +480,7 @@ EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
|
|||
|
||||
static void __init ms_hyperv_init_platform(void)
|
||||
{
|
||||
int hv_max_functions_eax;
|
||||
int hv_max_functions_eax, eax;
|
||||
|
||||
#ifdef CONFIG_PARAVIRT
|
||||
pv_info.name = "Hyper-V";
|
||||
|
|
@ -470,11 +510,27 @@ static void __init ms_hyperv_init_platform(void)
|
|||
|
||||
hv_identify_partition_type();
|
||||
|
||||
if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
|
||||
ms_hyperv.hints |= HV_DEPRECATING_AEOI_RECOMMENDED;
|
||||
|
||||
if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) {
|
||||
hv_nested = true;
|
||||
pr_info("Hyper-V: running on a nested hypervisor\n");
|
||||
}
|
||||
|
||||
/*
|
||||
* There is no check against the max function for HYPERV_CPUID_VIRT_STACK_* CPUID
|
||||
* leaves as the hypervisor doesn't handle them. Even a nested root partition (L2
|
||||
* root) will not get them because the nested (L1) hypervisor filters them out.
|
||||
* These are handled through intercept processing by the Windows Hyper-V stack
|
||||
* or the paravisor.
|
||||
*/
|
||||
eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
|
||||
ms_hyperv.confidential_vmbus_available =
|
||||
eax & HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE;
|
||||
ms_hyperv.msi_ext_dest_id =
|
||||
eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
|
||||
|
||||
if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
|
||||
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
|
||||
x86_platform.calibrate_tsc = hv_get_tsc_khz;
|
||||
|
|
@ -565,11 +621,14 @@ static void __init ms_hyperv_init_platform(void)
|
|||
#endif
|
||||
|
||||
#if IS_ENABLED(CONFIG_HYPERV)
|
||||
if (hv_root_partition())
|
||||
machine_ops.power_off = hv_machine_power_off;
|
||||
#if defined(CONFIG_KEXEC_CORE)
|
||||
machine_ops.shutdown = hv_machine_shutdown;
|
||||
#endif
|
||||
#if defined(CONFIG_CRASH_DUMP)
|
||||
machine_ops.crash_shutdown = hv_machine_crash_shutdown;
|
||||
if (!hv_root_partition())
|
||||
machine_ops.crash_shutdown = hv_guest_crash_shutdown;
|
||||
#endif
|
||||
#endif
|
||||
/*
|
||||
|
|
@ -675,21 +734,10 @@ static bool __init ms_hyperv_x2apic_available(void)
|
|||
* pci-hyperv host bridge.
|
||||
*
|
||||
* Note: for a Hyper-V root partition, this will always return false.
|
||||
* The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by
|
||||
* default, they are implemented as intercepts by the Windows Hyper-V stack.
|
||||
* Even a nested root partition (L2 root) will not get them because the
|
||||
* nested (L1) hypervisor filters them out.
|
||||
*/
|
||||
static bool __init ms_hyperv_msi_ext_dest_id(void)
|
||||
{
|
||||
u32 eax;
|
||||
|
||||
eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE);
|
||||
if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE)
|
||||
return false;
|
||||
|
||||
eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
|
||||
return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
|
||||
return ms_hyperv.msi_ext_dest_id;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_AMD_MEM_ENCRYPT
|
||||
|
|
|
|||
|
|
@@ -17,7 +17,8 @@ config HYPERV

config HYPERV_VTL_MODE
bool "Enable Linux to boot in VTL context"
depends on (X86_64 || ARM64) && HYPERV
depends on (X86_64 && HAVE_STATIC_CALL) || ARM64
depends on HYPERV
depends on SMP
default n
help

@@ -75,6 +76,8 @@ config MSHV_ROOT
depends on PAGE_SIZE_4KB
select EVENTFD
select VIRT_XFER_TO_GUEST_WORK
select HMM_MIRROR
select MMU_NOTIFIER
default n
help
Select this option to enable support for booting and running as root

@@ -82,4 +85,28 @@ config MSHV_ROOT

If unsure, say N.

config MSHV_VTL
tristate "Microsoft Hyper-V VTL driver"
depends on X86_64 && HYPERV_VTL_MODE
depends on HYPERV_VMBUS
# Mapping VTL0 memory to a userspace process in VTL2 is supported in OpenHCL.
# VTL2 for OpenHCL makes use of Huge Pages to improve performance on VMs,
# especially with large memory requirements.
depends on TRANSPARENT_HUGEPAGE
# MTRRs are controlled by VTL0, and are not specific to individual VTLs.
# Therefore, do not attempt to access or modify MTRRs here.
depends on !MTRR
select CPUMASK_OFFSTACK
select VIRT_XFER_TO_GUEST_WORK
default n
help
Select this option to enable Hyper-V VTL driver support.
This driver provides interfaces for a Virtual Machine Manager (VMM) running in
VTL2 userspace to create VTLs and partitions, set up and manage VTL0 memory, and
make direct hypercalls. It also allows mapping VTL0's address space into a
usermode process in VTL2 and supports receiving new VMBus messages and channel
events in VTL2.

If unsure, say N.

endmenu
|
||||
|
|
|
|||
|
|
@@ -3,6 +3,7 @@ obj-$(CONFIG_HYPERV_VMBUS) += hv_vmbus.o
obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o
obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
obj-$(CONFIG_MSHV_ROOT) += mshv_root.o
obj-$(CONFIG_MSHV_VTL) += mshv_vtl.o

CFLAGS_hv_trace.o = -I$(src)
CFLAGS_hv_balloon.o = -I$(src)

@@ -13,8 +14,12 @@ hv_vmbus-y := vmbus_drv.o \
hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o
hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o
mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
mshv_root_hv_call.o mshv_portid_table.o
mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o
mshv_vtl-y := mshv_vtl_main.o

# Code that must be built-in
obj-$(CONFIG_HYPERV) += hv_common.o
obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_common.o
obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o
ifneq ($(CONFIG_MSHV_ROOT)$(CONFIG_MSHV_VTL),)
obj-y += mshv_common.o
endif
|
||||
|
|
|
|||
|
|
@ -410,6 +410,21 @@ static int create_gpadl_header(enum hv_gpadl_type type, void *kbuffer,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static void vmbus_free_channel_msginfo(struct vmbus_channel_msginfo *msginfo)
|
||||
{
|
||||
struct vmbus_channel_msginfo *submsginfo, *tmp;
|
||||
|
||||
if (!msginfo)
|
||||
return;
|
||||
|
||||
list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist,
|
||||
msglistentry) {
|
||||
kfree(submsginfo);
|
||||
}
|
||||
|
||||
kfree(msginfo);
|
||||
}
|
||||
|
||||
/*
|
||||
* __vmbus_establish_gpadl - Establish a GPADL for a buffer or ringbuffer
|
||||
*
|
||||
|
|
@ -429,7 +444,7 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel,
|
|||
struct vmbus_channel_gpadl_header *gpadlmsg;
|
||||
struct vmbus_channel_gpadl_body *gpadl_body;
|
||||
struct vmbus_channel_msginfo *msginfo = NULL;
|
||||
struct vmbus_channel_msginfo *submsginfo, *tmp;
|
||||
struct vmbus_channel_msginfo *submsginfo;
|
||||
struct list_head *curr;
|
||||
u32 next_gpadl_handle;
|
||||
unsigned long flags;
|
||||
|
|
@ -444,20 +459,24 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel,
|
|||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the "decrypted" flag to true for the set_memory_decrypted()
|
||||
* success case. In the failure case, the encryption state of the
|
||||
* memory is unknown. Leave "decrypted" as true to ensure the
|
||||
* memory will be leaked instead of going back on the free list.
|
||||
*/
|
||||
gpadl->decrypted = true;
|
||||
ret = set_memory_decrypted((unsigned long)kbuffer,
|
||||
PFN_UP(size));
|
||||
if (ret) {
|
||||
dev_warn(&channel->device_obj->device,
|
||||
"Failed to set host visibility for new GPADL %d.\n",
|
||||
ret);
|
||||
return ret;
|
||||
gpadl->decrypted = !((channel->co_external_memory && type == HV_GPADL_BUFFER) ||
|
||||
(channel->co_ring_buffer && type == HV_GPADL_RING));
|
||||
if (gpadl->decrypted) {
|
||||
/*
|
||||
* The "decrypted" flag being true assumes that set_memory_decrypted() succeeds.
|
||||
* But if it fails, the encryption state of the memory is unknown. In that case,
|
||||
* leave "decrypted" as true to ensure the memory is leaked instead of going back
|
||||
* on the free list.
|
||||
*/
|
||||
ret = set_memory_decrypted((unsigned long)kbuffer,
|
||||
PFN_UP(size));
|
||||
if (ret) {
|
||||
dev_warn(&channel->device_obj->device,
|
||||
"Failed to set host visibility for new GPADL %d.\n",
|
||||
ret);
|
||||
vmbus_free_channel_msginfo(msginfo);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
init_completion(&msginfo->waitevent);
|
||||
|
|
@ -532,12 +551,8 @@ cleanup:
|
|||
spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
|
||||
list_del(&msginfo->msglistentry);
|
||||
spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
|
||||
list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist,
|
||||
msglistentry) {
|
||||
kfree(submsginfo);
|
||||
}
|
||||
|
||||
kfree(msginfo);
|
||||
vmbus_free_channel_msginfo(msginfo);
|
||||
|
||||
if (ret) {
|
||||
/*
|
||||
|
|
@ -545,8 +560,10 @@ cleanup:
|
|||
* left as true so the memory is leaked instead of being
|
||||
* put back on the free list.
|
||||
*/
|
||||
if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size)))
|
||||
gpadl->decrypted = false;
|
||||
if (gpadl->decrypted) {
|
||||
if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size)))
|
||||
gpadl->decrypted = false;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
|
@ -573,7 +590,7 @@ EXPORT_SYMBOL_GPL(vmbus_establish_gpadl);
|
|||
* keeps track of the next available slot in the array. Initially, each
|
||||
* slot points to the next one (as in a Linked List). The last slot
|
||||
* does not point to anything, so its value is U64_MAX by default.
|
||||
* @size The size of the array
|
||||
* @size: The size of the array
|
||||
*/
|
||||
static u64 *request_arr_init(u32 size)
|
||||
{
|
||||
|
|
@ -677,12 +694,13 @@ static int __vmbus_open(struct vmbus_channel *newchannel,
|
|||
goto error_clean_ring;
|
||||
|
||||
err = hv_ringbuffer_init(&newchannel->outbound,
|
||||
page, send_pages, 0);
|
||||
page, send_pages, 0, newchannel->co_ring_buffer);
|
||||
if (err)
|
||||
goto error_free_gpadl;
|
||||
|
||||
err = hv_ringbuffer_init(&newchannel->inbound, &page[send_pages],
|
||||
recv_pages, newchannel->max_pkt_size);
|
||||
recv_pages, newchannel->max_pkt_size,
|
||||
newchannel->co_ring_buffer);
|
||||
if (err)
|
||||
goto error_free_gpadl;
|
||||
|
||||
|
|
@ -863,8 +881,11 @@ post_msg_err:
|
|||
|
||||
kfree(info);
|
||||
|
||||
ret = set_memory_encrypted((unsigned long)gpadl->buffer,
|
||||
PFN_UP(gpadl->size));
|
||||
if (gpadl->decrypted)
|
||||
ret = set_memory_encrypted((unsigned long)gpadl->buffer,
|
||||
PFN_UP(gpadl->size));
|
||||
else
|
||||
ret = 0;
|
||||
if (ret)
|
||||
pr_warn("Fail to set mem host visibility in GPADL teardown %d.\n", ret);
|
||||
|
||||
|
|
|
|||
|
|
@ -844,14 +844,14 @@ static void vmbus_wait_for_unload(void)
|
|||
= per_cpu_ptr(hv_context.cpu_context, cpu);
|
||||
|
||||
/*
|
||||
* In a CoCo VM the synic_message_page is not allocated
|
||||
* In a CoCo VM the hyp_synic_message_page is not allocated
|
||||
* in hv_synic_alloc(). Instead it is set/cleared in
|
||||
* hv_synic_enable_regs() and hv_synic_disable_regs()
|
||||
* hv_hyp_synic_enable_regs() and hv_hyp_synic_disable_regs()
|
||||
* such that it is set only when the CPU is online. If
|
||||
* not all present CPUs are online, the message page
|
||||
* might be NULL, so skip such CPUs.
|
||||
*/
|
||||
page_addr = hv_cpu->synic_message_page;
|
||||
page_addr = hv_cpu->hyp_synic_message_page;
|
||||
if (!page_addr)
|
||||
continue;
|
||||
|
||||
|
|
@ -892,7 +892,7 @@ completed:
|
|||
struct hv_per_cpu_context *hv_cpu
|
||||
= per_cpu_ptr(hv_context.cpu_context, cpu);
|
||||
|
||||
page_addr = hv_cpu->synic_message_page;
|
||||
page_addr = hv_cpu->hyp_synic_message_page;
|
||||
if (!page_addr)
|
||||
continue;
|
||||
|
||||
|
|
@ -1022,6 +1022,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
|
|||
struct vmbus_channel_offer_channel *offer;
|
||||
struct vmbus_channel *oldchannel, *newchannel;
|
||||
size_t offer_sz;
|
||||
bool co_ring_buffer, co_external_memory;
|
||||
|
||||
offer = (struct vmbus_channel_offer_channel *)hdr;
|
||||
|
||||
|
|
@ -1034,6 +1035,22 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
|
|||
return;
|
||||
}
|
||||
|
||||
co_ring_buffer = is_co_ring_buffer(offer);
|
||||
co_external_memory = is_co_external_memory(offer);
|
||||
if (!co_ring_buffer && co_external_memory) {
|
||||
pr_err("Invalid offer relid=%d: the ring buffer isn't encrypted\n",
|
||||
offer->child_relid);
|
||||
return;
|
||||
}
|
||||
if (co_ring_buffer || co_external_memory) {
|
||||
if (vmbus_proto_version < VERSION_WIN10_V6_0 || !vmbus_is_confidential()) {
|
||||
pr_err("Invalid offer relid=%d: no support for confidential VMBus\n",
|
||||
offer->child_relid);
|
||||
atomic_dec(&vmbus_connection.offer_in_progress);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
oldchannel = find_primary_channel_by_offer(offer);
|
||||
|
||||
if (oldchannel != NULL) {
|
||||
|
|
@ -1112,6 +1129,8 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
|
|||
pr_err("Unable to allocate channel object\n");
|
||||
return;
|
||||
}
|
||||
newchannel->co_ring_buffer = co_ring_buffer;
|
||||
newchannel->co_external_memory = co_external_memory;
|
||||
|
||||
vmbus_setup_channel_state(newchannel, offer);
|
||||
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version);
|
|||
* Linux guests and are not listed.
|
||||
*/
|
||||
static __u32 vmbus_versions[] = {
|
||||
VERSION_WIN10_V6_0,
|
||||
VERSION_WIN10_V5_3,
|
||||
VERSION_WIN10_V5_2,
|
||||
VERSION_WIN10_V5_1,
|
||||
|
|
@ -65,7 +66,7 @@ static __u32 vmbus_versions[] = {
|
|||
* Maximal VMBus protocol version guests can negotiate. Useful to cap the
|
||||
* VMBus version for testing and debugging purpose.
|
||||
*/
|
||||
static uint max_version = VERSION_WIN10_V5_3;
|
||||
static uint max_version = VERSION_WIN10_V6_0;
|
||||
|
||||
module_param(max_version, uint, S_IRUGO);
|
||||
MODULE_PARM_DESC(max_version,
|
||||
|
|
@ -105,6 +106,9 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version)
|
|||
vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID;
|
||||
}
|
||||
|
||||
if (vmbus_is_confidential() && version >= VERSION_WIN10_V6_0)
|
||||
msg->feature_flags = VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS;
|
||||
|
||||
/*
|
||||
* shared_gpa_boundary is zero in non-SNP VMs, so it's safe to always
|
||||
* bitwise OR it
|
||||
|
|
|
|||
drivers/hv/hv.c
|
|
@ -18,6 +18,7 @@
|
|||
#include <linux/clockchips.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/export.h>
|
||||
#include <clocksource/hyperv_timer.h>
|
||||
#include <asm/mshyperv.h>
|
||||
#include <linux/set_memory.h>
|
||||
|
|
@ -25,6 +26,7 @@
|
|||
|
||||
/* The one and only */
|
||||
struct hv_context hv_context;
|
||||
EXPORT_SYMBOL_FOR_MODULES(hv_context, "mshv_vtl");
|
||||
|
||||
/*
|
||||
* hv_init - Main initialization routine.
|
||||
|
|
@ -74,7 +76,11 @@ int hv_post_message(union hv_connection_id connection_id,
|
|||
aligned_msg->payload_size = payload_size;
|
||||
memcpy((void *)aligned_msg->payload, payload, payload_size);
|
||||
|
||||
if (ms_hyperv.paravisor_present) {
|
||||
if (ms_hyperv.paravisor_present && !vmbus_is_confidential()) {
|
||||
/*
|
||||
* If the VMBus isn't confidential, use the CoCo-specific
|
||||
* mechanism to communicate with the hypervisor.
|
||||
*/
|
||||
if (hv_isolation_type_tdx())
|
||||
status = hv_tdx_hypercall(HVCALL_POST_MESSAGE,
|
||||
virt_to_phys(aligned_msg), 0);
|
||||
|
|
@ -88,6 +94,11 @@ int hv_post_message(union hv_connection_id connection_id,
|
|||
u64 control = HVCALL_POST_MESSAGE;
|
||||
|
||||
control |= hv_nested ? HV_HYPERCALL_NESTED : 0;
|
||||
/*
|
||||
* If there is no paravisor, this will go to the hypervisor.
|
||||
* In the Confidential VMBus case, this will trap to
* the paravisor instead.
|
||||
*/
|
||||
status = hv_do_hypercall(control, aligned_msg, NULL);
|
||||
}
|
||||
|
||||
|
|
@ -95,11 +106,72 @@ int hv_post_message(union hv_connection_id connection_id,
|
|||
|
||||
return hv_result(status);
|
||||
}
|
||||
EXPORT_SYMBOL_FOR_MODULES(hv_post_message, "mshv_vtl");
|
||||
|
||||
static int hv_alloc_page(void **page, bool decrypt, const char *note)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
* After the page changes its encryption status, its contents might
|
||||
* appear scrambled on some hardware. Thus `get_zeroed_page` would
|
||||
* zero the page out in vain, so do that explicitly exactly once.
|
||||
*
|
||||
* By default, the page is allocated encrypted in a CoCo VM.
|
||||
*/
|
||||
*page = (void *)__get_free_page(GFP_KERNEL);
|
||||
if (!*page)
|
||||
return -ENOMEM;
|
||||
|
||||
if (decrypt)
|
||||
ret = set_memory_decrypted((unsigned long)*page, 1);
|
||||
if (ret)
|
||||
goto failed;
|
||||
|
||||
memset(*page, 0, PAGE_SIZE);
|
||||
return 0;
|
||||
|
||||
failed:
|
||||
/*
|
||||
* Report the failure but don't put the page back on the free list as
|
||||
* its encryption status is unknown.
|
||||
*/
|
||||
pr_err("allocation failed for %s page, error %d, decrypted %d\n",
|
||||
note, ret, decrypt);
|
||||
*page = NULL;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int hv_free_page(void **page, bool encrypt, const char *note)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
if (!*page)
|
||||
return 0;
|
||||
|
||||
if (encrypt)
|
||||
ret = set_memory_encrypted((unsigned long)*page, 1);
|
||||
|
||||
/*
|
||||
* If re-encryption fails, the page is leaked. Something is wrong;
* prefer to lose a page with unknown encryption status and stay afloat.
|
||||
*/
|
||||
if (ret)
|
||||
pr_err("deallocation failed for %s page, error %d, encrypt %d\n",
|
||||
note, ret, encrypt);
|
||||
else
|
||||
free_page((unsigned long)*page);
|
||||
|
||||
*page = NULL;
|
||||
|
||||
return ret;
|
||||
}
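For orientation, a minimal sketch of how these two helpers are intended to be paired; the wrapper function and the "example" note string are illustrative only and not part of this patch:

/* Illustrative only: allocate a backing page, decrypting it only when
 * VMBus is not confidential, then re-encrypt and free it on teardown. */
static int example_shared_page_roundtrip(void **page)
{
	const bool decrypt = !vmbus_is_confidential();
	int ret;

	ret = hv_alloc_page(page, decrypt, "example");
	if (ret)
		return ret;

	/* ... exchange data with the host or the paravisor via *page ... */

	return hv_free_page(page, decrypt, "example");
}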
|
||||
|
||||
int hv_synic_alloc(void)
|
||||
{
|
||||
int cpu, ret = -ENOMEM;
|
||||
struct hv_per_cpu_context *hv_cpu;
|
||||
const bool decrypt = !vmbus_is_confidential();
|
||||
|
||||
/*
|
||||
* First, zero all per-cpu memory areas so hv_synic_free() can
|
||||
|
|
@ -125,73 +197,37 @@ int hv_synic_alloc(void)
|
|||
vmbus_on_msg_dpc, (unsigned long)hv_cpu);
|
||||
|
||||
if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
|
||||
hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC);
|
||||
if (!hv_cpu->post_msg_page) {
|
||||
pr_err("Unable to allocate post msg page\n");
|
||||
ret = hv_alloc_page(&hv_cpu->post_msg_page,
|
||||
decrypt, "post msg");
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = set_memory_decrypted((unsigned long)hv_cpu->post_msg_page, 1);
|
||||
if (ret) {
|
||||
pr_err("Failed to decrypt post msg page: %d\n", ret);
|
||||
/* Just leak the page, as it's unsafe to free the page. */
|
||||
hv_cpu->post_msg_page = NULL;
|
||||
goto err;
|
||||
}
|
||||
|
||||
memset(hv_cpu->post_msg_page, 0, PAGE_SIZE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Synic message and event pages are allocated by paravisor.
|
||||
* Skip these pages allocation here.
|
||||
* If these SynIC pages are not allocated, SIEF and SIM pages
|
||||
* are configured using what the root partition or the paravisor
|
||||
* provides upon reading the SIEFP and SIMP registers.
|
||||
*/
|
||||
if (!ms_hyperv.paravisor_present && !hv_root_partition()) {
|
||||
hv_cpu->synic_message_page =
|
||||
(void *)get_zeroed_page(GFP_ATOMIC);
|
||||
if (!hv_cpu->synic_message_page) {
|
||||
pr_err("Unable to allocate SYNIC message page\n");
|
||||
ret = hv_alloc_page(&hv_cpu->hyp_synic_message_page,
|
||||
decrypt, "hypervisor SynIC msg");
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
hv_cpu->synic_event_page =
|
||||
(void *)get_zeroed_page(GFP_ATOMIC);
|
||||
if (!hv_cpu->synic_event_page) {
|
||||
pr_err("Unable to allocate SYNIC event page\n");
|
||||
|
||||
free_page((unsigned long)hv_cpu->synic_message_page);
|
||||
hv_cpu->synic_message_page = NULL;
|
||||
ret = hv_alloc_page(&hv_cpu->hyp_synic_event_page,
|
||||
decrypt, "hypervisor SynIC event");
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
|
||||
if (!ms_hyperv.paravisor_present &&
|
||||
(hv_isolation_type_snp() || hv_isolation_type_tdx())) {
|
||||
ret = set_memory_decrypted((unsigned long)
|
||||
hv_cpu->synic_message_page, 1);
|
||||
if (ret) {
|
||||
pr_err("Failed to decrypt SYNIC msg page: %d\n", ret);
|
||||
hv_cpu->synic_message_page = NULL;
|
||||
|
||||
/*
|
||||
* Free the event page here so that hv_synic_free()
|
||||
* won't later try to re-encrypt it.
|
||||
*/
|
||||
free_page((unsigned long)hv_cpu->synic_event_page);
|
||||
hv_cpu->synic_event_page = NULL;
|
||||
if (vmbus_is_confidential()) {
|
||||
ret = hv_alloc_page(&hv_cpu->para_synic_message_page,
|
||||
false, "paravisor SynIC msg");
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = set_memory_decrypted((unsigned long)
|
||||
hv_cpu->synic_event_page, 1);
|
||||
if (ret) {
|
||||
pr_err("Failed to decrypt SYNIC event page: %d\n", ret);
|
||||
hv_cpu->synic_event_page = NULL;
|
||||
ret = hv_alloc_page(&hv_cpu->para_synic_event_page,
|
||||
false, "paravisor SynIC event");
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
|
||||
memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -207,70 +243,46 @@ err:
|
|||
|
||||
void hv_synic_free(void)
|
||||
{
|
||||
int cpu, ret;
|
||||
int cpu;
|
||||
const bool encrypt = !vmbus_is_confidential();
|
||||
|
||||
for_each_present_cpu(cpu) {
|
||||
struct hv_per_cpu_context *hv_cpu =
|
||||
per_cpu_ptr(hv_context.cpu_context, cpu);
|
||||
|
||||
/* It's better to leak the page if the encryption fails. */
|
||||
if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
|
||||
if (hv_cpu->post_msg_page) {
|
||||
ret = set_memory_encrypted((unsigned long)
|
||||
hv_cpu->post_msg_page, 1);
|
||||
if (ret) {
|
||||
pr_err("Failed to encrypt post msg page: %d\n", ret);
|
||||
hv_cpu->post_msg_page = NULL;
|
||||
}
|
||||
}
|
||||
if (ms_hyperv.paravisor_present && hv_isolation_type_tdx())
|
||||
hv_free_page(&hv_cpu->post_msg_page,
|
||||
encrypt, "post msg");
|
||||
if (!ms_hyperv.paravisor_present && !hv_root_partition()) {
|
||||
hv_free_page(&hv_cpu->hyp_synic_event_page,
|
||||
encrypt, "hypervisor SynIC event");
|
||||
hv_free_page(&hv_cpu->hyp_synic_message_page,
|
||||
encrypt, "hypervisor SynIC msg");
|
||||
}
|
||||
|
||||
if (!ms_hyperv.paravisor_present &&
|
||||
(hv_isolation_type_snp() || hv_isolation_type_tdx())) {
|
||||
if (hv_cpu->synic_message_page) {
|
||||
ret = set_memory_encrypted((unsigned long)
|
||||
hv_cpu->synic_message_page, 1);
|
||||
if (ret) {
|
||||
pr_err("Failed to encrypt SYNIC msg page: %d\n", ret);
|
||||
hv_cpu->synic_message_page = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (hv_cpu->synic_event_page) {
|
||||
ret = set_memory_encrypted((unsigned long)
|
||||
hv_cpu->synic_event_page, 1);
|
||||
if (ret) {
|
||||
pr_err("Failed to encrypt SYNIC event page: %d\n", ret);
|
||||
hv_cpu->synic_event_page = NULL;
|
||||
}
|
||||
}
|
||||
if (vmbus_is_confidential()) {
|
||||
hv_free_page(&hv_cpu->para_synic_event_page,
|
||||
false, "paravisor SynIC event");
|
||||
hv_free_page(&hv_cpu->para_synic_message_page,
|
||||
false, "paravisor SynIC msg");
|
||||
}
|
||||
|
||||
free_page((unsigned long)hv_cpu->post_msg_page);
|
||||
free_page((unsigned long)hv_cpu->synic_event_page);
|
||||
free_page((unsigned long)hv_cpu->synic_message_page);
|
||||
}
|
||||
|
||||
kfree(hv_context.hv_numa_map);
|
||||
}
|
||||
|
||||
/*
|
||||
* hv_synic_init - Initialize the Synthetic Interrupt Controller.
|
||||
*
|
||||
* If it is already initialized by another entity (ie x2v shim), we need to
|
||||
* retrieve the initialized message and event pages. Otherwise, we create and
|
||||
* initialize the message and event pages.
|
||||
* hv_hyp_synic_enable_regs - Initialize the Synthetic Interrupt Controller
|
||||
* with the hypervisor.
|
||||
*/
|
||||
void hv_synic_enable_regs(unsigned int cpu)
|
||||
void hv_hyp_synic_enable_regs(unsigned int cpu)
|
||||
{
|
||||
struct hv_per_cpu_context *hv_cpu =
|
||||
per_cpu_ptr(hv_context.cpu_context, cpu);
|
||||
union hv_synic_simp simp;
|
||||
union hv_synic_siefp siefp;
|
||||
union hv_synic_sint shared_sint;
|
||||
union hv_synic_scontrol sctrl;
|
||||
|
||||
/* Setup the Synic's message page */
|
||||
/* Setup the Synic's message page with the hypervisor. */
|
||||
simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
|
||||
simp.simp_enabled = 1;
|
||||
|
||||
|
|
@ -278,18 +290,18 @@ void hv_synic_enable_regs(unsigned int cpu)
|
|||
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
|
||||
u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) &
|
||||
~ms_hyperv.shared_gpa_boundary;
|
||||
hv_cpu->synic_message_page =
|
||||
hv_cpu->hyp_synic_message_page =
|
||||
(void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
|
||||
if (!hv_cpu->synic_message_page)
|
||||
if (!hv_cpu->hyp_synic_message_page)
|
||||
pr_err("Fail to map synic message page.\n");
|
||||
} else {
|
||||
simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
|
||||
simp.base_simp_gpa = virt_to_phys(hv_cpu->hyp_synic_message_page)
|
||||
>> HV_HYP_PAGE_SHIFT;
|
||||
}
|
||||
|
||||
hv_set_msr(HV_MSR_SIMP, simp.as_uint64);
|
||||
|
||||
/* Setup the Synic's event page */
|
||||
/* Setup the Synic's event page with the hypervisor. */
|
||||
siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
|
||||
siefp.siefp_enabled = 1;
|
||||
|
||||
|
|
@ -297,16 +309,17 @@ void hv_synic_enable_regs(unsigned int cpu)
|
|||
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
|
||||
u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) &
|
||||
~ms_hyperv.shared_gpa_boundary;
|
||||
hv_cpu->synic_event_page =
|
||||
hv_cpu->hyp_synic_event_page =
|
||||
(void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
|
||||
if (!hv_cpu->synic_event_page)
|
||||
if (!hv_cpu->hyp_synic_event_page)
|
||||
pr_err("Fail to map synic event page.\n");
|
||||
} else {
|
||||
siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
|
||||
siefp.base_siefp_gpa = virt_to_phys(hv_cpu->hyp_synic_event_page)
|
||||
>> HV_HYP_PAGE_SHIFT;
|
||||
}
|
||||
|
||||
hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);
|
||||
hv_enable_coco_interrupt(cpu, vmbus_interrupt, true);
|
||||
|
||||
/* Setup the shared SINT. */
|
||||
if (vmbus_irq != -1)
|
||||
|
|
@ -317,6 +330,11 @@ void hv_synic_enable_regs(unsigned int cpu)
|
|||
shared_sint.masked = false;
|
||||
shared_sint.auto_eoi = hv_recommend_using_aeoi();
|
||||
hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
|
||||
}
|
||||
|
||||
static void hv_hyp_synic_enable_interrupts(void)
|
||||
{
|
||||
union hv_synic_scontrol sctrl;
|
||||
|
||||
/* Enable the global synic bit */
|
||||
sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL);
|
||||
|
|
@ -325,23 +343,72 @@ void hv_synic_enable_regs(unsigned int cpu)
|
|||
hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
|
||||
}
|
||||
|
||||
static void hv_para_synic_enable_regs(unsigned int cpu)
|
||||
{
|
||||
union hv_synic_simp simp;
|
||||
union hv_synic_siefp siefp;
|
||||
struct hv_per_cpu_context *hv_cpu
|
||||
= per_cpu_ptr(hv_context.cpu_context, cpu);
|
||||
|
||||
/* Setup the Synic's message page with the paravisor. */
|
||||
simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP);
|
||||
simp.simp_enabled = 1;
|
||||
simp.base_simp_gpa = virt_to_phys(hv_cpu->para_synic_message_page)
|
||||
>> HV_HYP_PAGE_SHIFT;
|
||||
hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64);
|
||||
|
||||
/* Setup the Synic's event page with the paravisor. */
|
||||
siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP);
|
||||
siefp.siefp_enabled = 1;
|
||||
siefp.base_siefp_gpa = virt_to_phys(hv_cpu->para_synic_event_page)
|
||||
>> HV_HYP_PAGE_SHIFT;
|
||||
hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64);
|
||||
}
|
||||
|
||||
static void hv_para_synic_enable_interrupts(void)
|
||||
{
|
||||
union hv_synic_scontrol sctrl;
|
||||
|
||||
/* Enable the global synic bit */
|
||||
sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL);
|
||||
sctrl.enable = 1;
|
||||
hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64);
|
||||
}
|
||||
|
||||
int hv_synic_init(unsigned int cpu)
|
||||
{
|
||||
hv_synic_enable_regs(cpu);
|
||||
if (vmbus_is_confidential())
|
||||
hv_para_synic_enable_regs(cpu);
|
||||
|
||||
/*
|
||||
* The SINT is set in hv_hyp_synic_enable_regs() by calling
|
||||
* hv_set_msr(). hv_set_msr() in turn has special case code for the
|
||||
* SINT MSRs that write to the hypervisor version of the MSR *and*
|
||||
* the paravisor version of the MSR (but *without* the proxy bit when
|
||||
* VMBus is confidential).
|
||||
*
|
||||
* Then enable interrupts via the paravisor if VMBus is confidential,
|
||||
* and otherwise via the hypervisor.
|
||||
*/
|
||||
|
||||
hv_hyp_synic_enable_regs(cpu);
|
||||
if (vmbus_is_confidential())
|
||||
hv_para_synic_enable_interrupts();
|
||||
else
|
||||
hv_hyp_synic_enable_interrupts();
|
||||
|
||||
hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void hv_synic_disable_regs(unsigned int cpu)
|
||||
void hv_hyp_synic_disable_regs(unsigned int cpu)
|
||||
{
|
||||
struct hv_per_cpu_context *hv_cpu =
|
||||
per_cpu_ptr(hv_context.cpu_context, cpu);
|
||||
union hv_synic_sint shared_sint;
|
||||
union hv_synic_simp simp;
|
||||
union hv_synic_siefp siefp;
|
||||
union hv_synic_scontrol sctrl;
|
||||
|
||||
shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT);
|
||||
|
||||
|
|
@ -350,18 +417,21 @@ void hv_synic_disable_regs(unsigned int cpu)
|
|||
/* Need to correctly cleanup in the case of SMP!!! */
|
||||
/* Disable the interrupt */
|
||||
hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
|
||||
hv_enable_coco_interrupt(cpu, vmbus_interrupt, false);
|
||||
|
||||
simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
|
||||
/*
|
||||
* In Isolation VM, sim and sief pages are allocated by
|
||||
* In Isolation VM, simp and sief pages are allocated by
|
||||
* paravisor. These pages also will be used by kdump
|
||||
* kernel. So just reset enable bit here and keep page
|
||||
* addresses.
|
||||
*/
|
||||
simp.simp_enabled = 0;
|
||||
if (ms_hyperv.paravisor_present || hv_root_partition()) {
|
||||
iounmap(hv_cpu->synic_message_page);
|
||||
hv_cpu->synic_message_page = NULL;
|
||||
if (hv_cpu->hyp_synic_message_page) {
|
||||
iounmap(hv_cpu->hyp_synic_message_page);
|
||||
hv_cpu->hyp_synic_message_page = NULL;
|
||||
}
|
||||
} else {
|
||||
simp.base_simp_gpa = 0;
|
||||
}
|
||||
|
|
@ -372,21 +442,51 @@ void hv_synic_disable_regs(unsigned int cpu)
|
|||
siefp.siefp_enabled = 0;
|
||||
|
||||
if (ms_hyperv.paravisor_present || hv_root_partition()) {
|
||||
iounmap(hv_cpu->synic_event_page);
|
||||
hv_cpu->synic_event_page = NULL;
|
||||
if (hv_cpu->hyp_synic_event_page) {
|
||||
iounmap(hv_cpu->hyp_synic_event_page);
|
||||
hv_cpu->hyp_synic_event_page = NULL;
|
||||
}
|
||||
} else {
|
||||
siefp.base_siefp_gpa = 0;
|
||||
}
|
||||
|
||||
hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);
|
||||
}
|
||||
|
||||
static void hv_hyp_synic_disable_interrupts(void)
|
||||
{
|
||||
union hv_synic_scontrol sctrl;
|
||||
|
||||
/* Disable the global synic bit */
|
||||
sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL);
|
||||
sctrl.enable = 0;
|
||||
hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
|
||||
}
|
||||
|
||||
if (vmbus_irq != -1)
|
||||
disable_percpu_irq(vmbus_irq);
|
||||
static void hv_para_synic_disable_regs(unsigned int cpu)
|
||||
{
|
||||
union hv_synic_simp simp;
|
||||
union hv_synic_siefp siefp;
|
||||
|
||||
/* Disable SynIC's message page in the paravisor. */
|
||||
simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP);
|
||||
simp.simp_enabled = 0;
|
||||
hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64);
|
||||
|
||||
/* Disable SynIC's event page in the paravisor. */
|
||||
siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP);
|
||||
siefp.siefp_enabled = 0;
|
||||
hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64);
|
||||
}
|
||||
|
||||
static void hv_para_synic_disable_interrupts(void)
|
||||
{
|
||||
union hv_synic_scontrol sctrl;
|
||||
|
||||
/* Disable the global synic bit */
|
||||
sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL);
|
||||
sctrl.enable = 0;
|
||||
hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64);
|
||||
}
|
||||
|
||||
#define HV_MAX_TRIES 3
|
||||
|
|
@ -399,16 +499,18 @@ void hv_synic_disable_regs(unsigned int cpu)
|
|||
* that the normal interrupt handling mechanism will find and process the channel interrupt
|
||||
* "very soon", and in the process clear the bit.
|
||||
*/
|
||||
static bool hv_synic_event_pending(void)
|
||||
static bool __hv_synic_event_pending(union hv_synic_event_flags *event, int sint)
|
||||
{
|
||||
struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
|
||||
union hv_synic_event_flags *event =
|
||||
(union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT;
|
||||
unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */
|
||||
unsigned long *recv_int_page;
|
||||
bool pending;
|
||||
u32 relid;
|
||||
int tries = 0;
|
||||
|
||||
if (!event)
|
||||
return false;
|
||||
|
||||
event += sint;
|
||||
recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */
|
||||
retry:
|
||||
pending = false;
|
||||
for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) {
|
||||
|
|
@ -425,6 +527,17 @@ retry:
|
|||
return pending;
|
||||
}
|
||||
|
||||
static bool hv_synic_event_pending(void)
|
||||
{
|
||||
struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
|
||||
union hv_synic_event_flags *hyp_synic_event_page = hv_cpu->hyp_synic_event_page;
|
||||
union hv_synic_event_flags *para_synic_event_page = hv_cpu->para_synic_event_page;
|
||||
|
||||
return
|
||||
__hv_synic_event_pending(hyp_synic_event_page, VMBUS_MESSAGE_SINT) ||
|
||||
__hv_synic_event_pending(para_synic_event_page, VMBUS_MESSAGE_SINT);
|
||||
}
|
||||
|
||||
static int hv_pick_new_cpu(struct vmbus_channel *channel)
|
||||
{
|
||||
int ret = -EBUSY;
|
||||
|
|
@ -517,7 +630,27 @@ int hv_synic_cleanup(unsigned int cpu)
|
|||
always_cleanup:
|
||||
hv_stimer_legacy_cleanup(cpu);
|
||||
|
||||
hv_synic_disable_regs(cpu);
|
||||
/*
|
||||
* First, disable the event and message pages
|
||||
* used for communicating with the host, and then
|
||||
* disable the host interrupts if VMBus is not
|
||||
* confidential.
|
||||
*/
|
||||
hv_hyp_synic_disable_regs(cpu);
|
||||
if (!vmbus_is_confidential())
|
||||
hv_hyp_synic_disable_interrupts();
|
||||
|
||||
/*
|
||||
* Perform the same steps for the Confidential VMBus.
|
||||
* The sequencing provides the guarantee that no data
|
||||
* may be posted for processing before disabling interrupts.
|
||||
*/
|
||||
if (vmbus_is_confidential()) {
|
||||
hv_para_synic_disable_regs(cpu);
|
||||
hv_para_synic_disable_interrupts();
|
||||
}
|
||||
if (vmbus_irq != -1)
|
||||
disable_percpu_irq(vmbus_irq);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -315,9 +315,9 @@ int __init hv_common_init(void)
|
|||
int i;
|
||||
union hv_hypervisor_version_info version;
|
||||
|
||||
/* Get information about the Hyper-V host version */
|
||||
/* Get information about the Microsoft Hypervisor version */
|
||||
if (!hv_get_hypervisor_version(&version))
|
||||
pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n",
|
||||
pr_info("Hyper-V: Hypervisor Build %d.%d.%d.%d-%d-%d\n",
|
||||
version.major_version, version.minor_version,
|
||||
version.build_number, version.service_number,
|
||||
version.service_pack, version.service_branch);
|
||||
|
|
@ -487,7 +487,7 @@ int hv_common_cpu_init(unsigned int cpu)
|
|||
* online and then taken offline
|
||||
*/
|
||||
if (!*inputarg) {
|
||||
mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
|
||||
mem = kmalloc_array(pgcount, HV_HYP_PAGE_SIZE, flags);
|
||||
if (!mem)
|
||||
return -ENOMEM;
|
||||
|
||||
|
|
@ -716,6 +716,27 @@ u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(hv_tdx_hypercall);
|
||||
|
||||
void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set)
|
||||
{
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt);
|
||||
|
||||
void __weak hv_para_set_sint_proxy(bool enable)
|
||||
{
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(hv_para_set_sint_proxy);
|
||||
|
||||
u64 __weak hv_para_get_synic_register(unsigned int reg)
|
||||
{
|
||||
return ~0ULL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(hv_para_get_synic_register);
|
||||
|
||||
void __weak hv_para_set_synic_register(unsigned int reg, u64 val)
|
||||
{
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(hv_para_set_synic_register);
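These __weak definitions are no-op fallbacks; code that can actually reach the paravisor supplies strong definitions that override them at link time, such as the x86 hv_para_set_synic_register() shown at the top of this diff. A hedged sketch of what the matching read-side override could look like (illustrative, not necessarily the exact implementation in this series):

/* Illustrative strong override mirroring the x86 setter above: the MSR
 * access traps to the paravisor rather than the hypervisor. */
u64 hv_para_get_synic_register(unsigned int reg)
{
	if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
		return ~0ULL;

	return native_read_msr(reg);
}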
|
||||
|
||||
void hv_identify_partition_type(void)
|
||||
{
|
||||
/* Assume guest role */
|
||||
|
|
|
|||
|
|
@ -586,7 +586,7 @@ static int util_probe(struct hv_device *dev,
|
|||
(struct hv_util_service *)dev_id->driver_data;
|
||||
int ret;
|
||||
|
||||
srv->recv_buffer = kmalloc(HV_HYP_PAGE_SIZE * 4, GFP_KERNEL);
|
||||
srv->recv_buffer = kmalloc_array(4, HV_HYP_PAGE_SIZE, GFP_KERNEL);
|
||||
if (!srv->recv_buffer)
|
||||
return -ENOMEM;
|
||||
srv->channel = dev->channel;
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@
|
|||
#include <linux/list.h>
|
||||
#include <linux/bitops.h>
|
||||
#include <asm/sync_bitops.h>
|
||||
#include <asm/mshyperv.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/hyperv.h>
|
||||
#include <linux/interrupt.h>
|
||||
|
|
@ -32,6 +33,7 @@
|
|||
*/
|
||||
#define HV_UTIL_NEGO_TIMEOUT 55
|
||||
|
||||
void vmbus_isr(void);
|
||||
|
||||
/* Definitions for the monitored notification facility */
|
||||
union hv_monitor_trigger_group {
|
||||
|
|
@ -120,8 +122,26 @@ enum {
|
|||
* Per cpu state for channel handling
|
||||
*/
|
||||
struct hv_per_cpu_context {
|
||||
void *synic_message_page;
|
||||
void *synic_event_page;
|
||||
/*
|
||||
* SynIC pages for communicating with the host.
|
||||
*
|
||||
* These pages are accessible to the host partition and the hypervisor.
|
||||
* They may be used for exchanging data with the host partition and the
* hypervisor even though those aren't trusted; the guest partition
* must be prepared to handle malicious behavior.
|
||||
*/
|
||||
void *hyp_synic_message_page;
|
||||
void *hyp_synic_event_page;
|
||||
/*
|
||||
* SynIC pages for communicating with the paravisor.
|
||||
*
|
||||
* These pages may be accessed from within the guest partition only in
|
||||
* CoCo VMs. Neither the host partition nor the hypervisor can access
|
||||
* these pages in that case; they are used for exchanging data with the
|
||||
* paravisor.
|
||||
*/
|
||||
void *para_synic_message_page;
|
||||
void *para_synic_event_page;
|
||||
|
||||
/*
|
||||
* The page is only used in hv_post_message() for a TDX VM (with the
|
||||
|
|
@ -171,10 +191,10 @@ extern int hv_synic_alloc(void);
|
|||
|
||||
extern void hv_synic_free(void);
|
||||
|
||||
extern void hv_synic_enable_regs(unsigned int cpu);
|
||||
extern void hv_hyp_synic_enable_regs(unsigned int cpu);
|
||||
extern int hv_synic_init(unsigned int cpu);
|
||||
|
||||
extern void hv_synic_disable_regs(unsigned int cpu);
|
||||
extern void hv_hyp_synic_disable_regs(unsigned int cpu);
|
||||
extern int hv_synic_cleanup(unsigned int cpu);
|
||||
|
||||
/* Interface */
|
||||
|
|
@ -182,7 +202,8 @@ extern int hv_synic_cleanup(unsigned int cpu);
|
|||
void hv_ringbuffer_pre_init(struct vmbus_channel *channel);
|
||||
|
||||
int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
|
||||
struct page *pages, u32 pagecnt, u32 max_pkt_size);
|
||||
struct page *pages, u32 pagecnt, u32 max_pkt_size,
|
||||
bool confidential);
|
||||
|
||||
void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info);
|
||||
|
||||
|
|
@ -333,6 +354,51 @@ extern const struct vmbus_channel_message_table_entry
|
|||
|
||||
/* General vmbus interface */
|
||||
|
||||
bool vmbus_is_confidential(void);
|
||||
|
||||
#if IS_ENABLED(CONFIG_HYPERV_VMBUS)
|
||||
/* Free the message slot and signal end-of-message if required */
|
||||
static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
|
||||
{
|
||||
/*
|
||||
* On crash we're reading some other CPU's message page and we need
|
||||
* to be careful: this other CPU may have already cleared the header
* and the host may have already delivered some other message there.
* If we blindly write msg->header.message_type we're going
|
||||
* to lose it. We can still lose a message of the same type but
|
||||
* we count on the fact that there can only be one
|
||||
* CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages
|
||||
* on crash.
|
||||
*/
|
||||
if (cmpxchg(&msg->header.message_type, old_msg_type,
|
||||
HVMSG_NONE) != old_msg_type)
|
||||
return;
|
||||
|
||||
/*
|
||||
* The cmpxchg() above does an implicit memory barrier to
* ensure the write to MessageType (i.e. set to
|
||||
* HVMSG_NONE) happens before we read the
|
||||
* MessagePending and EOMing. Otherwise, the EOMing
|
||||
* will not deliver any more messages since there is
|
||||
* no empty slot
|
||||
*/
|
||||
if (msg->header.message_flags.msg_pending) {
|
||||
/*
|
||||
* This will cause message queue rescan to
|
||||
* possibly deliver another msg from the
|
||||
* hypervisor
|
||||
*/
|
||||
if (vmbus_is_confidential())
|
||||
hv_para_set_synic_register(HV_MSR_EOM, 0);
|
||||
else
|
||||
hv_set_msr(HV_MSR_EOM, 0);
|
||||
}
|
||||
}
|
||||
|
||||
extern int vmbus_interrupt;
|
||||
extern int vmbus_irq;
|
||||
#endif /* CONFIG_HYPERV_VMBUS */
|
||||
|
||||
struct hv_device *vmbus_device_create(const guid_t *type,
|
||||
const guid_t *instance,
|
||||
struct vmbus_channel *channel);
|
||||
|
|
|
|||
|
|
@ -14,6 +14,9 @@
|
|||
#include <asm/mshyperv.h>
|
||||
#include <linux/resume_user_mode.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/acpi.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/reboot.h>
|
||||
|
||||
#include "mshv.h"
|
||||
|
||||
|
|
@ -138,3 +141,99 @@ int hv_call_get_partition_property(u64 partition_id,
|
|||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(hv_call_get_partition_property);
|
||||
|
||||
/*
|
||||
* Corresponding sleep states have to be initialized in order for a subsequent
|
||||
* HVCALL_ENTER_SLEEP_STATE call to succeed. Currently only S5 state as per
|
||||
* ACPI 6.4 chapter 7.4.2 is relevant, though S1, S2 and S3 could also be supported.
|
||||
*
|
||||
* In order to pass proper PM values to mshv, ACPI must already be initialized
* and must support the S5 sleep state when this function is invoked.
|
||||
*/
|
||||
static int hv_initialize_sleep_states(void)
|
||||
{
|
||||
u64 status;
|
||||
unsigned long flags;
|
||||
struct hv_input_set_system_property *in;
|
||||
acpi_status acpi_status;
|
||||
u8 sleep_type_a, sleep_type_b;
|
||||
|
||||
if (!acpi_sleep_state_supported(ACPI_STATE_S5)) {
|
||||
pr_err("%s: S5 sleep state not supported.\n", __func__);
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
acpi_status = acpi_get_sleep_type_data(ACPI_STATE_S5, &sleep_type_a,
|
||||
&sleep_type_b);
|
||||
if (ACPI_FAILURE(acpi_status))
|
||||
return -ENODEV;
|
||||
|
||||
local_irq_save(flags);
|
||||
in = *this_cpu_ptr(hyperv_pcpu_input_arg);
|
||||
memset(in, 0, sizeof(*in));
|
||||
|
||||
in->property_id = HV_SYSTEM_PROPERTY_SLEEP_STATE;
|
||||
in->set_sleep_state_info.sleep_state = HV_SLEEP_STATE_S5;
|
||||
in->set_sleep_state_info.pm1a_slp_typ = sleep_type_a;
|
||||
in->set_sleep_state_info.pm1b_slp_typ = sleep_type_b;
|
||||
|
||||
status = hv_do_hypercall(HVCALL_SET_SYSTEM_PROPERTY, in, NULL);
|
||||
local_irq_restore(flags);
|
||||
|
||||
if (!hv_result_success(status)) {
|
||||
hv_status_err(status, "\n");
|
||||
return hv_result_to_errno(status);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* This notifier initializes sleep states in mshv hypervisor which will be
|
||||
* used during power off.
|
||||
*/
|
||||
static int hv_reboot_notifier_handler(struct notifier_block *this,
|
||||
unsigned long code, void *another)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
if (code == SYS_HALT || code == SYS_POWER_OFF)
|
||||
ret = hv_initialize_sleep_states();
|
||||
|
||||
return ret ? NOTIFY_DONE : NOTIFY_OK;
|
||||
}
|
||||
|
||||
static struct notifier_block hv_reboot_notifier = {
|
||||
.notifier_call = hv_reboot_notifier_handler,
|
||||
};
|
||||
|
||||
void hv_sleep_notifiers_register(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = register_reboot_notifier(&hv_reboot_notifier);
|
||||
if (ret)
|
||||
pr_err("%s: cannot register reboot notifier %d\n", __func__,
|
||||
ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* Power off the machine by entering S5 sleep state via Hyper-V hypercall.
|
||||
* This call does not return if successful.
|
||||
*/
|
||||
void hv_machine_power_off(void)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct hv_input_enter_sleep_state *in;
|
||||
|
||||
local_irq_save(flags);
|
||||
in = *this_cpu_ptr(hyperv_pcpu_input_arg);
|
||||
in->sleep_state = HV_SLEEP_STATE_S5;
|
||||
|
||||
(void)hv_do_hypercall(HVCALL_ENTER_SLEEP_STATE, in, NULL);
|
||||
local_irq_restore(flags);
|
||||
|
||||
/* should never reach here */
|
||||
BUG();
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -163,8 +163,10 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
|
|||
if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
#if IS_ENABLED(CONFIG_X86)
|
||||
if (irq->lapic_control.logical_dest_mode)
|
||||
return -EOPNOTSUPP;
|
||||
#endif
|
||||
|
||||
vp = partition->pt_vp_array[irq->lapic_apic_id];
|
||||
|
||||
|
|
@ -196,8 +198,10 @@ static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
|
|||
unsigned int seq;
|
||||
int idx;
|
||||
|
||||
#if IS_ENABLED(CONFIG_X86)
|
||||
WARN_ON(irqfd->irqfd_resampler &&
|
||||
!irq->lapic_control.level_triggered);
|
||||
#endif
|
||||
|
||||
idx = srcu_read_lock(&partition->pt_irq_srcu);
|
||||
if (irqfd->irqfd_girq_ent.guest_irq_num) {
|
||||
|
|
@ -469,6 +473,7 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
|
|||
init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc);
|
||||
|
||||
spin_lock_irq(&pt->pt_irqfds_lock);
|
||||
#if IS_ENABLED(CONFIG_X86)
|
||||
if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) &&
|
||||
!irqfd->irqfd_lapic_irq.lapic_control.level_triggered) {
|
||||
/*
|
||||
|
|
@ -479,6 +484,7 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
|
|||
ret = -EINVAL;
|
||||
goto fail;
|
||||
}
|
||||
#endif
|
||||
ret = 0;
|
||||
hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
|
||||
if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
|
||||
|
|
@ -592,7 +598,7 @@ static void mshv_irqfd_release(struct mshv_partition *pt)
|
|||
|
||||
int mshv_irqfd_wq_init(void)
|
||||
{
|
||||
irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", 0, 0);
|
||||
irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", WQ_PERCPU, 0);
|
||||
if (!irqfd_cleanup_wq)
|
||||
return -ENOMEM;
|
||||
|
||||
|
|
|
|||
|
|
@ -119,6 +119,10 @@ void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent,
|
|||
lirq->lapic_vector = ent->girq_irq_data & 0xFF;
|
||||
lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF;
|
||||
lirq->lapic_control.interrupt_type = (ent->girq_irq_data & 0x700) >> 8;
|
||||
#if IS_ENABLED(CONFIG_X86)
|
||||
lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1;
|
||||
lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1;
|
||||
#elif IS_ENABLED(CONFIG_ARM64)
|
||||
lirq->lapic_control.asserted = 1;
|
||||
#endif
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,555 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Copyright (c) 2025, Microsoft Corporation.
|
||||
*
|
||||
* Memory region management for mshv_root module.
|
||||
*
|
||||
* Authors: Microsoft Linux virtualization team
|
||||
*/
|
||||
|
||||
#include <linux/hmm.h>
|
||||
#include <linux/hyperv.h>
|
||||
#include <linux/kref.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/vmalloc.h>
|
||||
|
||||
#include <asm/mshyperv.h>
|
||||
|
||||
#include "mshv_root.h"
|
||||
|
||||
#define MSHV_MAP_FAULT_IN_PAGES PTRS_PER_PMD
|
||||
|
||||
/**
|
||||
* mshv_region_process_chunk - Processes a contiguous chunk of memory pages
|
||||
* in a region.
|
||||
* @region : Pointer to the memory region structure.
|
||||
* @flags : Flags to pass to the handler.
|
||||
* @page_offset: Offset into the region's pages array to start processing.
|
||||
* @page_count : Number of pages to process.
|
||||
* @handler : Callback function to handle the chunk.
|
||||
*
|
||||
* This function scans the region's pages starting from @page_offset,
|
||||
* checking for contiguous present pages of the same size (normal or huge).
|
||||
* It invokes @handler for the chunk of contiguous pages found. Returns the
|
||||
* number of pages handled, or a negative error code if the first page is
|
||||
* not present or the handler fails.
|
||||
*
|
||||
* Note: The @handler callback must be able to handle both normal and huge
|
||||
* pages.
|
||||
*
|
||||
* Return: Number of pages handled, or negative error code.
|
||||
*/
|
||||
static long mshv_region_process_chunk(struct mshv_mem_region *region,
|
||||
u32 flags,
|
||||
u64 page_offset, u64 page_count,
|
||||
int (*handler)(struct mshv_mem_region *region,
|
||||
u32 flags,
|
||||
u64 page_offset,
|
||||
u64 page_count))
|
||||
{
|
||||
u64 count, stride;
|
||||
unsigned int page_order;
|
||||
struct page *page;
|
||||
int ret;
|
||||
|
||||
page = region->pages[page_offset];
|
||||
if (!page)
|
||||
return -EINVAL;
|
||||
|
||||
page_order = folio_order(page_folio(page));
|
||||
/* The hypervisor only supports 4K and 2M page sizes */
|
||||
if (page_order && page_order != HPAGE_PMD_ORDER)
|
||||
return -EINVAL;
|
||||
|
||||
stride = 1 << page_order;
|
||||
|
||||
/* Start at stride since the first page is validated */
|
||||
for (count = stride; count < page_count; count += stride) {
|
||||
page = region->pages[page_offset + count];
|
||||
|
||||
/* Break if current page is not present */
|
||||
if (!page)
|
||||
break;
|
||||
|
||||
/* Break if page size changes */
|
||||
if (page_order != folio_order(page_folio(page)))
|
||||
break;
|
||||
}
|
||||
|
||||
ret = handler(region, flags, page_offset, count);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
|
||||
* mshv_region_process_range - Processes a range of memory pages in a
|
||||
* region.
|
||||
* @region : Pointer to the memory region structure.
|
||||
* @flags : Flags to pass to the handler.
|
||||
* @page_offset: Offset into the region's pages array to start processing.
|
||||
* @page_count : Number of pages to process.
|
||||
* @handler : Callback function to handle each chunk of contiguous
|
||||
* pages.
|
||||
*
|
||||
* Iterates over the specified range of pages in @region, skipping
|
||||
* non-present pages. For each contiguous chunk of present pages, invokes
|
||||
* @handler via mshv_region_process_chunk.
|
||||
*
|
||||
* Note: The @handler callback must be able to handle both normal and huge
|
||||
* pages.
|
||||
*
|
||||
* Returns 0 on success, or a negative error code on failure.
|
||||
*/
|
||||
static int mshv_region_process_range(struct mshv_mem_region *region,
|
||||
u32 flags,
|
||||
u64 page_offset, u64 page_count,
|
||||
int (*handler)(struct mshv_mem_region *region,
|
||||
u32 flags,
|
||||
u64 page_offset,
|
||||
u64 page_count))
|
||||
{
|
||||
long ret;
|
||||
|
||||
if (page_offset + page_count > region->nr_pages)
|
||||
return -EINVAL;
|
||||
|
||||
while (page_count) {
|
||||
/* Skip non-present pages */
|
||||
if (!region->pages[page_offset]) {
|
||||
page_offset++;
|
||||
page_count--;
|
||||
continue;
|
||||
}
|
||||
|
||||
ret = mshv_region_process_chunk(region, flags,
|
||||
page_offset,
|
||||
page_count,
|
||||
handler);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
page_offset += ret;
|
||||
page_count -= ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
|
||||
u64 uaddr, u32 flags)
|
||||
{
|
||||
struct mshv_mem_region *region;
|
||||
|
||||
region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages);
|
||||
if (!region)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
region->nr_pages = nr_pages;
|
||||
region->start_gfn = guest_pfn;
|
||||
region->start_uaddr = uaddr;
|
||||
region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE;
|
||||
if (flags & BIT(MSHV_SET_MEM_BIT_WRITABLE))
|
||||
region->hv_map_flags |= HV_MAP_GPA_WRITABLE;
|
||||
if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
|
||||
region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;
|
||||
|
||||
kref_init(&region->refcount);
|
||||
|
||||
return region;
|
||||
}
|
||||
|
||||
static int mshv_region_chunk_share(struct mshv_mem_region *region,
|
||||
u32 flags,
|
||||
u64 page_offset, u64 page_count)
|
||||
{
|
||||
struct page *page = region->pages[page_offset];
|
||||
|
||||
if (PageHuge(page) || PageTransCompound(page))
|
||||
flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
|
||||
|
||||
return hv_call_modify_spa_host_access(region->partition->pt_id,
|
||||
region->pages + page_offset,
|
||||
page_count,
|
||||
HV_MAP_GPA_READABLE |
|
||||
HV_MAP_GPA_WRITABLE,
|
||||
flags, true);
|
||||
}
|
||||
|
||||
int mshv_region_share(struct mshv_mem_region *region)
|
||||
{
|
||||
u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED;
|
||||
|
||||
return mshv_region_process_range(region, flags,
|
||||
0, region->nr_pages,
|
||||
mshv_region_chunk_share);
|
||||
}
|
||||
|
||||
static int mshv_region_chunk_unshare(struct mshv_mem_region *region,
|
||||
u32 flags,
|
||||
u64 page_offset, u64 page_count)
|
||||
{
|
||||
struct page *page = region->pages[page_offset];
|
||||
|
||||
if (PageHuge(page) || PageTransCompound(page))
|
||||
flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
|
||||
|
||||
return hv_call_modify_spa_host_access(region->partition->pt_id,
|
||||
region->pages + page_offset,
|
||||
page_count, 0,
|
||||
flags, false);
|
||||
}
|
||||
|
||||
int mshv_region_unshare(struct mshv_mem_region *region)
|
||||
{
|
||||
u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE;
|
||||
|
||||
return mshv_region_process_range(region, flags,
|
||||
0, region->nr_pages,
|
||||
mshv_region_chunk_unshare);
|
||||
}
|
||||
|
||||
static int mshv_region_chunk_remap(struct mshv_mem_region *region,
|
||||
u32 flags,
|
||||
u64 page_offset, u64 page_count)
|
||||
{
|
||||
struct page *page = region->pages[page_offset];
|
||||
|
||||
if (PageHuge(page) || PageTransCompound(page))
|
||||
flags |= HV_MAP_GPA_LARGE_PAGE;
|
||||
|
||||
return hv_call_map_gpa_pages(region->partition->pt_id,
|
||||
region->start_gfn + page_offset,
|
||||
page_count, flags,
|
||||
region->pages + page_offset);
|
||||
}
|
||||
|
||||
static int mshv_region_remap_pages(struct mshv_mem_region *region,
|
||||
u32 map_flags,
|
||||
u64 page_offset, u64 page_count)
|
||||
{
|
||||
return mshv_region_process_range(region, map_flags,
|
||||
page_offset, page_count,
|
||||
mshv_region_chunk_remap);
|
||||
}
|
||||
|
||||
int mshv_region_map(struct mshv_mem_region *region)
|
||||
{
|
||||
u32 map_flags = region->hv_map_flags;
|
||||
|
||||
return mshv_region_remap_pages(region, map_flags,
|
||||
0, region->nr_pages);
|
||||
}
|
||||
|
||||
static void mshv_region_invalidate_pages(struct mshv_mem_region *region,
|
||||
u64 page_offset, u64 page_count)
|
||||
{
|
||||
if (region->type == MSHV_REGION_TYPE_MEM_PINNED)
|
||||
unpin_user_pages(region->pages + page_offset, page_count);
|
||||
|
||||
memset(region->pages + page_offset, 0,
|
||||
page_count * sizeof(struct page *));
|
||||
}
|
||||
|
||||
void mshv_region_invalidate(struct mshv_mem_region *region)
|
||||
{
|
||||
mshv_region_invalidate_pages(region, 0, region->nr_pages);
|
||||
}
|
||||
|
||||
int mshv_region_pin(struct mshv_mem_region *region)
|
||||
{
|
||||
u64 done_count, nr_pages;
|
||||
struct page **pages;
|
||||
__u64 userspace_addr;
|
||||
int ret;
|
||||
|
||||
for (done_count = 0; done_count < region->nr_pages; done_count += ret) {
|
||||
pages = region->pages + done_count;
|
||||
userspace_addr = region->start_uaddr +
|
||||
done_count * HV_HYP_PAGE_SIZE;
|
||||
nr_pages = min(region->nr_pages - done_count,
|
||||
MSHV_PIN_PAGES_BATCH_SIZE);
|
||||
|
||||
/*
|
||||
* Pinning assuming 4k pages works for large pages too.
|
||||
* All page structs within the large page are returned.
|
||||
*
|
||||
* Pin requests are batched because pin_user_pages_fast
|
||||
* with the FOLL_LONGTERM flag does a large temporary
|
||||
* allocation of contiguous memory.
|
||||
*/
|
||||
ret = pin_user_pages_fast(userspace_addr, nr_pages,
|
||||
FOLL_WRITE | FOLL_LONGTERM,
|
||||
pages);
|
||||
if (ret < 0)
|
||||
goto release_pages;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
release_pages:
|
||||
mshv_region_invalidate_pages(region, 0, done_count);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int mshv_region_chunk_unmap(struct mshv_mem_region *region,
|
||||
u32 flags,
|
||||
u64 page_offset, u64 page_count)
|
||||
{
|
||||
struct page *page = region->pages[page_offset];
|
||||
|
||||
if (PageHuge(page) || PageTransCompound(page))
|
||||
flags |= HV_UNMAP_GPA_LARGE_PAGE;
|
||||
|
||||
return hv_call_unmap_gpa_pages(region->partition->pt_id,
|
||||
region->start_gfn + page_offset,
|
||||
page_count, flags);
|
||||
}
|
||||
|
||||
static int mshv_region_unmap(struct mshv_mem_region *region)
|
||||
{
|
||||
return mshv_region_process_range(region, 0,
|
||||
0, region->nr_pages,
|
||||
mshv_region_chunk_unmap);
|
||||
}
|
||||
|
||||
static void mshv_region_destroy(struct kref *ref)
|
||||
{
|
||||
struct mshv_mem_region *region =
|
||||
container_of(ref, struct mshv_mem_region, refcount);
|
||||
struct mshv_partition *partition = region->partition;
|
||||
int ret;
|
||||
|
||||
if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE)
|
||||
mshv_region_movable_fini(region);
|
||||
|
||||
if (mshv_partition_encrypted(partition)) {
|
||||
ret = mshv_region_share(region);
|
||||
if (ret) {
|
||||
pt_err(partition,
|
||||
"Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n",
|
||||
ret);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
mshv_region_unmap(region);
|
||||
|
||||
mshv_region_invalidate(region);
|
||||
|
||||
vfree(region);
|
||||
}
|
||||
|
||||
void mshv_region_put(struct mshv_mem_region *region)
|
||||
{
|
||||
kref_put(&region->refcount, mshv_region_destroy);
|
||||
}
|
||||
|
||||
int mshv_region_get(struct mshv_mem_region *region)
|
||||
{
|
||||
return kref_get_unless_zero(&region->refcount);
|
||||
}
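To tie the helpers in this new file together, here is a rough sketch of the lifecycle a caller (for example the set-memory ioctl path in mshv_root_main.c, which is not part of this hunk) is expected to follow for a pinned region; the field assignments and error handling are illustrative and may differ from the actual caller:

/* Illustrative only: create a region, pin its user pages, map it into the
 * guest, and drop the reference on failure so mshv_region_destroy() runs. */
static int example_add_pinned_region(struct mshv_partition *partition,
				     u64 guest_pfn, u64 nr_pages,
				     u64 uaddr, u32 flags)
{
	struct mshv_mem_region *region;
	int ret;

	region = mshv_region_create(guest_pfn, nr_pages, uaddr, flags);
	if (IS_ERR(region))
		return PTR_ERR(region);

	region->partition = partition;
	region->type = MSHV_REGION_TYPE_MEM_PINNED;

	ret = mshv_region_pin(region);
	if (!ret)
		ret = mshv_region_map(region);
	if (ret)
		mshv_region_put(region);

	return ret;
}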
|
||||
|
||||
/**
|
||||
* mshv_region_hmm_fault_and_lock - Handle HMM faults and lock the memory region
|
||||
* @region: Pointer to the memory region structure
|
||||
* @range: Pointer to the HMM range structure
|
||||
*
|
||||
* This function performs the following steps:
|
||||
* 1. Reads the notifier sequence for the HMM range.
|
||||
* 2. Acquires a read lock on the memory map.
|
||||
* 3. Handles HMM faults for the specified range.
|
||||
* 4. Releases the read lock on the memory map.
|
||||
* 5. If successful, locks the memory region mutex.
|
||||
* 6. Verifies if the notifier sequence has changed during the operation.
|
||||
* If it has, releases the mutex and returns -EBUSY to match with
|
||||
* hmm_range_fault() return code for repeating.
|
||||
*
|
||||
* Return: 0 on success, a negative error code otherwise.
|
||||
*/
|
||||
static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
|
||||
struct hmm_range *range)
|
||||
{
|
||||
int ret;
|
||||
|
||||
range->notifier_seq = mmu_interval_read_begin(range->notifier);
|
||||
mmap_read_lock(region->mni.mm);
|
||||
ret = hmm_range_fault(range);
|
||||
mmap_read_unlock(region->mni.mm);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
mutex_lock(®ion->mutex);
|
||||
|
||||
if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) {
|
||||
mutex_unlock(®ion->mutex);
|
||||
cond_resched();
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* mshv_region_range_fault - Handle memory range faults for a given region.
|
||||
* @region: Pointer to the memory region structure.
|
||||
* @page_offset: Offset of the page within the region.
|
||||
* @page_count: Number of pages to handle.
|
||||
*
|
||||
* This function resolves memory faults for a specified range of pages
|
||||
* within a memory region. It uses HMM (Heterogeneous Memory Management)
|
||||
* to fault in the required pages and updates the region's page array.
|
||||
*
|
||||
* Return: 0 on success, negative error code on failure.
|
||||
*/
|
||||
static int mshv_region_range_fault(struct mshv_mem_region *region,
|
||||
u64 page_offset, u64 page_count)
|
||||
{
|
||||
struct hmm_range range = {
|
||||
.notifier = ®ion->mni,
|
||||
.default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
|
||||
};
|
||||
unsigned long *pfns;
|
||||
int ret;
|
||||
u64 i;
|
||||
|
||||
pfns = kmalloc_array(page_count, sizeof(*pfns), GFP_KERNEL);
|
||||
if (!pfns)
|
||||
return -ENOMEM;
|
||||
|
||||
range.hmm_pfns = pfns;
|
||||
range.start = region->start_uaddr + page_offset * HV_HYP_PAGE_SIZE;
|
||||
range.end = range.start + page_count * HV_HYP_PAGE_SIZE;
|
||||
|
||||
do {
|
||||
ret = mshv_region_hmm_fault_and_lock(region, &range);
|
||||
} while (ret == -EBUSY);
|
||||
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
for (i = 0; i < page_count; i++)
|
||||
region->pages[page_offset + i] = hmm_pfn_to_page(pfns[i]);
|
||||
|
||||
ret = mshv_region_remap_pages(region, region->hv_map_flags,
|
||||
page_offset, page_count);
|
||||
|
||||
mutex_unlock(®ion->mutex);
|
||||
out:
|
||||
kfree(pfns);
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn)
|
||||
{
|
||||
u64 page_offset, page_count;
|
||||
int ret;
|
||||
|
||||
/* Align the page offset to the nearest MSHV_MAP_FAULT_IN_PAGES. */
|
||||
page_offset = ALIGN_DOWN(gfn - region->start_gfn,
|
||||
MSHV_MAP_FAULT_IN_PAGES);
|
||||
|
||||
/* Map more pages than requested to reduce the number of faults. */
|
||||
page_count = min(region->nr_pages - page_offset,
|
||||
MSHV_MAP_FAULT_IN_PAGES);
|
||||
|
||||
ret = mshv_region_range_fault(region, page_offset, page_count);
|
||||
|
||||
WARN_ONCE(ret,
|
||||
"p%llu: GPA intercept failed: region %#llx-%#llx, gfn %#llx, page_offset %llu, page_count %llu\n",
|
||||
region->partition->pt_id, region->start_uaddr,
|
||||
region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT),
|
||||
gfn, page_offset, page_count);
|
||||
|
||||
return !ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* mshv_region_interval_invalidate - Invalidate a range of memory region
|
||||
* @mni: Pointer to the mmu_interval_notifier structure
|
||||
* @range: Pointer to the mmu_notifier_range structure
|
||||
* @cur_seq: Current sequence number for the interval notifier
|
||||
*
|
||||
* This function invalidates a memory region by remapping its pages with
|
||||
* no access permissions. It locks the region's mutex to ensure thread safety
|
||||
* and updates the sequence number for the interval notifier. If the range
|
||||
* is blockable, it uses a blocking lock; otherwise, it attempts a non-blocking
|
||||
* lock and returns false if unsuccessful.
|
||||
*
|
||||
* NOTE: Failure to invalidate a region is a serious error, as the pages will
|
||||
* be considered freed while they are still mapped by the hypervisor.
|
||||
* Any attempt to access such pages will likely crash the system.
|
||||
*
|
||||
* Return: true if the region was successfully invalidated, false otherwise.
|
||||
*/
|
||||
static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni,
|
||||
const struct mmu_notifier_range *range,
|
||||
unsigned long cur_seq)
|
||||
{
|
||||
struct mshv_mem_region *region = container_of(mni,
|
||||
struct mshv_mem_region,
|
||||
mni);
|
||||
u64 page_offset, page_count;
|
||||
unsigned long mstart, mend;
|
||||
int ret = -EPERM;
|
||||
|
||||
if (mmu_notifier_range_blockable(range))
|
||||
mutex_lock(®ion->mutex);
|
||||
else if (!mutex_trylock(®ion->mutex))
|
||||
goto out_fail;
|
||||
|
||||
mmu_interval_set_seq(mni, cur_seq);
|
||||
|
||||
mstart = max(range->start, region->start_uaddr);
|
||||
mend = min(range->end, region->start_uaddr +
|
||||
(region->nr_pages << HV_HYP_PAGE_SHIFT));
|
||||
|
||||
page_offset = HVPFN_DOWN(mstart - region->start_uaddr);
|
||||
page_count = HVPFN_DOWN(mend - mstart);
|
||||
|
||||
ret = mshv_region_remap_pages(region, HV_MAP_GPA_NO_ACCESS,
|
||||
page_offset, page_count);
|
||||
if (ret)
|
||||
goto out_fail;
|
||||
|
||||
mshv_region_invalidate_pages(region, page_offset, page_count);
|
||||
|
||||
mutex_unlock(®ion->mutex);
|
||||
|
||||
return true;
|
||||
|
||||
out_fail:
|
||||
WARN_ONCE(ret,
|
||||
"Failed to invalidate region %#llx-%#llx (range %#lx-%#lx, event: %u, pages %#llx-%#llx, mm: %#llx): %d\n",
|
||||
region->start_uaddr,
|
||||
region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT),
|
||||
range->start, range->end, range->event,
|
||||
page_offset, page_offset + page_count - 1, (u64)range->mm, ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
static const struct mmu_interval_notifier_ops mshv_region_mni_ops = {
|
||||
.invalidate = mshv_region_interval_invalidate,
|
||||
};
|
||||
|
||||
void mshv_region_movable_fini(struct mshv_mem_region *region)
|
||||
{
|
||||
mmu_interval_notifier_remove(®ion->mni);
|
||||
}
|
||||
|
||||
bool mshv_region_movable_init(struct mshv_mem_region *region)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = mmu_interval_notifier_insert(®ion->mni, current->mm,
|
||||
region->start_uaddr,
|
||||
region->nr_pages << HV_HYP_PAGE_SHIFT,
|
||||
&mshv_region_mni_ops);
|
||||
if (ret)
|
||||
return false;
|
||||
|
||||
mutex_init(®ion->mutex);
|
||||
|
||||
return true;
|
||||
}
|
@ -15,6 +15,7 @@
|
|||
#include <linux/hashtable.h>
|
||||
#include <linux/dev_printk.h>
|
||||
#include <linux/build_bug.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include <uapi/linux/mshv.h>
|
||||
|
||||
/*
|
||||
|
|
@ -70,18 +71,23 @@ do { \
|
|||
#define vp_info(v, fmt, ...) vp_devprintk(info, v, fmt, ##__VA_ARGS__)
|
||||
#define vp_dbg(v, fmt, ...) vp_devprintk(dbg, v, fmt, ##__VA_ARGS__)
|
||||
|
||||
enum mshv_region_type {
|
||||
MSHV_REGION_TYPE_MEM_PINNED,
|
||||
MSHV_REGION_TYPE_MEM_MOVABLE,
|
||||
MSHV_REGION_TYPE_MMIO
|
||||
};
|
||||
|
||||
struct mshv_mem_region {
|
||||
struct hlist_node hnode;
|
||||
struct kref refcount;
|
||||
u64 nr_pages;
|
||||
u64 start_gfn;
|
||||
u64 start_uaddr;
|
||||
u32 hv_map_flags;
|
||||
struct {
|
||||
u64 large_pages: 1; /* 2MiB */
|
||||
u64 range_pinned: 1;
|
||||
u64 reserved: 62;
|
||||
} flags;
|
||||
struct mshv_partition *partition;
|
||||
enum mshv_region_type type;
|
||||
struct mmu_interval_notifier mni;
|
||||
struct mutex mutex; /* protects region pages remapping */
|
||||
struct page *pages[];
|
||||
};
|
||||
|
||||
|
|
@ -98,6 +104,8 @@ struct mshv_partition {
|
|||
u64 pt_id;
|
||||
refcount_t pt_ref_count;
|
||||
struct mutex pt_mutex;
|
||||
|
||||
spinlock_t pt_mem_regions_lock;
|
||||
struct hlist_head pt_mem_regions; // not ordered
|
||||
|
||||
u32 pt_vp_count;
|
||||
|
|
@ -169,7 +177,7 @@ struct mshv_girq_routing_table {
|
|||
};
|
||||
|
||||
struct hv_synic_pages {
|
||||
struct hv_message_page *synic_message_page;
|
||||
struct hv_message_page *hyp_synic_message_page;
|
||||
struct hv_synic_event_flags_page *synic_event_flags_page;
|
||||
struct hv_synic_event_ring_page *synic_event_ring_page;
|
||||
};
|
||||
|
|
@ -178,6 +186,7 @@ struct mshv_root {
|
|||
struct hv_synic_pages __percpu *synic_pages;
|
||||
spinlock_t pt_ht_lock;
|
||||
DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS);
|
||||
struct hv_partition_property_vmm_capabilities vmm_caps;
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
@ -278,11 +287,12 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
|
|||
/* Choose between pages and bytes */
|
||||
struct hv_vp_state_data state_data, u64 page_count,
|
||||
struct page **pages, u32 num_bytes, u8 *bytes);
|
||||
int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
|
||||
union hv_input_vtl input_vtl,
|
||||
struct page **state_page);
|
||||
int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
|
||||
union hv_input_vtl input_vtl);
|
||||
int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
|
||||
union hv_input_vtl input_vtl,
|
||||
struct page **state_page);
|
||||
int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
|
||||
struct page *state_page,
|
||||
union hv_input_vtl input_vtl);
|
||||
int hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
|
||||
u64 connection_partition_id, struct hv_port_info *port_info,
|
||||
u8 port_vtl, u8 min_connection_vtl, int node);
|
||||
|
|
@ -295,17 +305,32 @@ int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
|
|||
int hv_call_disconnect_port(u64 connection_partition_id,
|
||||
union hv_connection_id connection_id);
|
||||
int hv_call_notify_port_ring_empty(u32 sint_index);
|
||||
int hv_call_map_stat_page(enum hv_stats_object_type type,
|
||||
const union hv_stats_object_identity *identity,
|
||||
void **addr);
|
||||
int hv_call_unmap_stat_page(enum hv_stats_object_type type,
|
||||
const union hv_stats_object_identity *identity);
|
||||
int hv_map_stats_page(enum hv_stats_object_type type,
|
||||
const union hv_stats_object_identity *identity,
|
||||
void **addr);
|
||||
int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr,
|
||||
const union hv_stats_object_identity *identity);
|
||||
int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
|
||||
u64 page_struct_count, u32 host_access,
|
||||
u32 flags, u8 acquire);
|
||||
int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 arg,
|
||||
void *property_value, size_t property_value_sz);
|
||||
|
||||
extern struct mshv_root mshv_root;
|
||||
extern enum hv_scheduler_type hv_scheduler_type;
|
||||
extern u8 * __percpu *hv_synic_eventring_tail;
|
||||
|
||||
struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
|
||||
u64 uaddr, u32 flags);
|
||||
int mshv_region_share(struct mshv_mem_region *region);
|
||||
int mshv_region_unshare(struct mshv_mem_region *region);
|
||||
int mshv_region_map(struct mshv_mem_region *region);
|
||||
void mshv_region_invalidate(struct mshv_mem_region *region);
|
||||
int mshv_region_pin(struct mshv_mem_region *region);
|
||||
void mshv_region_put(struct mshv_mem_region *region);
|
||||
int mshv_region_get(struct mshv_mem_region *region);
|
||||
bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn);
|
||||
void mshv_region_movable_fini(struct mshv_mem_region *region);
|
||||
bool mshv_region_movable_init(struct mshv_mem_region *region);
|
||||
|
||||
#endif /* _MSHV_ROOT_H_ */
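The declarations above form the region lifecycle API consumed by the ioctl paths. A rough, hypothetical ordering sketch follows (not taken from this patch: the error code is a placeholder, and whether the hypervisor mapping is established eagerly via mshv_region_map() or lazily through mshv_region_handle_gfn_fault() is a choice of the calling path, not shown here):

/* Hypothetical caller sketch; 'region' came from mshv_region_create(). */
static int example_attach_movable_region(struct mshv_mem_region *region)
{
	int ret;

	/* Register the mmu_interval_notifier before exposing the region. */
	if (!mshv_region_movable_init(region))
		return -EINVAL;	/* placeholder errno */

	ret = mshv_region_map(region);
	if (ret)
		mshv_region_put(region);	/* destroy unmaps and invalidates */

	return ret;
}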
|
@ -388,7 +388,13 @@ int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector,
|
|||
memset(input, 0, sizeof(*input));
|
||||
input->partition_id = partition_id;
|
||||
input->vector = vector;
|
||||
/*
|
||||
* NOTE: dest_addr only needs to be provided while asserting an
|
||||
* interrupt on x86 platform
|
||||
*/
|
||||
#if IS_ENABLED(CONFIG_X86)
|
||||
input->dest_addr = dest_addr;
|
||||
#endif
|
||||
input->control = control;
|
||||
status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL);
|
||||
local_irq_restore(flags);
|
||||
|
|
@ -526,9 +532,9 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
|
|||
return ret;
|
||||
}
|
||||
|
||||
int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
|
||||
union hv_input_vtl input_vtl,
|
||||
struct page **state_page)
|
||||
static int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
|
||||
union hv_input_vtl input_vtl,
|
||||
struct page **state_page)
|
||||
{
|
||||
struct hv_input_map_vp_state_page *input;
|
||||
struct hv_output_map_vp_state_page *output;
|
||||
|
|
@ -542,12 +548,20 @@ int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
|
|||
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
|
||||
output = *this_cpu_ptr(hyperv_pcpu_output_arg);
|
||||
|
||||
memset(input, 0, sizeof(*input));
|
||||
input->partition_id = partition_id;
|
||||
input->vp_index = vp_index;
|
||||
input->type = type;
|
||||
input->input_vtl = input_vtl;
|
||||
|
||||
status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, output);
|
||||
if (*state_page) {
|
||||
input->flags.map_location_provided = 1;
|
||||
input->requested_map_location =
|
||||
page_to_pfn(*state_page);
|
||||
}
|
||||
|
||||
status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input,
|
||||
output);
|
||||
|
||||
if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
|
||||
if (hv_result_success(status))
|
||||
|
|
@ -565,8 +579,41 @@ int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
|
|||
return ret;
|
||||
}
|
||||
|
||||
int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
|
||||
union hv_input_vtl input_vtl)
|
||||
static bool mshv_use_overlay_gpfn(void)
|
||||
{
|
||||
return hv_l1vh_partition() &&
|
||||
mshv_root.vmm_caps.vmm_can_provide_overlay_gpfn;
|
||||
}
|
||||
|
||||
int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
|
||||
union hv_input_vtl input_vtl,
|
||||
struct page **state_page)
|
||||
{
|
||||
int ret = 0;
|
||||
struct page *allocated_page = NULL;
|
||||
|
||||
if (mshv_use_overlay_gpfn()) {
|
||||
allocated_page = alloc_page(GFP_KERNEL);
|
||||
if (!allocated_page)
|
||||
return -ENOMEM;
|
||||
*state_page = allocated_page;
|
||||
} else {
|
||||
*state_page = NULL;
|
||||
}
|
||||
|
||||
ret = hv_call_map_vp_state_page(partition_id, vp_index, type, input_vtl,
|
||||
state_page);
|
||||
|
||||
if (ret && allocated_page) {
|
||||
__free_page(allocated_page);
|
||||
*state_page = NULL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
|
||||
union hv_input_vtl input_vtl)
|
||||
{
|
||||
unsigned long flags;
|
||||
u64 status;
|
||||
|
|
@ -590,6 +637,48 @@ int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
|
|||
return hv_result_to_errno(status);
|
||||
}
|
||||
|
||||
int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
|
||||
struct page *state_page, union hv_input_vtl input_vtl)
|
||||
{
|
||||
int ret = hv_call_unmap_vp_state_page(partition_id, vp_index, type, input_vtl);
|
||||
|
||||
if (mshv_use_overlay_gpfn() && state_page)
|
||||
__free_page(state_page);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code,
|
||||
u64 arg, void *property_value,
|
||||
size_t property_value_sz)
|
||||
{
|
||||
u64 status;
|
||||
unsigned long flags;
|
||||
struct hv_input_get_partition_property_ex *input;
|
||||
struct hv_output_get_partition_property_ex *output;
|
||||
|
||||
local_irq_save(flags);
|
||||
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
|
||||
output = *this_cpu_ptr(hyperv_pcpu_output_arg);
|
||||
|
||||
memset(input, 0, sizeof(*input));
|
||||
input->partition_id = partition_id;
|
||||
input->property_code = property_code;
|
||||
input->arg = arg;
|
||||
status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY_EX, input, output);
|
||||
|
||||
if (!hv_result_success(status)) {
|
||||
local_irq_restore(flags);
|
||||
hv_status_debug(status, "\n");
|
||||
return hv_result_to_errno(status);
|
||||
}
|
||||
memcpy(property_value, &output->property_value, property_value_sz);
|
||||
|
||||
local_irq_restore(flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
hv_call_clear_virtual_interrupt(u64 partition_id)
|
||||
{
|
||||
|
|
@ -724,9 +813,51 @@ hv_call_notify_port_ring_empty(u32 sint_index)
|
|||
return hv_result_to_errno(status);
|
||||
}
|
||||
|
||||
int hv_call_map_stat_page(enum hv_stats_object_type type,
|
||||
const union hv_stats_object_identity *identity,
|
||||
void **addr)
|
||||
static int hv_call_map_stats_page2(enum hv_stats_object_type type,
|
||||
const union hv_stats_object_identity *identity,
|
||||
u64 map_location)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct hv_input_map_stats_page2 *input;
|
||||
u64 status;
|
||||
int ret;
|
||||
|
||||
if (!map_location || !mshv_use_overlay_gpfn())
|
||||
return -EINVAL;
|
||||
|
||||
do {
|
||||
local_irq_save(flags);
|
||||
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
|
||||
|
||||
memset(input, 0, sizeof(*input));
|
||||
input->type = type;
|
||||
input->identity = *identity;
|
||||
input->map_location = map_location;
|
||||
|
||||
status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE2, input, NULL);
|
||||
|
||||
local_irq_restore(flags);
|
||||
|
||||
ret = hv_result_to_errno(status);
|
||||
|
||||
if (!ret)
|
||||
break;
|
||||
|
||||
if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
|
||||
hv_status_debug(status, "\n");
|
||||
break;
|
||||
}
|
||||
|
||||
ret = hv_call_deposit_pages(NUMA_NO_NODE,
|
||||
hv_current_partition_id, 1);
|
||||
} while (!ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int hv_call_map_stats_page(enum hv_stats_object_type type,
|
||||
const union hv_stats_object_identity *identity,
|
||||
void **addr)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct hv_input_map_stats_page *input;
|
||||
|
|
@ -765,8 +896,38 @@ int hv_call_map_stat_page(enum hv_stats_object_type type,
|
|||
return ret;
|
||||
}
|
||||
|
||||
int hv_call_unmap_stat_page(enum hv_stats_object_type type,
|
||||
const union hv_stats_object_identity *identity)
|
||||
int hv_map_stats_page(enum hv_stats_object_type type,
|
||||
const union hv_stats_object_identity *identity,
|
||||
void **addr)
|
||||
{
|
||||
int ret;
|
||||
struct page *allocated_page = NULL;
|
||||
|
||||
if (!addr)
|
||||
return -EINVAL;
|
||||
|
||||
if (mshv_use_overlay_gpfn()) {
|
||||
allocated_page = alloc_page(GFP_KERNEL);
|
||||
if (!allocated_page)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = hv_call_map_stats_page2(type, identity,
|
||||
page_to_pfn(allocated_page));
|
||||
*addr = page_address(allocated_page);
|
||||
} else {
|
||||
ret = hv_call_map_stats_page(type, identity, addr);
|
||||
}
|
||||
|
||||
if (ret && allocated_page) {
|
||||
__free_page(allocated_page);
|
||||
*addr = NULL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int hv_call_unmap_stats_page(enum hv_stats_object_type type,
|
||||
const union hv_stats_object_identity *identity)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct hv_input_unmap_stats_page *input;
|
||||
|
|
@ -785,6 +946,19 @@ int hv_call_unmap_stat_page(enum hv_stats_object_type type,
|
|||
return hv_result_to_errno(status);
|
||||
}
|
||||
|
||||
int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr,
|
||||
const union hv_stats_object_identity *identity)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = hv_call_unmap_stats_page(type, identity);
|
||||
|
||||
if (mshv_use_overlay_gpfn() && page_addr)
|
||||
__free_page(virt_to_page(page_addr));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
|
||||
u64 page_struct_count, u32 host_access,
|
||||
u32 flags, u8 acquire)
|
||||
|
|
|
|||
File diff suppressed because it is too large
|
|
@ -394,7 +394,7 @@ unlock_out:
void mshv_isr(void)
{
	struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
	struct hv_message_page **msg_page = &spages->synic_message_page;
	struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
	struct hv_message *msg;
	bool handled;

@ -456,7 +456,7 @@ int mshv_synic_init(unsigned int cpu)
#endif
	union hv_synic_scontrol sctrl;
	struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
	struct hv_message_page **msg_page = &spages->synic_message_page;
	struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
	struct hv_synic_event_flags_page **event_flags_page =
		&spages->synic_event_flags_page;
	struct hv_synic_event_ring_page **event_ring_page =

@ -550,7 +550,7 @@ int mshv_synic_cleanup(unsigned int cpu)
	union hv_synic_sirbp sirbp;
	union hv_synic_scontrol sctrl;
	struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
	struct hv_message_page **msg_page = &spages->synic_message_page;
	struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
	struct hv_synic_event_flags_page **event_flags_page =
		&spages->synic_event_flags_page;
	struct hv_synic_event_ring_page **event_ring_page =
|
@ -0,0 +1,25 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _MSHV_VTL_H
#define _MSHV_VTL_H

#include <linux/mshv.h>
#include <linux/types.h>

struct mshv_vtl_run {
	u32 cancel;
	u32 vtl_ret_action_size;
	u32 pad[2];
	char exit_message[MSHV_MAX_RUN_MSG_SIZE];
	union {
		struct mshv_vtl_cpu_context cpu_context;

		/*
		 * Reserving room for the cpu context to grow and to maintain compatibility
		 * with user mode.
		 */
		char reserved[1024];
	};
	char vtl_ret_actions[MSHV_MAX_RUN_MSG_SIZE];
};

#endif /* _MSHV_VTL_H */
|
||||
File diff suppressed because it is too large
|
|
@ -184,7 +184,8 @@ void hv_ringbuffer_pre_init(struct vmbus_channel *channel)

/* Initialize the ring buffer. */
int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
		       struct page *pages, u32 page_cnt, u32 max_pkt_size)
		       struct page *pages, u32 page_cnt, u32 max_pkt_size,
		       bool confidential)
{
	struct page **pages_wraparound;
	int i;

@ -208,7 +209,7 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,

	ring_info->ring_buffer = (struct hv_ring_buffer *)
		vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP,
		     pgprot_decrypted(PAGE_KERNEL));
		     confidential ? PAGE_KERNEL : pgprot_decrypted(PAGE_KERNEL));

	kfree(pages_wraparound);
	if (!ring_info->ring_buffer)
|
@ -36,6 +36,7 @@
|
|||
#include <linux/syscore_ops.h>
|
||||
#include <linux/dma-map-ops.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/export.h>
|
||||
#include <clocksource/hyperv_timer.h>
|
||||
#include <asm/mshyperv.h>
|
||||
#include "hyperv_vmbus.h"
|
||||
|
|
@ -56,6 +57,18 @@ static long __percpu *vmbus_evt;
|
|||
int vmbus_irq;
|
||||
int vmbus_interrupt;
|
||||
|
||||
/*
|
||||
* If the Confidential VMBus is used, the data on the "wire" is not
|
||||
* visible to either the host or the hypervisor.
|
||||
*/
|
||||
static bool is_confidential;
|
||||
|
||||
bool vmbus_is_confidential(void)
|
||||
{
|
||||
return is_confidential;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(vmbus_is_confidential);
|
||||
|
||||
/*
|
||||
* The panic notifier below is responsible solely for unloading the
|
||||
* vmbus connection, which is necessary in a panic event.
|
||||
|
|
@ -1045,12 +1058,9 @@ static void vmbus_onmessage_work(struct work_struct *work)
|
|||
kfree(ctx);
|
||||
}
|
||||
|
||||
void vmbus_on_msg_dpc(unsigned long data)
|
||||
static void __vmbus_on_msg_dpc(void *message_page_addr)
|
||||
{
|
||||
struct hv_per_cpu_context *hv_cpu = (void *)data;
|
||||
void *page_addr = hv_cpu->synic_message_page;
|
||||
struct hv_message msg_copy, *msg = (struct hv_message *)page_addr +
|
||||
VMBUS_MESSAGE_SINT;
|
||||
struct hv_message msg_copy, *msg;
|
||||
struct vmbus_channel_message_header *hdr;
|
||||
enum vmbus_channel_message_type msgtype;
|
||||
const struct vmbus_channel_message_table_entry *entry;
|
||||
|
|
@ -1058,6 +1068,10 @@ void vmbus_on_msg_dpc(unsigned long data)
|
|||
__u8 payload_size;
|
||||
u32 message_type;
|
||||
|
||||
if (!message_page_addr)
|
||||
return;
|
||||
msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;
|
||||
|
||||
/*
|
||||
* 'enum vmbus_channel_message_type' is supposed to always be 'u32' as
|
||||
* it is being used in 'struct vmbus_channel_message_header' definition
|
||||
|
|
@ -1183,6 +1197,14 @@ msg_handled:
|
|||
vmbus_signal_eom(msg, message_type);
|
||||
}
|
||||
|
||||
void vmbus_on_msg_dpc(unsigned long data)
|
||||
{
|
||||
struct hv_per_cpu_context *hv_cpu = (void *)data;
|
||||
|
||||
__vmbus_on_msg_dpc(hv_cpu->hyp_synic_message_page);
|
||||
__vmbus_on_msg_dpc(hv_cpu->para_synic_message_page);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PM_SLEEP
|
||||
/*
|
||||
* Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for
|
||||
|
|
@ -1221,21 +1243,19 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel)
|
|||
#endif /* CONFIG_PM_SLEEP */
|
||||
|
||||
/*
|
||||
* Schedule all channels with events pending
|
||||
* Schedule all channels with events pending.
|
||||
* The event page can be directly checked to get the id of
|
||||
* the channel that has the interrupt pending.
|
||||
*/
|
||||
static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
|
||||
static void vmbus_chan_sched(void *event_page_addr)
|
||||
{
|
||||
unsigned long *recv_int_page;
|
||||
u32 maxbits, relid;
|
||||
union hv_synic_event_flags *event;
|
||||
|
||||
/*
|
||||
* The event page can be directly checked to get the id of
|
||||
* the channel that has the interrupt pending.
|
||||
*/
|
||||
void *page_addr = hv_cpu->synic_event_page;
|
||||
union hv_synic_event_flags *event
|
||||
= (union hv_synic_event_flags *)page_addr +
|
||||
VMBUS_MESSAGE_SINT;
|
||||
if (!event_page_addr)
|
||||
return;
|
||||
event = (union hv_synic_event_flags *)event_page_addr + VMBUS_MESSAGE_SINT;
|
||||
|
||||
maxbits = HV_EVENT_FLAGS_COUNT;
|
||||
recv_int_page = event->flags;
|
||||
|
|
@ -1243,6 +1263,11 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
|
|||
if (unlikely(!recv_int_page))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Suggested-by: Michael Kelley <mhklinux@outlook.com>
|
||||
* One possible optimization would be to keep track of the largest relID that's in use,
|
||||
* and only scan up to that relID.
|
||||
*/
|
||||
for_each_set_bit(relid, recv_int_page, maxbits) {
|
||||
void (*callback_fn)(void *context);
|
||||
struct vmbus_channel *channel;
|
||||
|
|
@ -1306,29 +1331,39 @@ sched_unlock_rcu:
|
|||
}
|
||||
}
|
||||
|
||||
static void vmbus_isr(void)
|
||||
static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message_page_addr)
|
||||
{
|
||||
struct hv_per_cpu_context *hv_cpu
|
||||
= this_cpu_ptr(hv_context.cpu_context);
|
||||
void *page_addr;
|
||||
struct hv_message *msg;
|
||||
|
||||
vmbus_chan_sched(hv_cpu);
|
||||
|
||||
page_addr = hv_cpu->synic_message_page;
|
||||
msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
|
||||
if (!message_page_addr)
|
||||
return;
|
||||
msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;
|
||||
|
||||
/* Check if there are actual msgs to be processed */
|
||||
if (msg->header.message_type != HVMSG_NONE) {
|
||||
if (msg->header.message_type == HVMSG_TIMER_EXPIRED) {
|
||||
hv_stimer0_isr();
|
||||
vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED);
|
||||
} else
|
||||
} else {
|
||||
tasklet_schedule(&hv_cpu->msg_dpc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vmbus_isr(void)
|
||||
{
|
||||
struct hv_per_cpu_context *hv_cpu
|
||||
= this_cpu_ptr(hv_context.cpu_context);
|
||||
|
||||
vmbus_chan_sched(hv_cpu->hyp_synic_event_page);
|
||||
vmbus_chan_sched(hv_cpu->para_synic_event_page);
|
||||
|
||||
vmbus_message_sched(hv_cpu, hv_cpu->hyp_synic_message_page);
|
||||
vmbus_message_sched(hv_cpu, hv_cpu->para_synic_message_page);
|
||||
|
||||
add_interrupt_randomness(vmbus_interrupt);
|
||||
}
|
||||
EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl");
|
||||
|
||||
static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
|
||||
{
|
||||
|
|
@ -1343,54 +1378,14 @@ static void vmbus_percpu_work(struct work_struct *work)
|
|||
hv_synic_init(cpu);
|
||||
}
|
||||
|
||||
/*
|
||||
* vmbus_bus_init -Main vmbus driver initialization routine.
|
||||
*
|
||||
* Here, we
|
||||
* - initialize the vmbus driver context
|
||||
* - invoke the vmbus hv main init routine
|
||||
* - retrieve the channel offers
|
||||
*/
|
||||
static int vmbus_bus_init(void)
|
||||
static int vmbus_alloc_synic_and_connect(void)
|
||||
{
|
||||
int ret, cpu;
|
||||
struct work_struct __percpu *works;
|
||||
|
||||
ret = hv_init();
|
||||
if (ret != 0) {
|
||||
pr_err("Unable to initialize the hypervisor - 0x%x\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = bus_register(&hv_bus);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* VMbus interrupts are best modeled as per-cpu interrupts. If
|
||||
* on an architecture with support for per-cpu IRQs (e.g. ARM64),
|
||||
* allocate a per-cpu IRQ using standard Linux kernel functionality.
|
||||
* If not on such an architecture (e.g., x86/x64), then rely on
|
||||
* code in the arch-specific portion of the code tree to connect
|
||||
* the VMbus interrupt handler.
|
||||
*/
|
||||
|
||||
if (vmbus_irq == -1) {
|
||||
hv_setup_vmbus_handler(vmbus_isr);
|
||||
} else {
|
||||
vmbus_evt = alloc_percpu(long);
|
||||
ret = request_percpu_irq(vmbus_irq, vmbus_percpu_isr,
|
||||
"Hyper-V VMbus", vmbus_evt);
|
||||
if (ret) {
|
||||
pr_err("Can't request Hyper-V VMbus IRQ %d, Err %d",
|
||||
vmbus_irq, ret);
|
||||
free_percpu(vmbus_evt);
|
||||
goto err_setup;
|
||||
}
|
||||
}
|
||||
int hyperv_cpuhp_online;
|
||||
|
||||
ret = hv_synic_alloc();
|
||||
if (ret)
|
||||
if (ret < 0)
|
||||
goto err_alloc;
|
||||
|
||||
works = alloc_percpu(struct work_struct);
|
||||
|
|
@ -1424,6 +1419,72 @@ static int vmbus_bus_init(void)
|
|||
hyperv_cpuhp_online = ret;
|
||||
|
||||
ret = vmbus_connect();
|
||||
if (ret)
|
||||
goto err_connect;
|
||||
return 0;
|
||||
|
||||
err_connect:
|
||||
cpuhp_remove_state(hyperv_cpuhp_online);
|
||||
return -ENODEV;
|
||||
err_alloc:
|
||||
hv_synic_free();
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/*
|
||||
* vmbus_bus_init -Main vmbus driver initialization routine.
|
||||
*
|
||||
* Here, we
|
||||
* - initialize the vmbus driver context
|
||||
* - invoke the vmbus hv main init routine
|
||||
* - retrieve the channel offers
|
||||
*/
|
||||
static int vmbus_bus_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = hv_init();
|
||||
if (ret != 0) {
|
||||
pr_err("Unable to initialize the hypervisor - 0x%x\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = bus_register(&hv_bus);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* VMbus interrupts are best modeled as per-cpu interrupts. If
|
||||
* on an architecture with support for per-cpu IRQs (e.g. ARM64),
|
||||
* allocate a per-cpu IRQ using standard Linux kernel functionality.
|
||||
* If not on such an architecture (e.g., x86/x64), then rely on
|
||||
* code in the arch-specific portion of the code tree to connect
|
||||
* the VMbus interrupt handler.
|
||||
*/
|
||||
|
||||
if (vmbus_irq == -1) {
|
||||
hv_setup_vmbus_handler(vmbus_isr);
|
||||
} else {
|
||||
vmbus_evt = alloc_percpu(long);
|
||||
ret = request_percpu_irq(vmbus_irq, vmbus_percpu_isr,
|
||||
"Hyper-V VMbus", vmbus_evt);
|
||||
if (ret) {
|
||||
pr_err("Can't request Hyper-V VMbus IRQ %d, Err %d",
|
||||
vmbus_irq, ret);
|
||||
free_percpu(vmbus_evt);
|
||||
goto err_setup;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Cache the value as getting it involves a VM exit on x86(_64), and
|
||||
* doing that on each VP while initializing SynIC's wastes time.
|
||||
*/
|
||||
is_confidential = ms_hyperv.confidential_vmbus_available;
|
||||
if (is_confidential)
|
||||
pr_info("Establishing connection to the confidential VMBus\n");
|
||||
hv_para_set_sint_proxy(!is_confidential);
|
||||
ret = vmbus_alloc_synic_and_connect();
|
||||
if (ret)
|
||||
goto err_connect;
|
||||
|
||||
|
|
@ -1439,9 +1500,6 @@ static int vmbus_bus_init(void)
|
|||
return 0;
|
||||
|
||||
err_connect:
|
||||
cpuhp_remove_state(hyperv_cpuhp_online);
|
||||
err_alloc:
|
||||
hv_synic_free();
|
||||
if (vmbus_irq == -1) {
|
||||
hv_remove_vmbus_handler();
|
||||
} else {
|
||||
|
|
@ -2798,7 +2856,7 @@ static void hv_crash_handler(struct pt_regs *regs)
|
|||
*/
|
||||
cpu = smp_processor_id();
|
||||
hv_stimer_cleanup(cpu);
|
||||
hv_synic_disable_regs(cpu);
|
||||
hv_hyp_synic_disable_regs(cpu);
|
||||
};
|
||||
|
||||
static int hv_synic_suspend(void *data)
|
||||
|
|
@ -2823,14 +2881,14 @@ static int hv_synic_suspend(void *data)
|
|||
* interrupts-disabled context.
|
||||
*/
|
||||
|
||||
hv_synic_disable_regs(0);
|
||||
hv_hyp_synic_disable_regs(0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void hv_synic_resume(void *data)
|
||||
{
|
||||
hv_synic_enable_regs(0);
|
||||
hv_hyp_synic_enable_regs(0);
|
||||
|
||||
/*
|
||||
* Note: we don't need to call hv_stimer_init(0), because the timer
|
||||
|
@ -62,6 +62,8 @@ struct ms_hyperv_info {
|
|||
};
|
||||
};
|
||||
u64 shared_gpa_boundary;
|
||||
bool msi_ext_dest_id;
|
||||
bool confidential_vmbus_available;
|
||||
};
|
||||
extern struct ms_hyperv_info ms_hyperv;
|
||||
extern bool hv_nested;
|
||||
|
|
@ -124,10 +126,12 @@ static inline unsigned int hv_repcomp(u64 status)
|
|||
|
||||
/*
|
||||
* Rep hypercalls. Callers of this functions are supposed to ensure that
|
||||
* rep_count and varhead_size comply with Hyper-V hypercall definition.
|
||||
* rep_count, varhead_size, and rep_start comply with Hyper-V hypercall
|
||||
* definition.
|
||||
*/
|
||||
static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
|
||||
void *input, void *output)
|
||||
static inline u64 hv_do_rep_hypercall_ex(u16 code, u16 rep_count,
|
||||
u16 varhead_size, u16 rep_start,
|
||||
void *input, void *output)
|
||||
{
|
||||
u64 control = code;
|
||||
u64 status;
|
||||
|
|
@ -135,6 +139,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
|
|||
|
||||
control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET;
|
||||
control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET;
|
||||
control |= (u64)rep_start << HV_HYPERCALL_REP_START_OFFSET;
|
||||
|
||||
do {
|
||||
status = hv_do_hypercall(control, input, output);
|
||||
|
|
@ -152,6 +157,14 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
|
|||
return status;
|
||||
}
|
||||
|
||||
/* For the typical case where rep_start is 0 */
|
||||
static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
|
||||
void *input, void *output)
|
||||
{
|
||||
return hv_do_rep_hypercall_ex(code, rep_count, varhead_size, 0,
|
||||
input, output);
|
||||
}
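The _ex variant exists so a caller can start, or resume, at an arbitrary repetition index. As a minimal illustration of what the new rep_start argument means (not part of this patch; the retry loop inside the helper normally handles resumption by itself):

/* Illustrative only: re-issue a rep hypercall at the first incomplete rep. */
static inline u64 example_resume_rep_hypercall(u16 code, u16 rep_count,
					       u16 reps_done,
					       void *input, void *output)
{
	return hv_do_rep_hypercall_ex(code, rep_count, 0, reps_done,
				      input, output);
}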
|
||||
|
||||
/* Generate the guest OS identifier as described in the Hyper-V TLFS */
|
||||
static inline u64 hv_generate_guest_id(u64 kernel_version)
|
||||
{
|
||||
|
|
@ -163,46 +176,6 @@ static inline u64 hv_generate_guest_id(u64 kernel_version)
|
|||
return guest_id;
|
||||
}
|
||||
|
||||
#if IS_ENABLED(CONFIG_HYPERV_VMBUS)
|
||||
/* Free the message slot and signal end-of-message if required */
|
||||
static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
|
||||
{
|
||||
/*
|
||||
* On crash we're reading some other CPU's message page and we need
|
||||
* to be careful: this other CPU may already had cleared the header
|
||||
* and the host may already had delivered some other message there.
|
||||
* In case we blindly write msg->header.message_type we're going
|
||||
* to lose it. We can still lose a message of the same type but
|
||||
* we count on the fact that there can only be one
|
||||
* CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages
|
||||
* on crash.
|
||||
*/
|
||||
if (cmpxchg(&msg->header.message_type, old_msg_type,
|
||||
HVMSG_NONE) != old_msg_type)
|
||||
return;
|
||||
|
||||
/*
|
||||
* The cmxchg() above does an implicit memory barrier to
|
||||
* ensure the write to MessageType (ie set to
|
||||
* HVMSG_NONE) happens before we read the
|
||||
* MessagePending and EOMing. Otherwise, the EOMing
|
||||
* will not deliver any more messages since there is
|
||||
* no empty slot
|
||||
*/
|
||||
if (msg->header.message_flags.msg_pending) {
|
||||
/*
|
||||
* This will cause message queue rescan to
|
||||
* possibly deliver another msg from the
|
||||
* hypervisor
|
||||
*/
|
||||
hv_set_msr(HV_MSR_EOM, 0);
|
||||
}
|
||||
}
|
||||
|
||||
extern int vmbus_interrupt;
|
||||
extern int vmbus_irq;
|
||||
#endif /* CONFIG_HYPERV_VMBUS */
|
||||
|
||||
int hv_get_hypervisor_version(union hv_hypervisor_version_info *info);
|
||||
|
||||
void hv_setup_vmbus_handler(void (*handler)(void));
|
||||
|
|
@ -336,6 +309,10 @@ bool hv_is_isolation_supported(void);
|
|||
bool hv_isolation_type_snp(void);
|
||||
u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size);
|
||||
u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2);
|
||||
void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set);
|
||||
void hv_para_set_sint_proxy(bool enable);
|
||||
u64 hv_para_get_synic_register(unsigned int reg);
|
||||
void hv_para_set_synic_register(unsigned int reg, u64 val);
|
||||
void hyperv_cleanup(void);
|
||||
bool hv_query_ext_cap(u64 cap_query);
|
||||
void hv_setup_dma_ops(struct device *dev, bool coherent);
|
||||
|
@ -260,6 +260,7 @@ union hv_hypervisor_version_info {
|
|||
#define HYPERV_CPUID_VIRT_STACK_PROPERTIES 0x40000082
|
||||
/* Support for the extended IOAPIC RTE format */
|
||||
#define HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE BIT(2)
|
||||
#define HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE BIT(3)
|
||||
|
||||
#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
|
||||
#define HYPERV_CPUID_MIN 0x40000005
|
||||
|
|
@ -464,18 +465,21 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */
|
|||
#define HVCALL_RESET_DEBUG_SESSION 0x006b
|
||||
#define HVCALL_MAP_STATS_PAGE 0x006c
|
||||
#define HVCALL_UNMAP_STATS_PAGE 0x006d
|
||||
#define HVCALL_SET_SYSTEM_PROPERTY 0x006f
|
||||
#define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076
|
||||
#define HVCALL_GET_SYSTEM_PROPERTY 0x007b
|
||||
#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c
|
||||
#define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d
|
||||
#define HVCALL_RETARGET_INTERRUPT 0x007e
|
||||
#define HVCALL_NOTIFY_PARTITION_EVENT 0x0087
|
||||
#define HVCALL_ENTER_SLEEP_STATE 0x0084
|
||||
#define HVCALL_NOTIFY_PORT_RING_EMPTY 0x008b
|
||||
#define HVCALL_REGISTER_INTERCEPT_RESULT 0x0091
|
||||
#define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094
|
||||
#define HVCALL_CREATE_PORT 0x0095
|
||||
#define HVCALL_CONNECT_PORT 0x0096
|
||||
#define HVCALL_START_VP 0x0099
|
||||
#define HVCALL_GET_VP_INDEX_FROM_APIC_ID 0x009a
|
||||
|
||||
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
|
||||
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
|
||||
#define HVCALL_SIGNAL_EVENT_DIRECT 0x00c0
|
||||
|
|
@ -490,8 +494,11 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */
|
|||
#define HVCALL_GET_VP_STATE 0x00e3
|
||||
#define HVCALL_SET_VP_STATE 0x00e4
|
||||
#define HVCALL_GET_VP_CPUID_VALUES 0x00f4
|
||||
#define HVCALL_GET_PARTITION_PROPERTY_EX 0x0101
|
||||
#define HVCALL_MMIO_READ 0x0106
|
||||
#define HVCALL_MMIO_WRITE 0x0107
|
||||
#define HVCALL_DISABLE_HYP_EX 0x010f
|
||||
#define HVCALL_MAP_STATS_PAGE2 0x0131
|
||||
|
||||
/* HV_HYPERCALL_INPUT */
|
||||
#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0)
|
||||
|
|
@ -880,6 +887,48 @@ struct hv_get_vp_from_apic_id_in {
|
|||
u32 apic_ids[];
|
||||
} __packed;
|
||||
|
||||
union hv_register_vsm_partition_config {
|
||||
u64 as_uint64;
|
||||
struct {
|
||||
u64 enable_vtl_protection : 1;
|
||||
u64 default_vtl_protection_mask : 4;
|
||||
u64 zero_memory_on_reset : 1;
|
||||
u64 deny_lower_vtl_startup : 1;
|
||||
u64 intercept_acceptance : 1;
|
||||
u64 intercept_enable_vtl_protection : 1;
|
||||
u64 intercept_vp_startup : 1;
|
||||
u64 intercept_cpuid_unimplemented : 1;
|
||||
u64 intercept_unrecoverable_exception : 1;
|
||||
u64 intercept_page : 1;
|
||||
u64 mbz : 51;
|
||||
} __packed;
|
||||
};
|
||||
|
||||
union hv_register_vsm_capabilities {
|
||||
u64 as_uint64;
|
||||
struct {
|
||||
u64 dr6_shared: 1;
|
||||
u64 mbec_vtl_mask: 16;
|
||||
u64 deny_lower_vtl_startup: 1;
|
||||
u64 supervisor_shadow_stack: 1;
|
||||
u64 hardware_hvpt_available: 1;
|
||||
u64 software_hvpt_available: 1;
|
||||
u64 hardware_hvpt_range_bits: 6;
|
||||
u64 intercept_page_available: 1;
|
||||
u64 return_action_available: 1;
|
||||
u64 reserved: 35;
|
||||
} __packed;
|
||||
};
|
||||
|
||||
union hv_register_vsm_page_offsets {
|
||||
struct {
|
||||
u64 vtl_call_offset : 12;
|
||||
u64 vtl_return_offset : 12;
|
||||
u64 reserved_mbz : 40;
|
||||
} __packed;
|
||||
u64 as_uint64;
|
||||
};
|
||||
|
||||
struct hv_nested_enlightenments_control {
|
||||
struct {
|
||||
u32 directhypercall : 1;
|
||||
|
|
@ -1002,6 +1051,70 @@ enum hv_register_name {
|
|||
|
||||
/* VSM */
|
||||
HV_REGISTER_VSM_VP_STATUS = 0x000D0003,
|
||||
|
||||
/* Synthetic VSM registers */
|
||||
HV_REGISTER_VSM_CODE_PAGE_OFFSETS = 0x000D0002,
|
||||
HV_REGISTER_VSM_CAPABILITIES = 0x000D0006,
|
||||
HV_REGISTER_VSM_PARTITION_CONFIG = 0x000D0007,
|
||||
|
||||
#if defined(CONFIG_X86)
|
||||
/* X64 Debug Registers */
|
||||
HV_X64_REGISTER_DR0 = 0x00050000,
|
||||
HV_X64_REGISTER_DR1 = 0x00050001,
|
||||
HV_X64_REGISTER_DR2 = 0x00050002,
|
||||
HV_X64_REGISTER_DR3 = 0x00050003,
|
||||
HV_X64_REGISTER_DR6 = 0x00050004,
|
||||
HV_X64_REGISTER_DR7 = 0x00050005,
|
||||
|
||||
/* X64 Cache control MSRs */
|
||||
HV_X64_REGISTER_MSR_MTRR_CAP = 0x0008000D,
|
||||
HV_X64_REGISTER_MSR_MTRR_DEF_TYPE = 0x0008000E,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0 = 0x00080010,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1 = 0x00080011,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2 = 0x00080012,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3 = 0x00080013,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4 = 0x00080014,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5 = 0x00080015,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6 = 0x00080016,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7 = 0x00080017,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8 = 0x00080018,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9 = 0x00080019,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA = 0x0008001A,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB = 0x0008001B,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC = 0x0008001C,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASED = 0x0008001D,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE = 0x0008001E,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF = 0x0008001F,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0 = 0x00080040,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1 = 0x00080041,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2 = 0x00080042,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3 = 0x00080043,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4 = 0x00080044,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5 = 0x00080045,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6 = 0x00080046,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7 = 0x00080047,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8 = 0x00080048,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9 = 0x00080049,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA = 0x0008004A,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB = 0x0008004B,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC = 0x0008004C,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD = 0x0008004D,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE = 0x0008004E,
|
||||
HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF = 0x0008004F,
|
||||
HV_X64_REGISTER_MSR_MTRR_FIX64K00000 = 0x00080070,
|
||||
HV_X64_REGISTER_MSR_MTRR_FIX16K80000 = 0x00080071,
|
||||
HV_X64_REGISTER_MSR_MTRR_FIX16KA0000 = 0x00080072,
|
||||
HV_X64_REGISTER_MSR_MTRR_FIX4KC0000 = 0x00080073,
|
||||
HV_X64_REGISTER_MSR_MTRR_FIX4KC8000 = 0x00080074,
|
||||
HV_X64_REGISTER_MSR_MTRR_FIX4KD0000 = 0x00080075,
|
||||
HV_X64_REGISTER_MSR_MTRR_FIX4KD8000 = 0x00080076,
|
||||
HV_X64_REGISTER_MSR_MTRR_FIX4KE0000 = 0x00080077,
|
||||
HV_X64_REGISTER_MSR_MTRR_FIX4KE8000 = 0x00080078,
|
||||
HV_X64_REGISTER_MSR_MTRR_FIX4KF0000 = 0x00080079,
|
||||
HV_X64_REGISTER_MSR_MTRR_FIX4KF8000 = 0x0008007A,
|
||||
|
||||
HV_X64_REGISTER_REG_PAGE = 0x0009001C,
|
||||
#endif
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -376,6 +376,46 @@ struct hv_input_set_partition_property {
|
|||
u64 property_value;
|
||||
} __packed;
|
||||
|
||||
union hv_partition_property_arg {
|
||||
u64 as_uint64;
|
||||
struct {
|
||||
union {
|
||||
u32 arg;
|
||||
u32 vp_index;
|
||||
};
|
||||
u16 reserved0;
|
||||
u8 reserved1;
|
||||
u8 object_type;
|
||||
} __packed;
|
||||
};
|
||||
|
||||
struct hv_input_get_partition_property_ex {
|
||||
u64 partition_id;
|
||||
u32 property_code; /* enum hv_partition_property_code */
|
||||
u32 padding;
|
||||
union {
|
||||
union hv_partition_property_arg arg_data;
|
||||
u64 arg;
|
||||
};
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* NOTE: Should use hv_input_set_partition_property_ex_header to compute this
|
||||
* size, but hv_input_get_partition_property_ex is identical so it suffices
|
||||
*/
|
||||
#define HV_PARTITION_PROPERTY_EX_MAX_VAR_SIZE \
|
||||
(HV_HYP_PAGE_SIZE - sizeof(struct hv_input_get_partition_property_ex))
|
||||
|
||||
union hv_partition_property_ex {
|
||||
u8 buffer[HV_PARTITION_PROPERTY_EX_MAX_VAR_SIZE];
|
||||
struct hv_partition_property_vmm_capabilities vmm_capabilities;
|
||||
/* More fields to be filled in when needed */
|
||||
};
|
||||
|
||||
struct hv_output_get_partition_property_ex {
|
||||
union hv_partition_property_ex property_value;
|
||||
} __packed;
|
||||
|
||||
enum hv_vp_state_page_type {
|
||||
HV_VP_STATE_PAGE_REGISTERS = 0,
|
||||
HV_VP_STATE_PAGE_INTERCEPT_MESSAGE = 1,
|
||||
|
|
@ -539,9 +579,15 @@ union hv_interrupt_control {
|
|||
u64 as_uint64;
|
||||
struct {
|
||||
u32 interrupt_type; /* enum hv_interrupt_type */
|
||||
#if IS_ENABLED(CONFIG_X86)
|
||||
u32 level_triggered : 1;
|
||||
u32 logical_dest_mode : 1;
|
||||
u32 rsvd : 30;
|
||||
#elif IS_ENABLED(CONFIG_ARM64)
|
||||
u32 rsvd1 : 2;
|
||||
u32 asserted : 1;
|
||||
u32 rsvd2 : 29;
|
||||
#endif
|
||||
} __packed;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -96,8 +96,34 @@ enum hv_partition_property_code {
|
|||
HV_PARTITION_PROPERTY_XSAVE_STATES = 0x00060007,
|
||||
HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE = 0x00060008,
|
||||
HV_PARTITION_PROPERTY_PROCESSOR_CLOCK_FREQUENCY = 0x00060009,
|
||||
|
||||
/* Extended properties with larger property values */
|
||||
HV_PARTITION_PROPERTY_VMM_CAPABILITIES = 0x00090007,
|
||||
};
|
||||
|
||||
#define HV_PARTITION_VMM_CAPABILITIES_BANK_COUNT 1
|
||||
#define HV_PARTITION_VMM_CAPABILITIES_RESERVED_BITFIELD_COUNT 59
|
||||
|
||||
struct hv_partition_property_vmm_capabilities {
|
||||
u16 bank_count;
|
||||
u16 reserved[3];
|
||||
union {
|
||||
u64 as_uint64[HV_PARTITION_VMM_CAPABILITIES_BANK_COUNT];
|
||||
struct {
|
||||
u64 map_gpa_preserve_adjustable: 1;
|
||||
u64 vmm_can_provide_overlay_gpfn: 1;
|
||||
u64 vp_affinity_property: 1;
|
||||
#if IS_ENABLED(CONFIG_ARM64)
|
||||
u64 vmm_can_provide_gic_overlay_locations: 1;
|
||||
#else
|
||||
u64 reservedbit3: 1;
|
||||
#endif
|
||||
u64 assignable_synthetic_proc_features: 1;
|
||||
u64 reserved0: HV_PARTITION_VMM_CAPABILITIES_RESERVED_BITFIELD_COUNT;
|
||||
} __packed;
|
||||
};
|
||||
} __packed;
|
||||
|
||||
enum hv_snp_status {
|
||||
HV_SNP_STATUS_NONE = 0,
|
||||
HV_SNP_STATUS_AVAILABLE = 1,
|
||||
|
|
@ -114,8 +140,33 @@ enum hv_snp_status {
|
|||
|
||||
enum hv_system_property {
|
||||
/* Add more values when needed */
|
||||
HV_SYSTEM_PROPERTY_SLEEP_STATE = 3,
|
||||
HV_SYSTEM_PROPERTY_SCHEDULER_TYPE = 15,
|
||||
HV_DYNAMIC_PROCESSOR_FEATURE_PROPERTY = 21,
|
||||
HV_SYSTEM_PROPERTY_CRASHDUMPAREA = 47,
|
||||
};
|
||||
|
||||
#define HV_PFN_RANGE_PGBITS 24 /* HV_SPA_PAGE_RANGE_ADDITIONAL_PAGES_BITS */
|
||||
union hv_pfn_range { /* HV_SPA_PAGE_RANGE */
|
||||
u64 as_uint64;
|
||||
struct {
|
||||
/* 39:0: base pfn. 63:40: additional pages */
|
||||
u64 base_pfn : 64 - HV_PFN_RANGE_PGBITS;
|
||||
u64 add_pfns : HV_PFN_RANGE_PGBITS;
|
||||
} __packed;
|
||||
};
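To make the packed layout concrete, a small illustrative decoder follows (not part of the patch; it assumes 'add_pfns' counts pages beyond the base page, as the "additional pages" comment suggests):

/* Illustrative decode of a union hv_pfn_range value. */
static inline u64 example_pfn_range_last_pfn(union hv_pfn_range range)
{
	/* The range spans add_pfns + 1 pages starting at base_pfn. */
	return range.base_pfn + range.add_pfns;
}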
|
||||
|
||||
enum hv_sleep_state {
|
||||
HV_SLEEP_STATE_S1 = 1,
|
||||
HV_SLEEP_STATE_S2 = 2,
|
||||
HV_SLEEP_STATE_S3 = 3,
|
||||
HV_SLEEP_STATE_S4 = 4,
|
||||
HV_SLEEP_STATE_S5 = 5,
|
||||
/*
|
||||
* After hypervisor has received this, any follow up sleep
|
||||
* state registration requests will be rejected.
|
||||
*/
|
||||
HV_SLEEP_STATE_LOCK = 6
|
||||
};
|
||||
|
||||
enum hv_dynamic_processor_feature_property {
|
||||
|
|
@ -142,15 +193,50 @@ struct hv_output_get_system_property {
|
|||
#if IS_ENABLED(CONFIG_X86)
|
||||
u64 hv_processor_feature_value;
|
||||
#endif
|
||||
union hv_pfn_range hv_cda_info; /* CrashdumpAreaAddress */
|
||||
u64 hv_tramp_pa; /* CrashdumpTrampolineAddress */
|
||||
};
|
||||
} __packed;
|
||||
|
||||
struct hv_sleep_state_info {
|
||||
u32 sleep_state; /* enum hv_sleep_state */
|
||||
u8 pm1a_slp_typ;
|
||||
u8 pm1b_slp_typ;
|
||||
} __packed;
|
||||
|
||||
struct hv_input_set_system_property {
|
||||
u32 property_id; /* enum hv_system_property */
|
||||
u32 reserved;
|
||||
union {
|
||||
/* More fields to be filled in when needed */
|
||||
struct hv_sleep_state_info set_sleep_state_info;
|
||||
|
||||
/*
|
||||
* Add a reserved field to ensure the union is 8-byte aligned as
|
||||
* existing members may not be. This is a temporary measure
|
||||
* until all remaining members are added.
|
||||
*/
|
||||
u64 reserved0[8];
|
||||
};
|
||||
} __packed;
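As a hedged illustration of how these structures feed the HVCALL_SET_SYSTEM_PROPERTY hypercall defined earlier, a sleep-state registration could look roughly like the sketch below (not from the patch; the SLP_TYP values would normally come from ACPI):

/* Hypothetical sketch only; not part of the patch. */
static int example_register_s5_sleep_state(u8 pm1a_slp_typ, u8 pm1b_slp_typ)
{
	struct hv_input_set_system_property *input;
	unsigned long flags;
	u64 status;

	local_irq_save(flags);
	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	memset(input, 0, sizeof(*input));
	input->property_id = HV_SYSTEM_PROPERTY_SLEEP_STATE;
	input->set_sleep_state_info.sleep_state = HV_SLEEP_STATE_S5;
	input->set_sleep_state_info.pm1a_slp_typ = pm1a_slp_typ;
	input->set_sleep_state_info.pm1b_slp_typ = pm1b_slp_typ;
	status = hv_do_hypercall(HVCALL_SET_SYSTEM_PROPERTY, input, NULL);
	local_irq_restore(flags);

	return hv_result_to_errno(status);
}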
|
||||
|
||||
struct hv_input_enter_sleep_state { /* HV_INPUT_ENTER_SLEEP_STATE */
|
||||
u32 sleep_state; /* enum hv_sleep_state */
|
||||
} __packed;
|
||||
|
||||
struct hv_input_map_stats_page {
|
||||
u32 type; /* enum hv_stats_object_type */
|
||||
u32 padding;
|
||||
union hv_stats_object_identity identity;
|
||||
} __packed;
|
||||
|
||||
struct hv_input_map_stats_page2 {
|
||||
u32 type; /* enum hv_stats_object_type */
|
||||
u32 padding;
|
||||
union hv_stats_object_identity identity;
|
||||
u64 map_location;
|
||||
} __packed;
|
||||
|
||||
struct hv_output_map_stats_page {
|
||||
u64 map_location;
|
||||
} __packed;
|
||||
|
|
@ -234,6 +320,48 @@ union hv_gpa_page_access_state {
|
|||
u8 as_uint8;
|
||||
} __packed;
|
||||
|
||||
enum hv_crashdump_action {
|
||||
HV_CRASHDUMP_NONE = 0,
|
||||
HV_CRASHDUMP_SUSPEND_ALL_VPS,
|
||||
HV_CRASHDUMP_PREPARE_FOR_STATE_SAVE,
|
||||
HV_CRASHDUMP_STATE_SAVED,
|
||||
HV_CRASHDUMP_ENTRY,
|
||||
};
|
||||
|
||||
struct hv_partition_event_root_crashdump_input {
|
||||
u32 crashdump_action; /* enum hv_crashdump_action */
|
||||
} __packed;
|
||||
|
||||
struct hv_input_disable_hyp_ex { /* HV_X64_INPUT_DISABLE_HYPERVISOR_EX */
|
||||
u64 rip;
|
||||
u64 arg;
|
||||
} __packed;
|
||||
|
||||
struct hv_crashdump_area { /* HV_CRASHDUMP_AREA */
|
||||
u32 version;
|
||||
union {
|
||||
u32 flags_as_uint32;
|
||||
struct {
|
||||
u32 cda_valid : 1;
|
||||
u32 cda_unused : 31;
|
||||
} __packed;
|
||||
};
|
||||
/* more unused fields */
|
||||
} __packed;
|
||||
|
||||
union hv_partition_event_input {
|
||||
struct hv_partition_event_root_crashdump_input crashdump_input;
|
||||
};
|
||||
|
||||
enum hv_partition_event {
|
||||
HV_PARTITION_EVENT_ROOT_CRASHDUMP = 2,
|
||||
};
|
||||
|
||||
struct hv_input_notify_partition_event {
|
||||
u32 event; /* enum hv_partition_event */
|
||||
union hv_partition_event_input input;
|
||||
} __packed;
|
||||
|
||||
struct hv_lp_startup_status {
|
||||
u64 hv_status;
|
||||
u64 substatus1;
|
||||
|
@ -11,6 +11,10 @@
#define __has_builtin(x) (0)
#endif

/* Indirect macros required for expanded argument pasting, eg. __LINE__. */
#define ___PASTE(a, b) a##b
#define __PASTE(a, b) ___PASTE(a, b)

#ifndef __ASSEMBLY__

/*

@ -79,10 +83,6 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { }
# define __builtin_warning(x, y...) (1)
#endif /* __CHECKER__ */

/* Indirect macros required for expanded argument pasting, eg. __LINE__. */
#define ___PASTE(a,b) a##b
#define __PASTE(a,b) ___PASTE(a,b)

#ifdef __KERNEL__

/* Attributes */
@ -265,16 +265,18 @@ static inline u32 hv_get_avail_to_write_percent(
|
|||
* Linux kernel.
|
||||
*/
|
||||
|
||||
#define VERSION_WS2008 ((0 << 16) | (13))
|
||||
#define VERSION_WIN7 ((1 << 16) | (1))
|
||||
#define VERSION_WIN8 ((2 << 16) | (4))
|
||||
#define VERSION_WIN8_1 ((3 << 16) | (0))
|
||||
#define VERSION_WIN10 ((4 << 16) | (0))
|
||||
#define VERSION_WIN10_V4_1 ((4 << 16) | (1))
|
||||
#define VERSION_WIN10_V5 ((5 << 16) | (0))
|
||||
#define VERSION_WIN10_V5_1 ((5 << 16) | (1))
|
||||
#define VERSION_WIN10_V5_2 ((5 << 16) | (2))
|
||||
#define VERSION_WIN10_V5_3 ((5 << 16) | (3))
|
||||
#define VMBUS_MAKE_VERSION(MAJ, MIN) ((((u32)MAJ) << 16) | (MIN))
|
||||
#define VERSION_WS2008 VMBUS_MAKE_VERSION(0, 13)
|
||||
#define VERSION_WIN7 VMBUS_MAKE_VERSION(1, 1)
|
||||
#define VERSION_WIN8 VMBUS_MAKE_VERSION(2, 4)
|
||||
#define VERSION_WIN8_1 VMBUS_MAKE_VERSION(3, 0)
|
||||
#define VERSION_WIN10 VMBUS_MAKE_VERSION(4, 0)
|
||||
#define VERSION_WIN10_V4_1 VMBUS_MAKE_VERSION(4, 1)
|
||||
#define VERSION_WIN10_V5 VMBUS_MAKE_VERSION(5, 0)
|
||||
#define VERSION_WIN10_V5_1 VMBUS_MAKE_VERSION(5, 1)
|
||||
#define VERSION_WIN10_V5_2 VMBUS_MAKE_VERSION(5, 2)
|
||||
#define VERSION_WIN10_V5_3 VMBUS_MAKE_VERSION(5, 3)
|
||||
#define VERSION_WIN10_V6_0 VMBUS_MAKE_VERSION(6, 0)
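VMBUS_MAKE_VERSION() packs the major number into the upper 16 bits and the minor into the lower 16. Two compile-time sanity checks make the encoding explicit (illustrative, not part of the patch; assumes static_assert from <linux/build_bug.h>):

static_assert(VMBUS_MAKE_VERSION(6, 0) == ((6u << 16) | 0));
static_assert(VERSION_WIN10_V5_2 == ((5u << 16) | 2));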
|
||||
|
||||
/* Make maximum size of pipe payload of 16K */
|
||||
#define MAX_PIPE_DATA_PAYLOAD (sizeof(u8) * 16384)
|
||||
|
|
@ -335,14 +337,22 @@ struct vmbus_channel_offer {
|
|||
} __packed;
|
||||
|
||||
/* Server Flags */
|
||||
#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE 1
|
||||
#define VMBUS_CHANNEL_SERVER_SUPPORTS_TRANSFER_PAGES 2
|
||||
#define VMBUS_CHANNEL_SERVER_SUPPORTS_GPADLS 4
|
||||
#define VMBUS_CHANNEL_NAMED_PIPE_MODE 0x10
|
||||
#define VMBUS_CHANNEL_LOOPBACK_OFFER 0x100
|
||||
#define VMBUS_CHANNEL_PARENT_OFFER 0x200
|
||||
#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION 0x400
|
||||
#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER 0x2000
|
||||
#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE 0x0001
|
||||
/*
|
||||
* This flag indicates that the channel is offered by the paravisor, and must
|
||||
* use encrypted memory for the channel ring buffer.
|
||||
*/
|
||||
#define VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER 0x0002
|
||||
/*
|
||||
* This flag indicates that the channel is offered by the paravisor, and must
|
||||
* use encrypted memory for GPA direct packets and additional GPADLs.
|
||||
*/
|
||||
#define VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY 0x0004
|
||||
#define VMBUS_CHANNEL_NAMED_PIPE_MODE 0x0010
|
||||
#define VMBUS_CHANNEL_LOOPBACK_OFFER 0x0100
|
||||
#define VMBUS_CHANNEL_PARENT_OFFER 0x0200
|
||||
#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION 0x0400
|
||||
#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER 0x2000
|
||||
|
||||
struct vmpacket_descriptor {
|
||||
u16 type;
|
||||
|
|
@ -621,6 +631,12 @@ struct vmbus_channel_relid_released {
|
|||
u32 child_relid;
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* Used by the paravisor only, means that the encrypted ring buffers and
|
||||
* the encrypted external memory are supported
|
||||
*/
|
||||
#define VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS 0x10
|
||||
|
||||
struct vmbus_channel_initiate_contact {
|
||||
struct vmbus_channel_message_header header;
|
||||
u32 vmbus_version_requested;
|
||||
|
|
@@ -630,7 +646,8 @@ struct vmbus_channel_initiate_contact {
 		struct {
 			u8	msg_sint;
 			u8	msg_vtl;
-			u8	reserved[6];
+			u8	reserved[2];
+			u32	feature_flags; /* VMBus version 6.0 */
 		};
 	};
 	u64 monitor_page1;
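With protocol 6.0 the initiate-contact message carries a feature_flags word, so a guest that can keep its channel memory encrypted can advertise VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS when connecting. A sketch of how the message could be populated; fill_initiate_contact() is a hypothetical stand-in for the driver's existing connect path:

/* Sketch only: advertising confidential-channel support while
 * requesting VMBus protocol 6.0. Not an existing kernel function.
 */
#include <linux/hyperv.h>

static void fill_initiate_contact(struct vmbus_channel_initiate_contact *msg,
				  bool confidential_ok)
{
	msg->vmbus_version_requested = VERSION_WIN10_V6_0;
	msg->feature_flags = 0;
	if (confidential_ok)
		msg->feature_flags |= VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS;
	/* SINT/VTL fields and monitor pages are set up as before */
}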
@@ -1003,6 +1020,10 @@ struct vmbus_channel {
 
 	/* boolean to control visibility of sysfs for ring buffer */
 	bool ring_sysfs_visible;
+	/* The ring buffer is encrypted */
+	bool co_ring_buffer;
+	/* The external memory is encrypted */
+	bool co_external_memory;
 };
 
 #define lock_requestor(channel, flags) \
@@ -1027,6 +1048,16 @@ u64 vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id,
 			     u64 rqst_addr);
 u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id);
 
+static inline bool is_co_ring_buffer(const struct vmbus_channel_offer_channel *o)
+{
+	return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER);
+}
+
+static inline bool is_co_external_memory(const struct vmbus_channel_offer_channel *o)
+{
+	return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY);
+}
+
 static inline bool is_hvsock_offer(const struct vmbus_channel_offer_channel *o)
 {
 	return !!(o->offer.chn_flags & VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER);
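The two new helpers let the offer path record, per channel, whether the ring buffer and any externally shared memory must stay encrypted. A sketch of how an offer handler might consume them; vmbus_setup_channel() is hypothetical and only illustrates propagating the flags into struct vmbus_channel:

/* Sketch: copy the confidential-offer flags into the channel so later
 * ring-buffer and GPADL setup can choose encrypted or decrypted memory.
 * vmbus_setup_channel() is illustrative, not an existing kernel function.
 */
#include <linux/hyperv.h>

static void vmbus_setup_channel(struct vmbus_channel *channel,
				const struct vmbus_channel_offer_channel *offer)
{
	channel->co_ring_buffer = is_co_ring_buffer(offer);
	channel->co_external_memory = is_co_external_memory(offer);

	if (channel->co_ring_buffer)
		pr_debug("channel %u: ring buffer stays encrypted\n",
			 offer->child_relid);
}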
@@ -25,6 +25,8 @@
 #define STATIC_CALL_SITE_INIT		2UL	/* init section */
 #define STATIC_CALL_SITE_FLAGS		3UL
 
+#ifndef __ASSEMBLY__
+
 /*
  * The static call site table needs to be created by external tooling (objtool
  * or a compiler plugin).
@@ -100,4 +102,6 @@ struct static_call_key {
 
 #endif /* CONFIG_HAVE_STATIC_CALL */
 
+#endif /* __ASSEMBLY__ */
+
 #endif /* _STATIC_CALL_TYPES_H */
@@ -26,6 +26,7 @@ enum {
 	MSHV_PT_BIT_LAPIC,
 	MSHV_PT_BIT_X2APIC,
 	MSHV_PT_BIT_GPA_SUPER_PAGES,
+	MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES,
 	MSHV_PT_BIT_COUNT,
 };
 
@@ -41,6 +42,8 @@ enum {
  * @pt_flags: Bitmask of 1 << MSHV_PT_BIT_*
  * @pt_isolation: MSHV_PT_ISOLATION_*
  *
+ * This is the initial/v1 version for backward compatibility.
+ *
  * Returns a file descriptor to act as a handle to a guest partition.
  * At this point the partition is not yet initialized in the hypervisor.
  * Some operations must be done with the partition in this state, e.g. setting
@@ -52,6 +55,37 @@ struct mshv_create_partition {
 	__u64 pt_isolation;
 };
 
+#define MSHV_NUM_CPU_FEATURES_BANKS 2
+
+/**
+ * struct mshv_create_partition_v2
+ *
+ * This is extended version of the above initial MSHV_CREATE_PARTITION
+ * ioctl and allows for following additional parameters:
+ *
+ * @pt_num_cpu_fbanks: Must be set to MSHV_NUM_CPU_FEATURES_BANKS.
+ * @pt_cpu_fbanks: Disabled processor feature banks array.
+ * @pt_disabled_xsave: Disabled xsave feature bits.
+ *
+ * pt_cpu_fbanks and pt_disabled_xsave are passed through as-is to the create
+ * partition hypercall.
+ *
+ * Returns : same as above original mshv_create_partition
+ */
+struct mshv_create_partition_v2 {
+	__u64 pt_flags;
+	__u64 pt_isolation;
+	__u16 pt_num_cpu_fbanks;
+	__u8  pt_rsvd[6];	/* MBZ */
+	__u64 pt_cpu_fbanks[MSHV_NUM_CPU_FEATURES_BANKS];
+	__u64 pt_rsvd1[2];	/* MBZ */
+#if defined(__x86_64__)
+	__u64 pt_disabled_xsave;
+#else
+	__u64 pt_rsvd2;		/* MBZ */
+#endif
+} __packed;
+
 /* /dev/mshv */
 #define MSHV_CREATE_PARTITION	_IOW(MSHV_IOCTL, 0x00, struct mshv_create_partition)
 
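A user-space sketch of creating a partition with the extended argument. It assumes the extended structure is accepted through the same MSHV_CREATE_PARTITION ioctl on /dev/mshv (the driver's dispatch is outside this excerpt), and the flag bits chosen here are only an example:

/* Sketch: request a partition while enabling CPU/XSAVE feature control.
 * Assumes <linux/mshv.h> provides the definitions shown above and that
 * struct mshv_create_partition_v2 is passed through MSHV_CREATE_PARTITION;
 * error handling is minimal.
 */
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/mshv.h>

int create_partition_v2(void)
{
	struct mshv_create_partition_v2 args;
	int mshv_fd, pt_fd;

	mshv_fd = open("/dev/mshv", O_RDWR | O_CLOEXEC);
	if (mshv_fd < 0)
		return -1;

	memset(&args, 0, sizeof(args));	/* reserved fields must be zero */
	args.pt_flags = (1ULL << MSHV_PT_BIT_LAPIC) |
			(1ULL << MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES);
	args.pt_num_cpu_fbanks = MSHV_NUM_CPU_FEATURES_BANKS;
	/* pt_cpu_fbanks[] left zero: no processor feature is disabled */

	pt_fd = ioctl(mshv_fd, MSHV_CREATE_PARTITION, &args);
	close(mshv_fd);
	return pt_fd;	/* fd handle to the new partition, or -1 on error */
}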
@@ -89,7 +123,7 @@ enum {
  * @rsvd: MBZ
  *
  * Map or unmap a region of userspace memory to Guest Physical Addresses (GPA).
- * Mappings can't overlap in GPA space or userspace.
+ * Mappings can't overlap in GPA space.
  * To unmap, these fields must match an existing mapping.
  */
 struct mshv_user_mem_region {
@@ -288,4 +322,84 @@ struct mshv_get_set_vp_state {
  * #define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall)
  */
 
+/* Structure definitions, macros and IOCTLs for mshv_vtl */
+
+#define MSHV_CAP_CORE_API_STABLE	0x0
+#define MSHV_CAP_REGISTER_PAGE		0x1
+#define MSHV_CAP_VTL_RETURN_ACTION	0x2
+#define MSHV_CAP_DR6_SHARED		0x3
+#define MSHV_MAX_RUN_MSG_SIZE		256
+
+struct mshv_vp_registers {
+	__u32 count;		/* supports only 1 register at a time */
+	__u32 reserved;		/* Reserved for alignment or future use */
+	__u64 regs_ptr;		/* pointer to struct hv_register_assoc */
+};
+
+struct mshv_vtl_set_eventfd {
+	__s32 fd;
+	__u32 flag;
+};
+
+struct mshv_vtl_signal_event {
+	__u32 connection_id;
+	__u32 flag;
+};
+
+struct mshv_vtl_sint_post_msg {
+	__u64 message_type;
+	__u32 connection_id;
+	__u32 payload_size;	/* Must not exceed HV_MESSAGE_PAYLOAD_BYTE_COUNT */
+	__u64 payload_ptr;	/* pointer to message payload (bytes) */
+};
+
+struct mshv_vtl_ram_disposition {
+	__u64 start_pfn;
+	__u64 last_pfn;
+};
+
+struct mshv_vtl_set_poll_file {
+	__u32 cpu;
+	__u32 fd;
+};
+
+struct mshv_vtl_hvcall_setup {
+	__u64 bitmap_array_size;	/* stores number of bytes */
+	__u64 allow_bitmap_ptr;
+};
+
+struct mshv_vtl_hvcall {
+	__u64 control;		/* Hypercall control code */
+	__u64 input_size;	/* Size of the input data */
+	__u64 input_ptr;	/* Pointer to the input struct */
+	__u64 status;		/* Status of the hypercall (output) */
+	__u64 output_size;	/* Size of the output data */
+	__u64 output_ptr;	/* Pointer to the output struct */
+};
+
+struct mshv_sint_mask {
+	__u8 mask;
+	__u8 reserved[7];
+};
+
+/* /dev/mshv device IOCTL */
+#define MSHV_CHECK_EXTENSION	_IOW(MSHV_IOCTL, 0x00, __u32)
+
+/* vtl device */
+#define MSHV_CREATE_VTL			_IOR(MSHV_IOCTL, 0x1D, char)
+#define MSHV_ADD_VTL0_MEMORY		_IOW(MSHV_IOCTL, 0x21, struct mshv_vtl_ram_disposition)
+#define MSHV_SET_POLL_FILE		_IOW(MSHV_IOCTL, 0x25, struct mshv_vtl_set_poll_file)
+#define MSHV_RETURN_TO_LOWER_VTL	_IO(MSHV_IOCTL, 0x27)
+#define MSHV_GET_VP_REGISTERS		_IOWR(MSHV_IOCTL, 0x05, struct mshv_vp_registers)
+#define MSHV_SET_VP_REGISTERS		_IOW(MSHV_IOCTL, 0x06, struct mshv_vp_registers)
+
+/* VMBus device IOCTLs */
+#define MSHV_SINT_SIGNAL_EVENT		_IOW(MSHV_IOCTL, 0x22, struct mshv_vtl_signal_event)
+#define MSHV_SINT_POST_MESSAGE		_IOW(MSHV_IOCTL, 0x23, struct mshv_vtl_sint_post_msg)
+#define MSHV_SINT_SET_EVENTFD		_IOW(MSHV_IOCTL, 0x24, struct mshv_vtl_set_eventfd)
+#define MSHV_SINT_PAUSE_MESSAGE_STREAM	_IOW(MSHV_IOCTL, 0x25, struct mshv_sint_mask)
+
+/* hv_hvcall device */
+#define MSHV_HVCALL_SETUP	_IOW(MSHV_IOCTL, 0x1E, struct mshv_vtl_hvcall_setup)
+#define MSHV_HVCALL		_IOWR(MSHV_IOCTL, 0x1F, struct mshv_vtl_hvcall)
 #endif
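To see how the hv_hvcall pieces fit together: user space first registers which hypercall control codes it intends to issue (MSHV_HVCALL_SETUP with an allow bitmap), then submits individual calls with MSHV_HVCALL. A hedged sketch; the device node name and the one-bit-per-control-code bitmap layout are assumptions for illustration, while the ioctl numbers and structures match the header above:

/* Sketch: issue a self-targeted passthrough hypercall via the mshv_vtl
 * hv_hvcall device. "/dev/mshv_hvcall" and the bitmap layout are assumed;
 * error handling is minimal.
 */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/mshv.h>

static long do_hvcall(int fd, __u64 code, void *in, __u64 in_size,
		      void *out, __u64 out_size)
{
	unsigned char allow_bitmap[512] = { 0 };	/* assumed: one bit per code */
	struct mshv_vtl_hvcall_setup setup = {
		.bitmap_array_size = sizeof(allow_bitmap),
		.allow_bitmap_ptr = (__u64)(uintptr_t)allow_bitmap,
	};
	struct mshv_vtl_hvcall call = {
		.control = code,
		.input_size = in_size,
		.input_ptr = (__u64)(uintptr_t)in,
		.output_size = out_size,
		.output_ptr = (__u64)(uintptr_t)out,
	};

	allow_bitmap[code / 8] |= 1u << (code % 8);	/* permit this code */
	if (ioctl(fd, MSHV_HVCALL_SETUP, &setup) < 0)
		return -1;
	if (ioctl(fd, MSHV_HVCALL, &call) < 0)
		return -1;
	return (long)call.status;	/* hypervisor status for the call */
}

Usage would open the hv_hvcall device once (fd = open("/dev/mshv_hvcall", O_RDWR), path assumed) and reuse it for subsequent calls.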
@@ -25,6 +25,8 @@
 #define STATIC_CALL_SITE_INIT		2UL	/* init section */
 #define STATIC_CALL_SITE_FLAGS		3UL
 
+#ifndef __ASSEMBLY__
+
 /*
  * The static call site table needs to be created by external tooling (objtool
  * or a compiler plugin).
@@ -100,4 +102,6 @@ struct static_call_key {
 
 #endif /* CONFIG_HAVE_STATIC_CALL */
 
+#endif /* __ASSEMBLY__ */
+
 #endif /* _STATIC_CALL_TYPES_H */