From e96204e5e96ea3cacb5686e06ed29977c023254f Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 7 Feb 2025 11:03:21 -0800 Subject: [PATCH 01/24] hyperv: Move hv_current_partition_id to arch-generic code Move hv_current_partition_id and hv_get_partition_id() to hv_common.c, and call hv_get_partition_id() on arm64 in hyperv_init(). These aren't specific to x86_64 and will be needed by common code. Set hv_current_partition_id to HV_PARTITION_ID_SELF by default. Rename struct hv_get_partition_id to hv_output_get_partition_id, to make it distinct from the function hv_get_partition_id(), and match the original Hyper-V struct name. Remove the BUG()s. Failing to get the id need not crash the machine. Signed-off-by: Nuno Das Neves Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/1738955002-20821-2-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1738955002-20821-2-git-send-email-nunodasneves@linux.microsoft.com> --- arch/arm64/hyperv/mshyperv.c | 3 +++ arch/x86/hyperv/hv_init.c | 25 +------------------------ arch/x86/include/asm/mshyperv.h | 2 -- drivers/hv/hv_common.c | 22 ++++++++++++++++++++++ include/asm-generic/mshyperv.h | 2 ++ include/hyperv/hvgdk_mini.h | 2 +- 6 files changed, 29 insertions(+), 27 deletions(-) diff --git a/arch/arm64/hyperv/mshyperv.c b/arch/arm64/hyperv/mshyperv.c index fc49949b7df6..29fcfd595f48 100644 --- a/arch/arm64/hyperv/mshyperv.c +++ b/arch/arm64/hyperv/mshyperv.c @@ -72,6 +72,9 @@ static int __init hyperv_init(void) return ret; } + if (ms_hyperv.priv_high & HV_ACCESS_PARTITION_ID) + hv_get_partition_id(); + ms_hyperv_late_init(); hyperv_initialized = true; diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 173005e6a95d..9be1446f5bd3 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -34,9 +34,6 @@ #include #include -u64 hv_current_partition_id = ~0ull; -EXPORT_SYMBOL_GPL(hv_current_partition_id); - void *hv_hypercall_pg; EXPORT_SYMBOL_GPL(hv_hypercall_pg); @@ -393,24 +390,6 @@ static void __init hv_stimer_setup_percpu_clockev(void) old_setup_percpu_clockev(); } -static void __init hv_get_partition_id(void) -{ - struct hv_get_partition_id *output_page; - u64 status; - unsigned long flags; - - local_irq_save(flags); - output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); - status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page); - if (!hv_result_success(status)) { - /* No point in proceeding if this failed */ - pr_err("Failed to get partition ID: %lld\n", status); - BUG(); - } - hv_current_partition_id = output_page->partition_id; - local_irq_restore(flags); -} - #if IS_ENABLED(CONFIG_HYPERV_VTL_MODE) static u8 __init get_vtl(void) { @@ -605,11 +584,9 @@ skip_hypercall_pg_init: register_syscore_ops(&hv_syscore_ops); - if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID) + if (ms_hyperv.priv_high & HV_ACCESS_PARTITION_ID) hv_get_partition_id(); - BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull); - #ifdef CONFIG_PCI_MSI /* * If we're running as root, we want to create our own PCI MSI domain. 
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index f91ab1e75f9f..8d3ada3e8d0d 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -43,8 +43,6 @@ extern bool hyperv_paravisor_present; extern void *hv_hypercall_pg; -extern u64 hv_current_partition_id; - extern union hv_ghcb * __percpu *hv_ghcb_pg; bool hv_isolation_type_snp(void); diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index f2e6f55d6ca6..ee3083937b4f 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -31,6 +31,9 @@ #include #include +u64 hv_current_partition_id = HV_PARTITION_ID_SELF; +EXPORT_SYMBOL_GPL(hv_current_partition_id); + /* * hv_root_partition, ms_hyperv and hv_nested are defined here with other * Hyper-V specific globals so they are shared across all architectures and are @@ -283,6 +286,25 @@ static inline bool hv_output_page_exists(void) return hv_root_partition || IS_ENABLED(CONFIG_HYPERV_VTL_MODE); } +void __init hv_get_partition_id(void) +{ + struct hv_output_get_partition_id *output; + unsigned long flags; + u64 status, pt_id; + + local_irq_save(flags); + output = *this_cpu_ptr(hyperv_pcpu_input_arg); + status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, &output); + pt_id = output->partition_id; + local_irq_restore(flags); + + if (hv_result_success(status)) + hv_current_partition_id = pt_id; + else + pr_err("Hyper-V: failed to get partition ID: %#x\n", + hv_result(status)); +} + int __init hv_common_init(void) { int i; diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index a7bbe504e4f3..febeddf6cd8a 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -58,6 +58,7 @@ struct ms_hyperv_info { }; extern struct ms_hyperv_info ms_hyperv; extern bool hv_nested; +extern u64 hv_current_partition_id; extern void * __percpu *hyperv_pcpu_input_arg; extern void * __percpu *hyperv_pcpu_output_arg; @@ -207,6 +208,7 @@ extern u64 (*hv_read_reference_counter)(void); #define VP_INVAL U32_MAX int __init hv_common_init(void); +void __init hv_get_partition_id(void); void __init hv_common_free(void); void __init ms_hyperv_late_init(void); int hv_common_cpu_init(unsigned int cpu); diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 155615175965..58895883f636 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -182,7 +182,7 @@ struct hv_tsc_emulation_control { /* HV_TSC_INVARIANT_CONTROL */ #endif /* CONFIG_X86 */ -struct hv_get_partition_id { /* HV_OUTPUT_GET_PARTITION_ID */ +struct hv_output_get_partition_id { u64 partition_id; } __packed; From 0222eb30a3572cc9c4e2f0a3bb37f8f71089f2b6 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 7 Feb 2025 11:03:22 -0800 Subject: [PATCH 02/24] hyperv: Move arch/x86/hyperv/hv_proc.c to drivers/hv These helpers are not specific to x86_64 and will be needed by common code. Remove some unnecessary #includes. 
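Not part of the patch, only an illustration of why the move matters (node, lp_index, acpi_id, vp_index and flags are placeholder values): with the declarations in include/asm-generic/mshyperv.h, arch-neutral root-partition code can call the helpers directly on both x86_64 and arm64.

    int ret;

    ret = hv_call_add_logical_proc(node, lp_index, acpi_id);
    if (!ret)
            ret = hv_call_create_vp(node, hv_current_partition_id, vp_index, flags);
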
Reviewed-by: Michael Kelley Signed-off-by: Nuno Das Neves Link: https://lore.kernel.org/r/1738955002-20821-3-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1738955002-20821-3-git-send-email-nunodasneves@linux.microsoft.com> --- arch/x86/hyperv/Makefile | 2 +- arch/x86/include/asm/mshyperv.h | 4 ---- drivers/hv/Makefile | 2 +- {arch/x86/hyperv => drivers/hv}/hv_proc.c | 4 ---- include/asm-generic/mshyperv.h | 4 ++++ 5 files changed, 6 insertions(+), 10 deletions(-) rename {arch/x86/hyperv => drivers/hv}/hv_proc.c (98%) diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index 3a1548054b48..d55f494f471d 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o -obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o +obj-$(CONFIG_X86_64) += hv_apic.o obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 8d3ada3e8d0d..7dfca93ef048 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -56,10 +56,6 @@ u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); #define HV_AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL #define HV_AP_SEGMENT_LIMIT 0xffffffff -int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); -int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); -int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); - /* * If the hypercall involves no input or output parameters, the hypervisor * ignores the corresponding GPA pointer. diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index b992c0ed182b..9afcabb3fbd2 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -13,4 +13,4 @@ hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o # Code that must be built-in -obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o +obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o hv_proc.o diff --git a/arch/x86/hyperv/hv_proc.c b/drivers/hv/hv_proc.c similarity index 98% rename from arch/x86/hyperv/hv_proc.c rename to drivers/hv/hv_proc.c index ac4c834d4435..3e410489f480 100644 --- a/arch/x86/hyperv/hv_proc.c +++ b/drivers/hv/hv_proc.c @@ -6,11 +6,7 @@ #include #include #include -#include #include -#include - -#include /* * See struct hv_deposit_memory. The first u64 is partition ID, the rest diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index febeddf6cd8a..7adc10a4fa3e 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -218,6 +218,10 @@ void *hv_alloc_hyperv_page(void); void *hv_alloc_hyperv_zeroed_page(void); void hv_free_hyperv_page(void *addr); +int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); +int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); +int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); + /** * hv_cpu_number_to_vp_number() - Map CPU to VP. * @cpu_number: CPU number in Linux terms From 7c0db8a4f59d84554a9edc7409bec2ef59063439 Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Fri, 17 Jan 2025 15:33:06 -0500 Subject: [PATCH 03/24] cpu: export lockdep_assert_cpus_held() If CONFIG_HYPERV=m, lockdep_assert_cpus_held() is undefined for HyperV. So, export the function so that GPL drivers can use it more broadly. 
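Not part of the patch, just an illustrative sketch of what the export enables (example_set_channel_cpu() and its caller are hypothetical names): modular Hyper-V code can now assert its CPU-hotplug locking contract instead of silently assuming it.

    #include <linux/cpu.h>

    /* Callee: must only run while the caller holds the CPU hotplug read lock. */
    static int example_set_channel_cpu(unsigned int cpu)
    {
            lockdep_assert_cpus_held();     /* usable from GPL modules after this export */
            /* ... rebinding work that must not race with CPU hotplug ... */
            return 0;
    }

    static void example_caller(void)
    {
            cpus_read_lock();
            example_set_channel_cpu(2);
            cpus_read_unlock();
    }
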
Cc: Michael Kelley Signed-off-by: Hamza Mahfooz Reviewed-by: Michael Kelley Tested-by: Michael Kelley Acked-by: Thomas Gleixner Link: https://lore.kernel.org/r/20250117203309.192072-1-hamzamahfooz@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20250117203309.192072-1-hamzamahfooz@linux.microsoft.com> --- kernel/cpu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 07455d25329c..ca56397c3b1d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -526,6 +526,7 @@ void lockdep_assert_cpus_held(void) percpu_rwsem_assert_held(&cpu_hotplug_lock); } +EXPORT_SYMBOL_GPL(lockdep_assert_cpus_held); #ifdef CONFIG_LOCKDEP int lockdep_is_cpus_held(void) From 5e4304ff8cd9330690de73df7d047014dce191bd Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Fri, 17 Jan 2025 15:33:07 -0500 Subject: [PATCH 04/24] drivers/hv: introduce vmbus_channel_set_cpu() The core functionality in target_cpu_store() is also needed in a subsequent patch for automatically changing the CPU when taking a CPU offline. As such, factor out the body of target_cpu_store() into new function vmbus_channel_set_cpu() that can also be used elsewhere. No functional change is intended. Cc: Boqun Feng Cc: Michael Kelley Cc: Wei Liu Signed-off-by: Hamza Mahfooz Reviewed-by: Michael Kelley Tested-by: Michael Kelley Link: https://lore.kernel.org/r/20250117203309.192072-2-hamzamahfooz@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20250117203309.192072-2-hamzamahfooz@linux.microsoft.com> --- drivers/hv/vmbus_drv.c | 52 +++++++++++++++++++++++++----------------- include/linux/hyperv.h | 1 + 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 0f6cd44fff29..75eb1390b45c 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1611,18 +1611,18 @@ static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf) { return sprintf(buf, "%u\n", channel->target_cpu); } -static ssize_t target_cpu_store(struct vmbus_channel *channel, - const char *buf, size_t count) + +int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu) { - u32 target_cpu, origin_cpu; - ssize_t ret = count; + u32 origin_cpu; + int ret = 0; + + lockdep_assert_cpus_held(); + lockdep_assert_held(&vmbus_connection.channel_mutex); if (vmbus_proto_version < VERSION_WIN10_V4_1) return -EIO; - if (sscanf(buf, "%uu", &target_cpu) != 1) - return -EIO; - /* Validate target_cpu for the cpumask_test_cpu() operation below. */ if (target_cpu >= nr_cpumask_bits) return -EINVAL; @@ -1630,22 +1630,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, if (!cpumask_test_cpu(target_cpu, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ))) return -EINVAL; - /* No CPUs should come up or down during this. */ - cpus_read_lock(); - - if (!cpu_online(target_cpu)) { - cpus_read_unlock(); + if (!cpu_online(target_cpu)) return -EINVAL; - } /* - * Synchronizes target_cpu_store() and channel closure: + * Synchronizes vmbus_channel_set_cpu() and channel closure: * * { Initially: state = CHANNEL_OPENED } * * CPU1 CPU2 * - * [target_cpu_store()] [vmbus_disconnect_ring()] + * [vmbus_channel_set_cpu()] [vmbus_disconnect_ring()] * * LOCK channel_mutex LOCK channel_mutex * LOAD r1 = state LOAD r2 = state @@ -1660,7 +1655,6 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, * Note. The host processes the channel messages "sequentially", in * the order in which they are received on a per-partition basis. 
*/ - mutex_lock(&vmbus_connection.channel_mutex); /* * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels; @@ -1668,17 +1662,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, */ if (channel->state != CHANNEL_OPENED_STATE) { ret = -EIO; - goto cpu_store_unlock; + goto end; } origin_cpu = channel->target_cpu; if (target_cpu == origin_cpu) - goto cpu_store_unlock; + goto end; if (vmbus_send_modifychannel(channel, hv_cpu_number_to_vp_number(target_cpu))) { ret = -EIO; - goto cpu_store_unlock; + goto end; } /* @@ -1708,10 +1702,26 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, origin_cpu, target_cpu); } -cpu_store_unlock: +end: + return ret; +} + +static ssize_t target_cpu_store(struct vmbus_channel *channel, + const char *buf, size_t count) +{ + u32 target_cpu; + ssize_t ret; + + if (sscanf(buf, "%uu", &target_cpu) != 1) + return -EIO; + + cpus_read_lock(); + mutex_lock(&vmbus_connection.channel_mutex); + ret = vmbus_channel_set_cpu(channel, target_cpu); mutex_unlock(&vmbus_connection.channel_mutex); cpus_read_unlock(); - return ret; + + return ret ?: count; } static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 4179add2864b..7f4f8d8bdf43 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1661,6 +1661,7 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, const guid_t *shv_host_servie_id); int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp); void vmbus_set_event(struct vmbus_channel *channel); +int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu); /* Get the start of the ring buffer. */ static inline void * From 3a7f7785eae7cf012af128ca9e383c91e4955354 Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Fri, 17 Jan 2025 15:33:08 -0500 Subject: [PATCH 05/24] drivers/hv: add CPU offlining support Currently, it is tedious to offline CPUs in a Hyper-V VM since CPUs may have VMBus channels attached to them that a user would have to manually rebind elsewhere. So, as made mention of in commit d570aec0f2154 ("Drivers: hv: vmbus: Synchronize init_vp_index() vs. CPU hotplug"), rebind channels associated with CPUs that a user is trying to offline to a new "randomly" selected CPU. Cc: Boqun Feng Cc: Michael Kelley Cc: Wei Liu Signed-off-by: Hamza Mahfooz Reviewed-by: Michael Kelley Tested-by: Michael Kelley Link: https://lore.kernel.org/r/20250117203309.192072-3-hamzamahfooz@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20250117203309.192072-3-hamzamahfooz@linux.microsoft.com> --- drivers/hv/hv.c | 72 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 36d9ba097ff5..fab0690b5c41 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -433,13 +433,47 @@ retry: return pending; } +static int hv_pick_new_cpu(struct vmbus_channel *channel) +{ + int ret = -EBUSY; + int start; + int cpu; + + lockdep_assert_cpus_held(); + lockdep_assert_held(&vmbus_connection.channel_mutex); + + /* + * We can't assume that the relevant interrupts will be sent before + * the cpu is offlined on older versions of hyperv. 
+ */ + if (vmbus_proto_version < VERSION_WIN10_V5_3) + return -EBUSY; + + start = get_random_u32_below(nr_cpu_ids); + + for_each_cpu_wrap(cpu, cpu_online_mask, start) { + if (channel->target_cpu == cpu || + channel->target_cpu == VMBUS_CONNECT_CPU) + continue; + + ret = vmbus_channel_set_cpu(channel, cpu); + if (!ret) + break; + } + + if (ret) + ret = vmbus_channel_set_cpu(channel, VMBUS_CONNECT_CPU); + + return ret; +} + /* * hv_synic_cleanup - Cleanup routine for hv_synic_init(). */ int hv_synic_cleanup(unsigned int cpu) { struct vmbus_channel *channel, *sc; - bool channel_found = false; + int ret = 0; if (vmbus_connection.conn_state != CONNECTED) goto always_cleanup; @@ -456,38 +490,34 @@ int hv_synic_cleanup(unsigned int cpu) /* * Search for channels which are bound to the CPU we're about to - * cleanup. In case we find one and vmbus is still connected, we - * fail; this will effectively prevent CPU offlining. - * - * TODO: Re-bind the channels to different CPUs. + * cleanup. */ mutex_lock(&vmbus_connection.channel_mutex); list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { if (channel->target_cpu == cpu) { - channel_found = true; - break; + ret = hv_pick_new_cpu(channel); + if (ret) { + mutex_unlock(&vmbus_connection.channel_mutex); + return ret; + } } list_for_each_entry(sc, &channel->sc_list, sc_list) { if (sc->target_cpu == cpu) { - channel_found = true; - break; + ret = hv_pick_new_cpu(sc); + if (ret) { + mutex_unlock(&vmbus_connection.channel_mutex); + return ret; + } } } - if (channel_found) - break; } mutex_unlock(&vmbus_connection.channel_mutex); - if (channel_found) - return -EBUSY; - /* - * channel_found == false means that any channels that were previously - * assigned to the CPU have been reassigned elsewhere with a call of - * vmbus_send_modifychannel(). Scan the event flags page looking for - * bits that are set and waiting with a timeout for vmbus_chan_sched() - * to process such bits. If bits are still set after this operation - * and VMBus is connected, fail the CPU offlining operation. + * Scan the event flags page looking for bits that are set and waiting + * with a timeout for vmbus_chan_sched() to process such bits. If bits + * are still set after this operation and VMBus is connected, fail the + * CPU offlining operation. */ if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending()) return -EBUSY; @@ -497,5 +527,5 @@ always_cleanup: hv_synic_disable_regs(cpu); - return 0; + return ret; } From 9d8731a1757bef8630cb47e5ae3a1abbcf863e90 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 21 Feb 2025 11:56:33 -0800 Subject: [PATCH 06/24] hyperv: Convert hypercall statuses to linux error codes Return linux-friendly error codes from hypercall helper functions, which allows them to be used more flexibly. Introduce hv_result_to_errno() for this purpose, which also handles the special value U64_MAX returned from hv_do_hypercall(). 
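As an illustrative sketch only (HVCALL_EXAMPLE and hv_example_call() are hypothetical), a hypercall wrapper can now hand a plain errno back to its callers instead of a raw HV_STATUS_* value:

    static int hv_example_call(void *input, void *output)
    {
            u64 status = hv_do_hypercall(HVCALL_EXAMPLE, input, output);

            if (!hv_result_success(status))
                    pr_err("example hypercall failed: %#x\n", hv_result(status));

            /* 0 on success, -EOPNOTSUPP if hypercalls are unavailable, else -EINVAL/-ENOMEM/-EIO */
            return hv_result_to_errno(status);
    }
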
Signed-off-by: Nuno Das Neves Reviewed-by: Easwar Hariharan Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/1740167795-13296-2-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1740167795-13296-2-git-send-email-nunodasneves@linux.microsoft.com> --- drivers/hv/hv_common.c | 34 ++++++++++++++++++++++++++++++++++ drivers/hv/hv_proc.c | 10 +++++----- include/asm-generic/mshyperv.h | 1 + 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index ee3083937b4f..5cf9894b9e79 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -683,3 +683,37 @@ u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2) return HV_STATUS_INVALID_PARAMETER; } EXPORT_SYMBOL_GPL(hv_tdx_hypercall); + +/* Convert a hypercall result into a linux-friendly error code. */ +int hv_result_to_errno(u64 status) +{ + /* hv_do_hypercall() may return U64_MAX, hypercalls aren't possible */ + if (unlikely(status == U64_MAX)) + return -EOPNOTSUPP; + /* + * A failed hypercall is usually only recoverable (or loggable) near + * the call site where the HV_STATUS_* code is known. So the errno + * it gets converted to is not too useful further up the stack. + * Provide a few mappings that could be useful, and revert to -EIO + * as a fallback. + */ + switch (hv_result(status)) { + case HV_STATUS_SUCCESS: + return 0; + case HV_STATUS_INVALID_HYPERCALL_CODE: + case HV_STATUS_INVALID_HYPERCALL_INPUT: + case HV_STATUS_INVALID_PARAMETER: + case HV_STATUS_INVALID_PARTITION_ID: + case HV_STATUS_INVALID_VP_INDEX: + case HV_STATUS_INVALID_PORT_ID: + case HV_STATUS_INVALID_CONNECTION_ID: + case HV_STATUS_INVALID_LP_INDEX: + case HV_STATUS_INVALID_REGISTER_VALUE: + return -EINVAL; + case HV_STATUS_INSUFFICIENT_MEMORY: + return -ENOMEM; + default: + break; + } + return -EIO; +} diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c index 3e410489f480..2fae18e4f7d2 100644 --- a/drivers/hv/hv_proc.c +++ b/drivers/hv/hv_proc.c @@ -88,7 +88,7 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) local_irq_restore(flags); if (!hv_result_success(status)) { pr_err("Failed to deposit pages: %lld\n", status); - ret = hv_result(status); + ret = hv_result_to_errno(status); goto err_free_allocations; } @@ -114,7 +114,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) struct hv_output_add_logical_processor *output; u64 status; unsigned long flags; - int ret = HV_STATUS_SUCCESS; + int ret = 0; /* * When adding a logical processor, the hypervisor may return @@ -139,7 +139,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) if (!hv_result_success(status)) { pr_err("%s: cpu %u apic ID %u, %lld\n", __func__, lp_index, apic_id, status); - ret = hv_result(status); + ret = hv_result_to_errno(status); } break; } @@ -154,7 +154,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) struct hv_create_vp *input; u64 status; unsigned long irq_flags; - int ret = HV_STATUS_SUCCESS; + int ret = 0; /* Root VPs don't seem to need pages deposited */ if (partition_id != hv_current_partition_id) { @@ -181,7 +181,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) if (!hv_result_success(status)) { pr_err("%s: vcpu %u, lp %u, %lld\n", __func__, vp_index, flags, status); - ret = hv_result(status); + ret = hv_result_to_errno(status); } break; } diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 7adc10a4fa3e..3f115e2bcdaa 100644 
--- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -297,6 +297,7 @@ static inline int cpumask_to_vpset_skip(struct hv_vpset *vpset, return __cpumask_to_vpset(vpset, cpus, func); } +int hv_result_to_errno(u64 status); void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die); bool hv_is_hyperv_initialized(void); bool hv_is_hibernation_supported(void); From db912b8954c23a55dbc6dc683e0e06ffcb433848 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 21 Feb 2025 11:56:34 -0800 Subject: [PATCH 07/24] hyperv: Change hv_root_partition into a function Introduce hv_curr_partition_type to store the partition type as an enum. Right now this is limited to guest or root partition, but there will be other kinds in future and the enum is easily extensible. Set up hv_curr_partition_type early in Hyper-V initialization with hv_identify_partition_type(). hv_root_partition() just queries this value, and shouldn't be called before that. Making this check into a function sets the stage for adding a config option to gate the compilation of root partition code. In particular, hv_root_partition() can be stubbed out always be false if root partition support isn't desired. Signed-off-by: Nuno Das Neves Reviewed-by: Easwar Hariharan Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/1740167795-13296-3-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1740167795-13296-3-git-send-email-nunodasneves@linux.microsoft.com> --- arch/arm64/hyperv/mshyperv.c | 2 ++ arch/x86/hyperv/hv_init.c | 10 +++++----- arch/x86/kernel/cpu/mshyperv.c | 24 ++-------------------- drivers/clocksource/hyperv_timer.c | 4 ++-- drivers/hv/hv.c | 10 +++++----- drivers/hv/hv_common.c | 32 ++++++++++++++++++++++++------ drivers/hv/vmbus_drv.c | 2 +- drivers/iommu/hyperv-iommu.c | 4 ++-- include/asm-generic/mshyperv.h | 15 ++++++++++++-- 9 files changed, 58 insertions(+), 45 deletions(-) diff --git a/arch/arm64/hyperv/mshyperv.c b/arch/arm64/hyperv/mshyperv.c index 29fcfd595f48..2265ea5ce5ad 100644 --- a/arch/arm64/hyperv/mshyperv.c +++ b/arch/arm64/hyperv/mshyperv.c @@ -61,6 +61,8 @@ static int __init hyperv_init(void) ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints, ms_hyperv.misc_features); + hv_identify_partition_type(); + ret = hv_common_init(); if (ret) return ret; diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 9be1446f5bd3..ddeb40930bc8 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -90,7 +90,7 @@ static int hv_cpu_init(unsigned int cpu) return 0; hvp = &hv_vp_assist_page[cpu]; - if (hv_root_partition) { + if (hv_root_partition()) { /* * For root partition we get the hypervisor provided VP assist * page, instead of allocating a new page. 
@@ -242,7 +242,7 @@ static int hv_cpu_die(unsigned int cpu) if (hv_vp_assist_page && hv_vp_assist_page[cpu]) { union hv_vp_assist_msr_contents msr = { 0 }; - if (hv_root_partition) { + if (hv_root_partition()) { /* * For root partition the VP assist page is mapped to * hypervisor provided page, and thus we unmap the @@ -317,7 +317,7 @@ static int hv_suspend(void) union hv_x64_msr_hypercall_contents hypercall_msr; int ret; - if (hv_root_partition) + if (hv_root_partition()) return -EPERM; /* @@ -518,7 +518,7 @@ void __init hyperv_init(void) rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; - if (hv_root_partition) { + if (hv_root_partition()) { struct page *pg; void *src; @@ -592,7 +592,7 @@ skip_hypercall_pg_init: * If we're running as root, we want to create our own PCI MSI domain. * We can't set this in hv_pci_init because that would be too late. */ - if (hv_root_partition) + if (hv_root_partition()) x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain; #endif diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index f285757618fc..4f01f424ea5b 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -33,8 +33,6 @@ #include #include -/* Is Linux running as the root partition? */ -bool hv_root_partition; /* Is Linux running on nested Microsoft Hypervisor */ bool hv_nested; struct ms_hyperv_info ms_hyperv; @@ -451,25 +449,7 @@ static void __init ms_hyperv_init_platform(void) pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); - /* - * Check CPU management privilege. - * - * To mirror what Windows does we should extract CPU management - * features and use the ReservedIdentityBit to detect if Linux is the - * root partition. But that requires negotiating CPU management - * interface (a process to be finalized). For now, use the privilege - * flag as the indicator for running as root. - * - * Hyper-V should never specify running as root and as a Confidential - * VM. But to protect against a compromised/malicious Hyper-V trying - * to exploit root behavior to expose Confidential VM memory, ignore - * the root partition setting if also a Confidential VM. - */ - if ((ms_hyperv.priv_high & HV_CPU_MANAGEMENT) && - !(ms_hyperv.priv_high & HV_ISOLATION)) { - hv_root_partition = true; - pr_info("Hyper-V: running as root partition\n"); - } + hv_identify_partition_type(); if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) { hv_nested = true; @@ -618,7 +598,7 @@ static void __init ms_hyperv_init_platform(void) # ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; - if (hv_root_partition || + if (hv_root_partition() || (!ms_hyperv.paravisor_present && hv_isolation_type_snp())) smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus; # endif diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index f00019b078a7..09549451dd51 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -582,7 +582,7 @@ static void __init hv_init_tsc_clocksource(void) * mapped. 
*/ tsc_msr.as_uint64 = hv_get_msr(HV_MSR_REFERENCE_TSC); - if (hv_root_partition) + if (hv_root_partition()) tsc_pfn = tsc_msr.pfn; else tsc_pfn = HVPFN_DOWN(virt_to_phys(tsc_page)); @@ -627,7 +627,7 @@ void __init hv_remap_tsc_clocksource(void) if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) return; - if (!hv_root_partition) { + if (!hv_root_partition()) { WARN(1, "%s: attempt to remap TSC page in guest partition\n", __func__); return; diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index fab0690b5c41..a38f84548bc2 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -144,7 +144,7 @@ int hv_synic_alloc(void) * Synic message and event pages are allocated by paravisor. * Skip these pages allocation here. */ - if (!ms_hyperv.paravisor_present && !hv_root_partition) { + if (!ms_hyperv.paravisor_present && !hv_root_partition()) { hv_cpu->synic_message_page = (void *)get_zeroed_page(GFP_ATOMIC); if (!hv_cpu->synic_message_page) { @@ -272,7 +272,7 @@ void hv_synic_enable_regs(unsigned int cpu) simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); simp.simp_enabled = 1; - if (ms_hyperv.paravisor_present || hv_root_partition) { + if (ms_hyperv.paravisor_present || hv_root_partition()) { /* Mask out vTOM bit. ioremap_cache() maps decrypted */ u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; @@ -291,7 +291,7 @@ void hv_synic_enable_regs(unsigned int cpu) siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP); siefp.siefp_enabled = 1; - if (ms_hyperv.paravisor_present || hv_root_partition) { + if (ms_hyperv.paravisor_present || hv_root_partition()) { /* Mask out vTOM bit. ioremap_cache() maps decrypted */ u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; @@ -367,7 +367,7 @@ void hv_synic_disable_regs(unsigned int cpu) * addresses. */ simp.simp_enabled = 0; - if (ms_hyperv.paravisor_present || hv_root_partition) { + if (ms_hyperv.paravisor_present || hv_root_partition()) { iounmap(hv_cpu->synic_message_page); hv_cpu->synic_message_page = NULL; } else { @@ -379,7 +379,7 @@ void hv_synic_disable_regs(unsigned int cpu) siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP); siefp.siefp_enabled = 0; - if (ms_hyperv.paravisor_present || hv_root_partition) { + if (ms_hyperv.paravisor_present || hv_root_partition()) { iounmap(hv_cpu->synic_event_page); hv_cpu->synic_event_page = NULL; } else { diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 5cf9894b9e79..3d9cfcfbc854 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -34,8 +34,11 @@ u64 hv_current_partition_id = HV_PARTITION_ID_SELF; EXPORT_SYMBOL_GPL(hv_current_partition_id); +enum hv_partition_type hv_curr_partition_type; +EXPORT_SYMBOL_GPL(hv_curr_partition_type); + /* - * hv_root_partition, ms_hyperv and hv_nested are defined here with other + * ms_hyperv and hv_nested are defined here with other * Hyper-V specific globals so they are shared across all architectures and are * built only when CONFIG_HYPERV is defined. But on x86, * ms_hyperv_init_platform() is built even when CONFIG_HYPERV is not @@ -43,9 +46,6 @@ EXPORT_SYMBOL_GPL(hv_current_partition_id); * here, allowing for an overriding definition in the module containing * ms_hyperv_init_platform(). 
*/ -bool __weak hv_root_partition; -EXPORT_SYMBOL_GPL(hv_root_partition); - bool __weak hv_nested; EXPORT_SYMBOL_GPL(hv_nested); @@ -283,7 +283,7 @@ static void hv_kmsg_dump_register(void) static inline bool hv_output_page_exists(void) { - return hv_root_partition || IS_ENABLED(CONFIG_HYPERV_VTL_MODE); + return hv_root_partition() || IS_ENABLED(CONFIG_HYPERV_VTL_MODE); } void __init hv_get_partition_id(void) @@ -594,7 +594,7 @@ EXPORT_SYMBOL_GPL(hv_setup_dma_ops); bool hv_is_hibernation_supported(void) { - return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4); + return !hv_root_partition() && acpi_sleep_state_supported(ACPI_STATE_S4); } EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); @@ -717,3 +717,23 @@ int hv_result_to_errno(u64 status) } return -EIO; } + +void hv_identify_partition_type(void) +{ + /* Assume guest role */ + hv_curr_partition_type = HV_PARTITION_TYPE_GUEST; + /* + * Check partition creation and cpu management privileges + * + * Hyper-V should never specify running as root and as a Confidential + * VM. But to protect against a compromised/malicious Hyper-V trying + * to exploit root behavior to expose Confidential VM memory, ignore + * the root partition setting if also a Confidential VM. + */ + if ((ms_hyperv.priv_high & HV_CREATE_PARTITIONS) && + (ms_hyperv.priv_high & HV_CPU_MANAGEMENT) && + !(ms_hyperv.priv_high & HV_ISOLATION)) { + pr_info("Hyper-V: running as root partition\n"); + hv_curr_partition_type = HV_PARTITION_TYPE_ROOT; + } +} diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 75eb1390b45c..22afebfc28ff 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2656,7 +2656,7 @@ static int __init hv_acpi_init(void) if (!hv_is_hyperv_initialized()) return -ENODEV; - if (hv_root_partition && !hv_nested) + if (hv_root_partition() && !hv_nested) return 0; /* diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index 2a86aa5d54c6..53e4b37716af 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -130,7 +130,7 @@ static int __init hyperv_prepare_irq_remapping(void) x86_init.hyper.msi_ext_dest_id()) return -ENODEV; - if (hv_root_partition) { + if (hv_root_partition()) { name = "HYPERV-ROOT-IR"; ops = &hyperv_root_ir_domain_ops; } else { @@ -151,7 +151,7 @@ static int __init hyperv_prepare_irq_remapping(void) return -ENOMEM; } - if (hv_root_partition) + if (hv_root_partition()) return 0; /* The rest is only relevant to guests */ /* diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 3f115e2bcdaa..54ebd630e72c 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -28,6 +28,11 @@ #define VTPM_BASE_ADDRESS 0xfed40000 +enum hv_partition_type { + HV_PARTITION_TYPE_GUEST, + HV_PARTITION_TYPE_ROOT, +}; + struct ms_hyperv_info { u32 features; u32 priv_high; @@ -59,6 +64,7 @@ struct ms_hyperv_info { extern struct ms_hyperv_info ms_hyperv; extern bool hv_nested; extern u64 hv_current_partition_id; +extern enum hv_partition_type hv_curr_partition_type; extern void * __percpu *hyperv_pcpu_input_arg; extern void * __percpu *hyperv_pcpu_output_arg; @@ -190,8 +196,6 @@ void hv_remove_crash_handler(void); extern int vmbus_interrupt; extern int vmbus_irq; -extern bool hv_root_partition; - #if IS_ENABLED(CONFIG_HYPERV) /* * Hypervisor's notion of virtual processor ID is different from @@ -213,6 +217,7 @@ void __init hv_common_free(void); void __init ms_hyperv_late_init(void); int hv_common_cpu_init(unsigned int cpu); int 
hv_common_cpu_die(unsigned int cpu); +void hv_identify_partition_type(void); void *hv_alloc_hyperv_page(void); void *hv_alloc_hyperv_zeroed_page(void); @@ -310,6 +315,7 @@ void hyperv_cleanup(void); bool hv_query_ext_cap(u64 cap_query); void hv_setup_dma_ops(struct device *dev, bool coherent); #else /* CONFIG_HYPERV */ +static inline void hv_identify_partition_type(void) {} static inline bool hv_is_hyperv_initialized(void) { return false; } static inline bool hv_is_hibernation_supported(void) { return false; } static inline void hyperv_cleanup(void) {} @@ -321,4 +327,9 @@ static inline enum hv_isolation_type hv_get_isolation_type(void) } #endif /* CONFIG_HYPERV */ +static inline bool hv_root_partition(void) +{ + return hv_curr_partition_type == HV_PARTITION_TYPE_ROOT; +} + #endif From 461fbbd036b11d755b50e2ef7c165859a0f908d5 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 21 Feb 2025 11:56:35 -0800 Subject: [PATCH 08/24] hyperv: Add CONFIG_MSHV_ROOT to gate root partition support CONFIG_MSHV_ROOT allows kernels built to run as a normal Hyper-V guest to exclude the root partition code, which is expected to grow significantly over time. This option is a tristate so future driver code can be built as a (m)odule, allowing faster development iteration cycles. If CONFIG_MSHV_ROOT is disabled, don't compile hv_proc.c, and stub hv_root_partition() to return false unconditionally. This allows the compiler to optimize away root partition code blocks since they will be disabled at compile time. In the case of booting as root partition *without* CONFIG_MSHV_ROOT enabled, print a critical error (the kernel will likely crash). Signed-off-by: Nuno Das Neves Reviewed-by: Easwar Hariharan Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/1740167795-13296-4-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1740167795-13296-4-git-send-email-nunodasneves@linux.microsoft.com> --- drivers/hv/Kconfig | 16 ++++++++++++++++ drivers/hv/Makefile | 3 ++- drivers/hv/hv_common.c | 5 ++++- include/asm-generic/mshyperv.h | 24 ++++++++++++++++++++---- 4 files changed, 42 insertions(+), 6 deletions(-) diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 862c47b191af..3118d5472fab 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -55,4 +55,20 @@ config HYPERV_BALLOON help Select this option to enable Hyper-V Balloon driver. +config MSHV_ROOT + tristate "Microsoft Hyper-V root partition support" + depends on HYPERV && (X86_64 || ARM64) + depends on !HYPERV_VTL_MODE + # The hypervisor interface operates on 4k pages. Enforcing it here + # simplifies many assumptions in the root partition code. + # e.g. When withdrawing memory, the hypervisor gives back 4k pages in + # no particular order, making it impossible to reassemble larger pages + depends on PAGE_SIZE_4KB + default n + help + Select this option to enable support for booting and running as root + partition on Microsoft Hyper-V. + + If unsure, say N. 
+ endmenu diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index 9afcabb3fbd2..2b8dc954b350 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -13,4 +13,5 @@ hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o # Code that must be built-in -obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o hv_proc.o +obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o +obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 3d9cfcfbc854..9804adb4cc56 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -734,6 +734,9 @@ void hv_identify_partition_type(void) (ms_hyperv.priv_high & HV_CPU_MANAGEMENT) && !(ms_hyperv.priv_high & HV_ISOLATION)) { pr_info("Hyper-V: running as root partition\n"); - hv_curr_partition_type = HV_PARTITION_TYPE_ROOT; + if (IS_ENABLED(CONFIG_MSHV_ROOT)) + hv_curr_partition_type = HV_PARTITION_TYPE_ROOT; + else + pr_crit("Hyper-V: CONFIG_MSHV_ROOT not enabled!\n"); } } diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 54ebd630e72c..b13b0cda4ac8 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -223,10 +223,6 @@ void *hv_alloc_hyperv_page(void); void *hv_alloc_hyperv_zeroed_page(void); void hv_free_hyperv_page(void *addr); -int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); -int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); -int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); - /** * hv_cpu_number_to_vp_number() - Map CPU to VP. * @cpu_number: CPU number in Linux terms @@ -327,9 +323,29 @@ static inline enum hv_isolation_type hv_get_isolation_type(void) } #endif /* CONFIG_HYPERV */ +#if IS_ENABLED(CONFIG_MSHV_ROOT) static inline bool hv_root_partition(void) { return hv_curr_partition_type == HV_PARTITION_TYPE_ROOT; } +int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); +int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); +int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); + +#else /* CONFIG_MSHV_ROOT */ +static inline bool hv_root_partition(void) { return false; } +static inline int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) +{ + return -EOPNOTSUPP; +} +static inline int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id) +{ + return -EOPNOTSUPP; +} +static inline int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_MSHV_ROOT */ #endif From fe14262695526145334bfa0bb51fcc365cf6dfb5 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 11 Mar 2025 10:16:34 +0100 Subject: [PATCH 09/24] hyperv: Remove unused union and structs The union vmpacket_largest_possible_header and several structs have not been used for a long time afaict - remove them. 
Reviewed-by: Michael Kelley Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20250311091634.494888-2-thorsten.blum@linux.dev Signed-off-by: Wei Liu Message-ID: <20250311091634.494888-2-thorsten.blum@linux.dev> --- include/linux/hyperv.h | 56 ------------------------------------------ 1 file changed, 56 deletions(-) diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 7f4f8d8bdf43..675959fb97ba 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -371,19 +371,6 @@ struct vmtransfer_page_packet_header { struct vmtransfer_page_range ranges[]; } __packed; -struct vmgpadl_packet_header { - struct vmpacket_descriptor d; - u32 gpadl; - u32 reserved; -} __packed; - -struct vmadd_remove_transfer_page_set { - struct vmpacket_descriptor d; - u32 gpadl; - u16 xfer_pageset_id; - u16 reserved; -} __packed; - /* * This structure defines a range in guest physical space that can be made to * look virtually contiguous. @@ -394,30 +381,6 @@ struct gpa_range { u64 pfn_array[]; }; -/* - * This is the format for an Establish Gpadl packet, which contains a handle by - * which this GPADL will be known and a set of GPA ranges associated with it. - * This can be converted to a MDL by the guest OS. If there are multiple GPA - * ranges, then the resulting MDL will be "chained," representing multiple VA - * ranges. - */ -struct vmestablish_gpadl { - struct vmpacket_descriptor d; - u32 gpadl; - u32 range_cnt; - struct gpa_range range[1]; -} __packed; - -/* - * This is the format for a Teardown Gpadl packet, which indicates that the - * GPADL handle in the Establish Gpadl packet will never be referenced again. - */ -struct vmteardown_gpadl { - struct vmpacket_descriptor d; - u32 gpadl; - u32 reserved; /* for alignment to a 8-byte boundary */ -} __packed; - /* * This is the format for a GPA-Direct packet, which contains a set of GPA * ranges, in addition to commands and/or data. @@ -429,25 +392,6 @@ struct vmdata_gpa_direct { struct gpa_range range[1]; } __packed; -/* This is the format for a Additional Data Packet. */ -struct vmadditional_data { - struct vmpacket_descriptor d; - u64 total_bytes; - u32 offset; - u32 byte_cnt; - unsigned char data[1]; -} __packed; - -union vmpacket_largest_possible_header { - struct vmpacket_descriptor simple_hdr; - struct vmtransfer_page_packet_header xfer_page_hdr; - struct vmgpadl_packet_header gpadl_hdr; - struct vmadd_remove_transfer_page_set add_rm_xfer_page_hdr; - struct vmestablish_gpadl establish_gpadl_hdr; - struct vmteardown_gpadl teardown_gpadl_hdr; - struct vmdata_gpa_direct data_gpa_direct_hdr; -}; - #define VMPACKET_DATA_START_ADDRESS(__packet) \ (void *)(((unsigned char *)__packet) + \ ((struct vmpacket_descriptor)__packet)->offset8 * 8) From ced518ad55b4118dd77f8a455a06801b89f2f877 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Thu, 27 Feb 2025 13:47:27 -0800 Subject: [PATCH 10/24] x86/hyperv: Add VTL mode emergency restart callback By default, X86(-64) systems use the emergecy restart routine in the course of which the code unconditionally writes to the physical address of 0x472 to indicate the boot mode to the firmware (BIOS or UEFI). When the kernel itself runs as a firmware in the VTL mode, that write corrupts the memory of the guest upon emergency restarting. Preserving the state intact in that situation is important for debugging, at least. Define the specialized machine callback to avoid that write and use the triple fault to perform emergency restart. 
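For context only (abbreviated from arch/x86/kernel/reboot.c, not part of this patch), the default path being replaced does roughly the following; the store is harmless on bare metal but lands in partition memory when the kernel itself acts as the VTL "firmware":

    /* Tell the BIOS if we want cold or warm reboot */
    mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0;
    *((unsigned short *)__va(0x472)) = mode;
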
Signed-off-by: Roman Kisel Link: https://lore.kernel.org/r/20250227214728.15672-2-romank@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20250227214728.15672-2-romank@linux.microsoft.com> --- arch/x86/hyperv/hv_vtl.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 4e1b1e3b5658..4421b75ad9a9 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -12,6 +12,7 @@ #include #include #include +#include #include <../kernel/smpboot.h> extern struct boot_params boot_params; @@ -22,6 +23,27 @@ static bool __init hv_vtl_msi_ext_dest_id(void) return true; } +/* + * The `native_machine_emergency_restart` function from `reboot.c` writes + * to the physical address 0x472 to indicate the type of reboot for the + * firmware. We cannot have that in VSM as the memory composition might + * be more generic, and such write effectively corrupts the memory thus + * making diagnostics harder at the very least. + */ +static void __noreturn hv_vtl_emergency_restart(void) +{ + /* + * Cause a triple fault and the immediate reset. Here the code does not run + * on the top of any firmware, whereby cannot reach out to its services. + * The inifinite loop is for the improbable case that the triple fault does + * not work and have to preserve the state intact for debugging. + */ + for (;;) { + idt_invalidate(); + __asm__ __volatile__("int3"); + } +} + void __init hv_vtl_init_platform(void) { pr_info("Linux runs in Hyper-V Virtual Trust Level\n"); @@ -235,6 +257,7 @@ static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) int __init hv_vtl_early_init(void) { + machine_ops.emergency_restart = hv_vtl_emergency_restart; /* * `boot_cpu_has` returns the runtime feature support, * and here is the earliest it can be used. From 07b74192e6170571892c09bf5c4352a85db22557 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Thu, 27 Feb 2025 13:47:28 -0800 Subject: [PATCH 11/24] x86/hyperv: Add VTL mode callback for restarting the system The kernel runs as a firmware in the VTL mode, and the only way to restart in the VTL mode on x86 is to triple fault. Thus, one has to always supply "reboot=t" on the kernel command line in the VTL mode, and missing that renders rebooting not working. Define the machine restart callback to always use the triple fault to provide the robust configuration by default. Signed-off-by: Roman Kisel Link: https://lore.kernel.org/r/20250227214728.15672-3-romank@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20250227214728.15672-3-romank@linux.microsoft.com> --- arch/x86/hyperv/hv_vtl.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 4421b75ad9a9..582fe820e29c 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -44,6 +44,15 @@ static void __noreturn hv_vtl_emergency_restart(void) } } +/* + * The only way to restart in the VTL mode is to triple fault as the kernel runs + * as firmware. 
+ */ +static void __noreturn hv_vtl_restart(char __maybe_unused *cmd) +{ + hv_vtl_emergency_restart(); +} + void __init hv_vtl_init_platform(void) { pr_info("Linux runs in Hyper-V Virtual Trust Level\n"); @@ -258,6 +267,8 @@ static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) int __init hv_vtl_early_init(void) { machine_ops.emergency_restart = hv_vtl_emergency_restart; + machine_ops.restart = hv_vtl_restart; + /* * `boot_cpu_has` returns the runtime feature support, * and here is the earliest it can be used. From e792d843aa3c9d039074cdce728d5803262e57a7 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 13 Mar 2025 04:52:17 -0400 Subject: [PATCH 12/24] x86/hyperv: Fix check of return value from snp_set_vmsa() snp_set_vmsa() returns 0 on success, so fix the check accordingly. Cc: stable@vger.kernel.org Fixes: 44676bb9d566 ("x86/hyperv: Add smp support for SEV-SNP guest") Signed-off-by: Tianyu Lan Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20250313085217.45483-1-ltykernel@gmail.com Signed-off-by: Wei Liu Message-ID: <20250313085217.45483-1-ltykernel@gmail.com> --- arch/x86/hyperv/ivm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index dd68d9ad9b22..c0039a90e9e0 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -338,7 +338,7 @@ int hv_snp_boot_ap(u32 cpu, unsigned long start_ip) vmsa->sev_features = sev_status >> 2; ret = snp_set_vmsa(vmsa, true); - if (!ret) { + if (ret) { pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret); free_page((u64)vmsa); return ret; From 3817854ba892016ddb03ee31208e9f8e440f5bee Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 14 Mar 2025 12:28:47 -0700 Subject: [PATCH 13/24] hyperv: Log hypercall status codes as strings Introduce hv_status_printk() macros as a convenience to log hypercall errors, formatting them with the status code (HV_STATUS_*) as a raw hex value and also as a string, which saves some time while debugging. Create a table of HV_STATUS_ codes with strings and mapped errnos, and use it for hv_result_to_string() and hv_result_to_errno(). Use the new hv_status_printk()s in hv_proc.c, hyperv-iommu.c, and irqdomain.c hypercalls to aid debugging in the root partition. 
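As a usage sketch (not part of the patch; HVCALL_EXAMPLE and vp_index are hypothetical), a call site stops formatting the raw status itself and lets the macro prepend __func__, the HV_STATUS_* value in hex and its string name:

    status = hv_do_hypercall(HVCALL_EXAMPLE, input, NULL);
    if (!hv_result_success(status))
            hv_status_err(status, "map failed for vp %u\n", vp_index);
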
Signed-off-by: Nuno Das Neves Reviewed-by: Stanislav Kinsburskii Link: https://lore.kernel.org/r/1741980536-3865-2-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1741980536-3865-2-git-send-email-nunodasneves@linux.microsoft.com> --- arch/x86/hyperv/irqdomain.c | 6 +- drivers/hv/hv_common.c | 129 ++++++++++++++++++++++++--------- drivers/hv/hv_proc.c | 10 +-- drivers/iommu/hyperv-iommu.c | 4 +- include/asm-generic/mshyperv.h | 13 ++++ 5 files changed, 118 insertions(+), 44 deletions(-) diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c index 64b921360b0f..31f0d29cbc5e 100644 --- a/arch/x86/hyperv/irqdomain.c +++ b/arch/x86/hyperv/irqdomain.c @@ -64,7 +64,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level, local_irq_restore(flags); if (!hv_result_success(status)) - pr_err("%s: hypercall failed, status %lld\n", __func__, status); + hv_status_err(status, "\n"); return hv_result(status); } @@ -224,7 +224,7 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) kfree(stored_entry); if (status != HV_STATUS_SUCCESS) { - pr_debug("%s: failed to unmap, status %lld", __func__, status); + hv_status_debug(status, "failed to unmap\n"); return; } } @@ -273,7 +273,7 @@ static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd) status = hv_unmap_msi_interrupt(dev, &old_entry); if (status != HV_STATUS_SUCCESS) - pr_err("%s: hypercall failed, status %lld\n", __func__, status); + hv_status_err(status, "\n"); } static void hv_msi_free_irq(struct irq_domain *domain, diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 9804adb4cc56..885bbc3d86d8 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -684,40 +684,6 @@ u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2) } EXPORT_SYMBOL_GPL(hv_tdx_hypercall); -/* Convert a hypercall result into a linux-friendly error code. */ -int hv_result_to_errno(u64 status) -{ - /* hv_do_hypercall() may return U64_MAX, hypercalls aren't possible */ - if (unlikely(status == U64_MAX)) - return -EOPNOTSUPP; - /* - * A failed hypercall is usually only recoverable (or loggable) near - * the call site where the HV_STATUS_* code is known. So the errno - * it gets converted to is not too useful further up the stack. - * Provide a few mappings that could be useful, and revert to -EIO - * as a fallback. - */ - switch (hv_result(status)) { - case HV_STATUS_SUCCESS: - return 0; - case HV_STATUS_INVALID_HYPERCALL_CODE: - case HV_STATUS_INVALID_HYPERCALL_INPUT: - case HV_STATUS_INVALID_PARAMETER: - case HV_STATUS_INVALID_PARTITION_ID: - case HV_STATUS_INVALID_VP_INDEX: - case HV_STATUS_INVALID_PORT_ID: - case HV_STATUS_INVALID_CONNECTION_ID: - case HV_STATUS_INVALID_LP_INDEX: - case HV_STATUS_INVALID_REGISTER_VALUE: - return -EINVAL; - case HV_STATUS_INSUFFICIENT_MEMORY: - return -ENOMEM; - default: - break; - } - return -EIO; -} - void hv_identify_partition_type(void) { /* Assume guest role */ @@ -740,3 +706,98 @@ void hv_identify_partition_type(void) pr_crit("Hyper-V: CONFIG_MSHV_ROOT not enabled!\n"); } } + +struct hv_status_info { + char *string; + int errno; + u16 code; +}; + +/* + * Note on the errno mappings: + * A failed hypercall is usually only recoverable (or loggable) near + * the call site where the HV_STATUS_* code is known. So the errno + * it gets converted to is not too useful further up the stack. + * Provide a few mappings that could be useful, and revert to -EIO + * as a fallback. 
+ */ +static const struct hv_status_info hv_status_infos[] = { +#define _STATUS_INFO(status, errno) { #status, (errno), (status) } + _STATUS_INFO(HV_STATUS_SUCCESS, 0), + _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_CODE, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_INPUT, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_ALIGNMENT, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_PARAMETER, -EINVAL), + _STATUS_INFO(HV_STATUS_ACCESS_DENIED, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_PARTITION_STATE, -EIO), + _STATUS_INFO(HV_STATUS_OPERATION_DENIED, -EIO), + _STATUS_INFO(HV_STATUS_UNKNOWN_PROPERTY, -EIO), + _STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE, -EIO), + _STATUS_INFO(HV_STATUS_INSUFFICIENT_MEMORY, -ENOMEM), + _STATUS_INFO(HV_STATUS_INVALID_PARTITION_ID, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_VP_INDEX, -EINVAL), + _STATUS_INFO(HV_STATUS_NOT_FOUND, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_PORT_ID, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_CONNECTION_ID, -EINVAL), + _STATUS_INFO(HV_STATUS_INSUFFICIENT_BUFFERS, -EIO), + _STATUS_INFO(HV_STATUS_NOT_ACKNOWLEDGED, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_VP_STATE, -EIO), + _STATUS_INFO(HV_STATUS_NO_RESOURCES, -EIO), + _STATUS_INFO(HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EIO), + _STATUS_INFO(HV_STATUS_OPERATION_FAILED, -EIO), + _STATUS_INFO(HV_STATUS_TIME_OUT, -EIO), + _STATUS_INFO(HV_STATUS_CALL_PENDING, -EIO), + _STATUS_INFO(HV_STATUS_VTL_ALREADY_ENABLED, -EIO), +#undef _STATUS_INFO +}; + +static inline const struct hv_status_info *find_hv_status_info(u64 hv_status) +{ + int i; + u16 code = hv_result(hv_status); + + for (i = 0; i < ARRAY_SIZE(hv_status_infos); ++i) { + const struct hv_status_info *info = &hv_status_infos[i]; + + if (info->code == code) + return info; + } + + return NULL; +} + +/* Convert a hypercall result into a linux-friendly error code. 
*/ +int hv_result_to_errno(u64 status) +{ + const struct hv_status_info *info; + + /* hv_do_hypercall() may return U64_MAX, hypercalls aren't possible */ + if (unlikely(status == U64_MAX)) + return -EOPNOTSUPP; + + info = find_hv_status_info(status); + if (info) + return info->errno; + + return -EIO; +} +EXPORT_SYMBOL_GPL(hv_result_to_errno); + +const char *hv_result_to_string(u64 status) +{ + const struct hv_status_info *info; + + if (unlikely(status == U64_MAX)) + return "Hypercall page missing!"; + + info = find_hv_status_info(status); + if (info) + return info->string; + + return "Unknown"; +} +EXPORT_SYMBOL_GPL(hv_result_to_string); diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c index 2fae18e4f7d2..605999f10e17 100644 --- a/drivers/hv/hv_proc.c +++ b/drivers/hv/hv_proc.c @@ -87,7 +87,7 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) page_count, 0, input_page, NULL); local_irq_restore(flags); if (!hv_result_success(status)) { - pr_err("Failed to deposit pages: %lld\n", status); + hv_status_err(status, "\n"); ret = hv_result_to_errno(status); goto err_free_allocations; } @@ -137,8 +137,8 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { if (!hv_result_success(status)) { - pr_err("%s: cpu %u apic ID %u, %lld\n", __func__, - lp_index, apic_id, status); + hv_status_err(status, "cpu %u apic ID: %u\n", + lp_index, apic_id); ret = hv_result_to_errno(status); } break; @@ -179,8 +179,8 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { if (!hv_result_success(status)) { - pr_err("%s: vcpu %u, lp %u, %lld\n", __func__, - vp_index, flags, status); + hv_status_err(status, "vcpu: %u, lp: %u\n", + vp_index, flags); ret = hv_result_to_errno(status); } break; diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index 53e4b37716af..761ab647f372 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -217,7 +217,7 @@ hyperv_root_ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg) status = hv_unmap_ioapic_interrupt(ioapic_id, &entry); if (status != HV_STATUS_SUCCESS) - pr_debug("%s: unexpected unmap status %lld\n", __func__, status); + hv_status_debug(status, "failed to unmap\n"); data->entry.ioapic_rte.as_uint64 = 0; data->entry.source = 0; /* Invalid source */ @@ -228,7 +228,7 @@ hyperv_root_ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg) vector, &entry); if (status != HV_STATUS_SUCCESS) { - pr_err("%s: map hypercall failed, status %lld\n", __func__, status); + hv_status_err(status, "map failed\n"); return; } diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index b13b0cda4ac8..250c65236919 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -298,6 +298,19 @@ static inline int cpumask_to_vpset_skip(struct hv_vpset *vpset, return __cpumask_to_vpset(vpset, cpus, func); } +#define _hv_status_fmt(fmt) "%s: Hyper-V status: %#x = %s: " fmt +#define hv_status_printk(level, status, fmt, ...) \ +do { \ + u64 __status = (status); \ + pr_##level(_hv_status_fmt(fmt), __func__, hv_result(__status), \ + hv_result_to_string(__status), ##__VA_ARGS__); \ +} while (0) +#define hv_status_err(status, fmt, ...) \ + hv_status_printk(err, status, fmt, ##__VA_ARGS__) +#define hv_status_debug(status, fmt, ...) 
\ + hv_status_printk(debug, status, fmt, ##__VA_ARGS__) + +const char *hv_result_to_string(u64 hv_status); int hv_result_to_errno(u64 status); void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die); bool hv_is_hyperv_initialized(void); From 8cac51796ecb162c437cc651f37152095af76591 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Fri, 14 Mar 2025 12:28:48 -0700 Subject: [PATCH 14/24] x86/mshyperv: Add support for extended Hyper-V features Extend the "ms_hyperv_info" structure to include a new field, "ext_features", for capturing extended Hyper-V features. Update the "ms_hyperv_init_platform" function to retrieve these features using the cpuid instruction and include them in the informational output. Signed-off-by: Stanislav Kinsburskii Signed-off-by: Nuno Das Neves Reviewed-by: Easwar Hariharan Reviewed-by: Roman Kisel Reviewed-by: Tianyu Lan Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/1741980536-3865-3-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1741980536-3865-3-git-send-email-nunodasneves@linux.microsoft.com> --- arch/x86/kernel/cpu/mshyperv.c | 6 ++++-- include/asm-generic/mshyperv.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 4f01f424ea5b..fd285b18d6b4 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -434,13 +434,15 @@ static void __init ms_hyperv_init_platform(void) */ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES); + ms_hyperv.ext_features = cpuid_ecx(HYPERV_CPUID_FEATURES); ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS); - pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n", - ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints, + pr_info("Hyper-V: privilege flags low %#x, high %#x, ext %#x, hints %#x, misc %#x\n", + ms_hyperv.features, ms_hyperv.priv_high, + ms_hyperv.ext_features, ms_hyperv.hints, ms_hyperv.misc_features); ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS); diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 250c65236919..c8043efabf5a 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -36,6 +36,7 @@ enum hv_partition_type { struct ms_hyperv_info { u32 features; u32 priv_high; + u32 ext_features; u32 misc_features; u32 hints; u32 nested_features; From feba84c2c98109cd784de931240c0dab6396c0d7 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 14 Mar 2025 12:28:49 -0700 Subject: [PATCH 15/24] arm64/hyperv: Add some missing functions to arm64 These non-nested msr and fast hypercall functions are present in x86, but they must be available in both architectures for the root partition driver code. While at it, remove the redundant 'extern' keywords from the hv_do_hypercall() variants in asm-generic/mshyperv.h. 
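[Editor's note, not part of this patch: a minimal sketch of how a caller on either architecture could use the new two-argument fast hypercall helper together with the status helpers added earlier in this series; code, input1 and input2 are placeholders.]

	u64 status;

	status = hv_do_fast_hypercall16(code, input1, input2);
	if (!hv_result_success(status))
		return hv_result_to_errno(status);
	return 0;
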
Signed-off-by: Nuno Das Neves Reviewed-by: Stanislav Kinsburskii Reviewed-by: Roman Kisel Link: https://lore.kernel.org/r/1741980536-3865-4-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1741980536-3865-4-git-send-email-nunodasneves@linux.microsoft.com> --- arch/arm64/hyperv/hv_core.c | 17 +++++++++++++++++ arch/arm64/include/asm/mshyperv.h | 13 +++++++++++++ include/asm-generic/mshyperv.h | 6 ++++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/arm64/hyperv/hv_core.c b/arch/arm64/hyperv/hv_core.c index 69004f619c57..e33a9e3c366a 100644 --- a/arch/arm64/hyperv/hv_core.c +++ b/arch/arm64/hyperv/hv_core.c @@ -53,6 +53,23 @@ u64 hv_do_fast_hypercall8(u16 code, u64 input) } EXPORT_SYMBOL_GPL(hv_do_fast_hypercall8); +/* + * hv_do_fast_hypercall16 -- Invoke the specified hypercall + * with arguments in registers instead of physical memory. + * Avoids the overhead of virt_to_phys for simple hypercalls. + */ +u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2) +{ + struct arm_smccc_res res; + u64 control; + + control = (u64)code | HV_HYPERCALL_FAST_BIT; + + arm_smccc_1_1_hvc(HV_FUNC_ID, control, input1, input2, &res); + return res.a0; +} +EXPORT_SYMBOL_GPL(hv_do_fast_hypercall16); + /* * Set a single VP register to a 64-bit value. */ diff --git a/arch/arm64/include/asm/mshyperv.h b/arch/arm64/include/asm/mshyperv.h index 2e2f83bafcfb..b721d3134ab6 100644 --- a/arch/arm64/include/asm/mshyperv.h +++ b/arch/arm64/include/asm/mshyperv.h @@ -40,6 +40,19 @@ static inline u64 hv_get_msr(unsigned int reg) return hv_get_vpreg(reg); } +/* + * Nested is not supported on arm64 + */ +static inline void hv_set_non_nested_msr(unsigned int reg, u64 value) +{ + hv_set_msr(reg, value); +} + +static inline u64 hv_get_non_nested_msr(unsigned int reg) +{ + return hv_get_msr(reg); +} + /* SMCCC hypercall parameters */ #define HV_SMCCC_FUNC_NUMBER 1 #define HV_FUNC_ID ARM_SMCCC_CALL_VAL( \ diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index c8043efabf5a..c3697bc0598d 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -70,8 +70,10 @@ extern enum hv_partition_type hv_curr_partition_type; extern void * __percpu *hyperv_pcpu_input_arg; extern void * __percpu *hyperv_pcpu_output_arg; -extern u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr); -extern u64 hv_do_fast_hypercall8(u16 control, u64 input8); +u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr); +u64 hv_do_fast_hypercall8(u16 control, u64 input8); +u64 hv_do_fast_hypercall16(u16 control, u64 input1, u64 input2); + bool hv_isolation_type_snp(void); bool hv_isolation_type_tdx(void); From af37bc759f1064cc4dc0a6a12afd3fb25c12fe4d Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 14 Mar 2025 12:28:50 -0700 Subject: [PATCH 16/24] hyperv: Introduce hv_recommend_using_aeoi() Factor out the check for enabling auto eoi, to be reused in root partition code. No functional changes. 
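[Editor's note, not part of this patch: a sketch of the intended reuse. Root partition SINT setup code can call the helper instead of duplicating the HV_DEPRECATING_AEOI_RECOMMENDED #ifdef; shared_sint and sint_index are placeholders.]

	shared_sint.auto_eoi = hv_recommend_using_aeoi();
	hv_set_msr(HV_MSR_SINT0 + sint_index, shared_sint.as_uint64);
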
Signed-off-by: Nuno Das Neves Reviewed-by: Stanislav Kinsburskii Reviewed-by: Easwar Hariharan Reviewed-by: Michael Kelley Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/r/1741980536-3865-5-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1741980536-3865-5-git-send-email-nunodasneves@linux.microsoft.com> --- drivers/hv/hv.c | 12 +----------- include/asm-generic/mshyperv.h | 13 +++++++++++++ 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index a38f84548bc2..308c8f279df8 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -313,17 +313,7 @@ void hv_synic_enable_regs(unsigned int cpu) shared_sint.vector = vmbus_interrupt; shared_sint.masked = false; - - /* - * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64), - * it doesn't provide a recommendation flag and AEOI must be disabled. - */ -#ifdef HV_DEPRECATING_AEOI_RECOMMENDED - shared_sint.auto_eoi = - !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED); -#else - shared_sint.auto_eoi = 0; -#endif + shared_sint.auto_eoi = hv_recommend_using_aeoi(); hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); /* Enable the global synic bit */ diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index c3697bc0598d..8519b8ec8e9d 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -77,6 +77,19 @@ u64 hv_do_fast_hypercall16(u16 control, u64 input1, u64 input2); bool hv_isolation_type_snp(void); bool hv_isolation_type_tdx(void); +/* + * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64), + * it doesn't provide a recommendation flag and AEOI must be disabled. + */ +static inline bool hv_recommend_using_aeoi(void) +{ +#ifdef HV_DEPRECATING_AEOI_RECOMMENDED + return !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED); +#else + return false; +#endif +} + static inline struct hv_proximity_domain_info hv_numa_node_to_pxm_info(int node) { struct hv_proximity_domain_info pxm_info = {}; From 4ee23f3a4a46dc07dd6f82801001aa370faa8312 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 14 Mar 2025 12:28:51 -0700 Subject: [PATCH 17/24] acpi: numa: Export node_to_pxm() node_to_pxm() is used by hv_numa_node_to_pxm_info(). That helper will be used by Hyper-V root partition module code when CONFIG_MSHV_ROOT=m. Signed-off-by: Nuno Das Neves Reviewed-by: Stanislav Kinsburskii Reviewed-by: Easwar Hariharan Reviewed-by: Michael Kelley Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/r/1741980536-3865-6-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1741980536-3865-6-git-send-email-nunodasneves@linux.microsoft.com> --- drivers/acpi/numa/srat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 00ac0d7bb8c9..ce815d7cb8f6 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -51,6 +51,7 @@ int node_to_pxm(int node) return PXM_INVAL; return node_to_pxm_map[node]; } +EXPORT_SYMBOL_GPL(node_to_pxm); static void __acpi_map_pxm_to_node(int pxm, int node) { From 21050f619720029e89d365b0e1328eafe088dbd8 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 14 Mar 2025 12:28:52 -0700 Subject: [PATCH 18/24] Drivers: hv: Export some functions for use by root partition module hv_get_hypervisor_version(), hv_call_deposit_pages(), and hv_call_create_vp(), are all needed in-module with CONFIG_MSHV_ROOT=m. 
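[Editor's note, not part of this patch: a hypothetical module-side sketch of what these exports enable; partition_id and vp_index are placeholders.]

	/* e.g. from mshv_root.ko when bringing up a VP in a child partition */
	ret = hv_call_create_vp(NUMA_NO_NODE, partition_id, vp_index, 0);
	if (ret)
		return ret;
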
Signed-off-by: Nuno Das Neves Reviewed-by: Stanislav Kinsburskii Reviewed-by: Roman Kisel Reviewed-by: Easwar Hariharan Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/r/1741980536-3865-7-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1741980536-3865-7-git-send-email-nunodasneves@linux.microsoft.com> --- arch/arm64/hyperv/mshyperv.c | 1 + arch/x86/kernel/cpu/mshyperv.c | 1 + drivers/hv/hv_proc.c | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/hyperv/mshyperv.c b/arch/arm64/hyperv/mshyperv.c index 2265ea5ce5ad..4e27cc29c79e 100644 --- a/arch/arm64/hyperv/mshyperv.c +++ b/arch/arm64/hyperv/mshyperv.c @@ -26,6 +26,7 @@ int hv_get_hypervisor_version(union hv_hypervisor_version_info *info) return 0; } +EXPORT_SYMBOL_GPL(hv_get_hypervisor_version); static int __init hyperv_init(void) { diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index fd285b18d6b4..fcd0e066d9bd 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -420,6 +420,7 @@ int hv_get_hypervisor_version(union hv_hypervisor_version_info *info) return 0; } +EXPORT_SYMBOL_GPL(hv_get_hypervisor_version); static void __init ms_hyperv_init_platform(void) { diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c index 605999f10e17..7d7ecb6f6137 100644 --- a/drivers/hv/hv_proc.c +++ b/drivers/hv/hv_proc.c @@ -107,6 +107,7 @@ free_buf: kfree(counts); return ret; } +EXPORT_SYMBOL_GPL(hv_call_deposit_pages); int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) { @@ -191,4 +192,4 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) return ret; } - +EXPORT_SYMBOL_GPL(hv_call_create_vp); From 04df7ac39943aa1f503d99572782689d356e3e63 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 14 Mar 2025 12:28:53 -0700 Subject: [PATCH 19/24] Drivers: hv: Introduce per-cpu event ring tail Add a pointer hv_synic_eventring_tail to track the tail pointer for the SynIC event ring buffer for each SINT. This will be used by the mshv driver, but must be tracked independently since the driver module could be removed and re-inserted. Signed-off-by: Nuno Das Neves Reviewed-by: Wei Liu Reviewed-by: Stanislav Kinsburskii Link: https://lore.kernel.org/r/1741980536-3865-8-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1741980536-3865-8-git-send-email-nunodasneves@linux.microsoft.com> --- drivers/hv/hv_common.c | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 885bbc3d86d8..3cd9b96ffc67 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -68,6 +68,16 @@ static void hv_kmsg_dump_unregister(void); static struct ctl_table_header *hv_ctl_table_hdr; +/* + * Per-cpu array holding the tail pointer for the SynIC event ring buffer + * for each SINT. + * + * We cannot maintain this in mshv driver because the tail pointer should + * persist even if the mshv driver is unloaded. + */ +u8 * __percpu *hv_synic_eventring_tail; +EXPORT_SYMBOL_GPL(hv_synic_eventring_tail); + /* * Hyper-V specific initialization and shutdown code that is * common across all architectures. 
Called from architecture @@ -90,6 +100,9 @@ void __init hv_common_free(void) free_percpu(hyperv_pcpu_input_arg); hyperv_pcpu_input_arg = NULL; + + free_percpu(hv_synic_eventring_tail); + hv_synic_eventring_tail = NULL; } /* @@ -372,6 +385,11 @@ int __init hv_common_init(void) BUG_ON(!hyperv_pcpu_output_arg); } + if (hv_root_partition()) { + hv_synic_eventring_tail = alloc_percpu(u8 *); + BUG_ON(!hv_synic_eventring_tail); + } + hv_vp_index = kmalloc_array(nr_cpu_ids, sizeof(*hv_vp_index), GFP_KERNEL); if (!hv_vp_index) { @@ -460,11 +478,12 @@ error: int hv_common_cpu_init(unsigned int cpu) { void **inputarg, **outputarg; + u8 **synic_eventring_tail; u64 msr_vp_index; gfp_t flags; const int pgcount = hv_output_page_exists() ? 2 : 1; void *mem; - int ret; + int ret = 0; /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */ flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL; @@ -472,8 +491,8 @@ int hv_common_cpu_init(unsigned int cpu) inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); /* - * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already - * allocated if this CPU was previously online and then taken offline + * The per-cpu memory is already allocated if this CPU was previously + * online and then taken offline */ if (!*inputarg) { mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); @@ -520,11 +539,21 @@ int hv_common_cpu_init(unsigned int cpu) if (msr_vp_index > hv_max_vp_index) hv_max_vp_index = msr_vp_index; - return 0; + if (hv_root_partition()) { + synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail); + *synic_eventring_tail = kcalloc(HV_SYNIC_SINT_COUNT, + sizeof(u8), flags); + /* No need to unwind any of the above on failure here */ + if (unlikely(!*synic_eventring_tail)) + ret = -ENOMEM; + } + + return ret; } int hv_common_cpu_die(unsigned int cpu) { + u8 **synic_eventring_tail; /* * The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory * is not freed when the CPU goes offline as the hyperv_pcpu_input_arg @@ -537,6 +566,10 @@ int hv_common_cpu_die(unsigned int cpu) * originally allocated memory is reused in hv_common_cpu_init(). */ + synic_eventring_tail = this_cpu_ptr(hv_synic_eventring_tail); + kfree(*synic_eventring_tail); + *synic_eventring_tail = NULL; + return 0; } From e2575ffe57ac07e730be16a6c451efca0471af7c Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 14 Mar 2025 12:28:54 -0700 Subject: [PATCH 20/24] x86: hyperv: Add mshv_handler() irq handler and setup function Add mshv_handler() to process messages related to managing guest partitions such as intercepts, doorbells, and scheduling messages. In a (non-nested) root partition, the same interrupt vector is shared between the vmbus and mshv_root drivers. Introduce a stub for mshv_handler() and call it in sysvec_hyperv_callback alongside vmbus_handler(). Even though both handlers will be called for every Hyper-V interrupt, the messages for each driver are delivered to different offsets within the SYNIC message page, so they won't step on each other. 
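[Editor's note, not part of this patch: an illustrative sketch of how the mshv_root module is expected to hook and unhook its routine; mshv_isr is a placeholder name for the driver's interrupt handling function.]

	/* module init */
	hv_setup_mshv_handler(mshv_isr);

	/* module exit */
	hv_setup_mshv_handler(NULL);
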
Signed-off-by: Nuno Das Neves Reviewed-by: Wei Liu Reviewed-by: Tianyu Lan Reviewed-by: Stanislav Kinsburskii Link: https://lore.kernel.org/r/1741980536-3865-9-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1741980536-3865-9-git-send-email-nunodasneves@linux.microsoft.com> --- arch/x86/kernel/cpu/mshyperv.c | 9 +++++++++ drivers/hv/hv_common.c | 5 +++++ include/asm-generic/mshyperv.h | 1 + 3 files changed, 15 insertions(+) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index fcd0e066d9bd..3e2533954675 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -107,6 +107,7 @@ void hv_set_msr(unsigned int reg, u64 value) } EXPORT_SYMBOL_GPL(hv_set_msr); +static void (*mshv_handler)(void); static void (*vmbus_handler)(void); static void (*hv_stimer0_handler)(void); static void (*hv_kexec_handler)(void); @@ -117,6 +118,9 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) struct pt_regs *old_regs = set_irq_regs(regs); inc_irq_stat(irq_hv_callback_count); + if (mshv_handler) + mshv_handler(); + if (vmbus_handler) vmbus_handler(); @@ -126,6 +130,11 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) set_irq_regs(old_regs); } +void hv_setup_mshv_handler(void (*handler)(void)) +{ + mshv_handler = handler; +} + void hv_setup_vmbus_handler(void (*handler)(void)) { vmbus_handler = handler; diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 3cd9b96ffc67..b3b11be11650 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -680,6 +680,11 @@ void __weak hv_remove_vmbus_handler(void) } EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler); +void __weak hv_setup_mshv_handler(void (*handler)(void)) +{ +} +EXPORT_SYMBOL_GPL(hv_setup_mshv_handler); + void __weak hv_setup_kexec_handler(void (*handler)(void)) { } diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 8519b8ec8e9d..ccccb1cbf7df 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -208,6 +208,7 @@ void hv_setup_kexec_handler(void (*handler)(void)); void hv_remove_kexec_handler(void); void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); void hv_remove_crash_handler(void); +void hv_setup_mshv_handler(void (*handler)(void)); extern int vmbus_interrupt; extern int vmbus_irq; From 0bd921a4b4d9ca832578fcc61a6a99bd980776c9 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 14 Mar 2025 12:28:55 -0700 Subject: [PATCH 21/24] hyperv: Add definitions for root partition driver to hv headers A few additional definitions are required for the mshv driver code (to follow). Introduce those here and clean up a little bit while at it. 
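[Editor's note, not part of this patch: a small hypothetical usage sketch of the new intercept definitions, building an x86 I/O port intercept parameter.]

	enum hv_intercept_type type = HV_INTERCEPT_TYPE_X64_IO_PORT;
	union hv_intercept_parameters params = {
		.io_port = 0x3f8,	/* example port */
	};
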
Signed-off-by: Nuno Das Neves Reviewed-by: Roman Kisel Reviewed-by: Stanislav Kinsburskii Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/r/1741980536-3865-10-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1741980536-3865-10-git-send-email-nunodasneves@linux.microsoft.com> --- include/hyperv/hvgdk_mini.h | 72 ++++++++++++++++++-- include/hyperv/hvhdk.h | 132 ++++++++++++++++++++++++++++++++++-- include/hyperv/hvhdk_mini.h | 91 +++++++++++++++++++++++++ 3 files changed, 284 insertions(+), 11 deletions(-) diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 58895883f636..735329859f21 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -13,7 +13,7 @@ struct hv_u128 { u64 high_part; } __packed; -/* NOTE: when adding below, update hv_status_to_string() */ +/* NOTE: when adding below, update hv_result_to_string() */ #define HV_STATUS_SUCCESS 0x0 #define HV_STATUS_INVALID_HYPERCALL_CODE 0x2 #define HV_STATUS_INVALID_HYPERCALL_INPUT 0x3 @@ -51,6 +51,7 @@ struct hv_u128 { #define HV_HYP_PAGE_SHIFT 12 #define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) #define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) +#define HV_HYP_LARGE_PAGE_SHIFT 21 #define HV_PARTITION_ID_INVALID ((u64)0) #define HV_PARTITION_ID_SELF ((u64)-1) @@ -374,6 +375,10 @@ union hv_hypervisor_version_info { #define HV_SHARED_GPA_BOUNDARY_ACTIVE BIT(5) #define HV_SHARED_GPA_BOUNDARY_BITS GENMASK(11, 6) +/* HYPERV_CPUID_FEATURES.ECX bits. */ +#define HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE BIT(9) +#define HV_VP_GHCB_ROOT_MAPPING_AVAILABLE BIT(10) + enum hv_isolation_type { HV_ISOLATION_TYPE_NONE = 0, /* HV_PARTITION_ISOLATION_TYPE_NONE */ HV_ISOLATION_TYPE_VBS = 1, @@ -436,10 +441,13 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_WITHDRAW_MEMORY 0x0049 #define HVCALL_MAP_GPA_PAGES 0x004b #define HVCALL_UNMAP_GPA_PAGES 0x004c +#define HVCALL_INSTALL_INTERCEPT 0x004d #define HVCALL_CREATE_VP 0x004e #define HVCALL_DELETE_VP 0x004f #define HVCALL_GET_VP_REGISTERS 0x0050 #define HVCALL_SET_VP_REGISTERS 0x0051 +#define HVCALL_TRANSLATE_VIRTUAL_ADDRESS 0x0052 +#define HVCALL_CLEAR_VIRTUAL_INTERRUPT 0x0056 #define HVCALL_DELETE_PORT 0x0058 #define HVCALL_DISCONNECT_PORT 0x005b #define HVCALL_POST_MESSAGE 0x005c @@ -447,12 +455,15 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_POST_DEBUG_DATA 0x0069 #define HVCALL_RETRIEVE_DEBUG_DATA 0x006a #define HVCALL_RESET_DEBUG_SESSION 0x006b +#define HVCALL_MAP_STATS_PAGE 0x006c +#define HVCALL_UNMAP_STATS_PAGE 0x006d #define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 #define HVCALL_GET_SYSTEM_PROPERTY 0x007b #define HVCALL_MAP_DEVICE_INTERRUPT 0x007c #define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d #define HVCALL_RETARGET_INTERRUPT 0x007e #define HVCALL_NOTIFY_PORT_RING_EMPTY 0x008b +#define HVCALL_REGISTER_INTERCEPT_RESULT 0x0091 #define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094 #define HVCALL_CREATE_PORT 0x0095 #define HVCALL_CONNECT_PORT 0x0096 @@ -460,12 +471,18 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_GET_VP_ID_FROM_APIC_ID 0x009a #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 +#define HVCALL_SIGNAL_EVENT_DIRECT 0x00c0 +#define HVCALL_POST_MESSAGE_DIRECT 0x00c1 #define HVCALL_DISPATCH_VP 0x00c2 +#define HVCALL_GET_GPA_PAGES_ACCESS_STATES 0x00c9 +#define HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS 0x00d7 +#define 
HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS 0x00d8 #define HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY 0x00db #define HVCALL_MAP_VP_STATE_PAGE 0x00e1 #define HVCALL_UNMAP_VP_STATE_PAGE 0x00e2 #define HVCALL_GET_VP_STATE 0x00e3 #define HVCALL_SET_VP_STATE 0x00e4 +#define HVCALL_GET_VP_CPUID_VALUES 0x00f4 #define HVCALL_MMIO_READ 0x0106 #define HVCALL_MMIO_WRITE 0x0107 @@ -775,10 +792,10 @@ struct hv_message_page { /* Define timer message payload structure. */ struct hv_timer_message_payload { - __u32 timer_index; - __u32 reserved; - __u64 expiration_time; /* When the timer expired */ - __u64 delivery_time; /* When the message was delivered */ + u32 timer_index; + u32 reserved; + u64 expiration_time; /* When the timer expired */ + u64 delivery_time; /* When the message was delivered */ } __packed; struct hv_x64_segment_register { @@ -807,6 +824,8 @@ struct hv_x64_table_register { u64 base; } __packed; +#define HV_NORMAL_VTL 0 + union hv_input_vtl { u8 as_uint8; struct { @@ -1325,6 +1344,49 @@ struct hv_retarget_device_interrupt { /* HV_INPUT_RETARGET_DEVICE_INTERRUPT */ struct hv_device_interrupt_target int_target; } __packed __aligned(8); +enum hv_intercept_type { +#if defined(CONFIG_X86) + HV_INTERCEPT_TYPE_X64_IO_PORT = 0x00000000, + HV_INTERCEPT_TYPE_X64_MSR = 0x00000001, + HV_INTERCEPT_TYPE_X64_CPUID = 0x00000002, +#endif + HV_INTERCEPT_TYPE_EXCEPTION = 0x00000003, + /* Used to be HV_INTERCEPT_TYPE_REGISTER */ + HV_INTERCEPT_TYPE_RESERVED0 = 0x00000004, + HV_INTERCEPT_TYPE_MMIO = 0x00000005, +#if defined(CONFIG_X86) + HV_INTERCEPT_TYPE_X64_GLOBAL_CPUID = 0x00000006, + HV_INTERCEPT_TYPE_X64_APIC_SMI = 0x00000007, +#endif + HV_INTERCEPT_TYPE_HYPERCALL = 0x00000008, +#if defined(CONFIG_X86) + HV_INTERCEPT_TYPE_X64_APIC_INIT_SIPI = 0x00000009, + HV_INTERCEPT_MC_UPDATE_PATCH_LEVEL_MSR_READ = 0x0000000A, + HV_INTERCEPT_TYPE_X64_APIC_WRITE = 0x0000000B, + HV_INTERCEPT_TYPE_X64_MSR_INDEX = 0x0000000C, +#endif + HV_INTERCEPT_TYPE_MAX, + HV_INTERCEPT_TYPE_INVALID = 0xFFFFFFFF, +}; + +union hv_intercept_parameters { + /* HV_INTERCEPT_PARAMETERS is defined to be an 8-byte field. */ + u64 as_uint64; +#if defined(CONFIG_X86) + /* HV_INTERCEPT_TYPE_X64_IO_PORT */ + u16 io_port; + /* HV_INTERCEPT_TYPE_X64_CPUID */ + u32 cpuid_index; + /* HV_INTERCEPT_TYPE_X64_APIC_WRITE */ + u32 apic_write_mask; + /* HV_INTERCEPT_TYPE_EXCEPTION */ + u16 exception_vector; + /* HV_INTERCEPT_TYPE_X64_MSR_INDEX */ + u32 msr_index; +#endif + /* N.B. Other intercept types do not have any parameters. */ +}; + /* Data structures for HVCALL_MMIO_READ and HVCALL_MMIO_WRITE */ #define HV_HYPERCALL_MMIO_MAX_DATA_LENGTH 64 diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h index 64407c2a3809..b4067ada02cf 100644 --- a/include/hyperv/hvhdk.h +++ b/include/hyperv/hvhdk.h @@ -19,11 +19,24 @@ #define HV_VP_REGISTER_PAGE_VERSION_1 1u +#define HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT 7 + +union hv_vp_register_page_interrupt_vectors { + u64 as_uint64; + struct { + u8 vector_count; + u8 vector[HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT]; + } __packed; +}; + struct hv_vp_register_page { u16 version; u8 isvalid; u8 rsvdz; u32 dirty; + +#if IS_ENABLED(CONFIG_X86) + union { struct { /* General purpose registers @@ -95,6 +108,22 @@ struct hv_vp_register_page { union hv_x64_pending_interruption_register pending_interruption; union hv_x64_interrupt_state_register interrupt_state; u64 instruction_emulation_hints; + u64 xfem; + + /* + * Fields from this point are not included in the register page save chunk. 
+ * The reserved field is intended to maintain alignment for unsaved fields. + */ + u8 reserved1[0x100]; + + /* + * Interrupts injected as part of HvCallDispatchVp. + */ + union hv_vp_register_page_interrupt_vectors interrupt_vectors; + +#elif IS_ENABLED(CONFIG_ARM64) + /* Not yet supported in ARM */ +#endif } __packed; #define HV_PARTITION_PROCESSOR_FEATURES_BANKS 2 @@ -299,10 +328,11 @@ union hv_partition_isolation_properties { #define HV_PARTITION_ISOLATION_HOST_TYPE_RESERVED 0x2 /* Note: Exo partition is enabled by default */ -#define HV_PARTITION_CREATION_FLAG_EXO_PARTITION BIT(8) -#define HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED BIT(13) -#define HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED BIT(19) -#define HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE BIT(22) +#define HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED BIT(4) +#define HV_PARTITION_CREATION_FLAG_EXO_PARTITION BIT(8) +#define HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED BIT(13) +#define HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED BIT(19) +#define HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE BIT(22) struct hv_input_create_partition { u64 flags; @@ -349,13 +379,23 @@ struct hv_input_set_partition_property { enum hv_vp_state_page_type { HV_VP_STATE_PAGE_REGISTERS = 0, HV_VP_STATE_PAGE_INTERCEPT_MESSAGE = 1, + HV_VP_STATE_PAGE_GHCB = 2, HV_VP_STATE_PAGE_COUNT }; struct hv_input_map_vp_state_page { u64 partition_id; u32 vp_index; - u32 type; /* enum hv_vp_state_page_type */ + u16 type; /* enum hv_vp_state_page_type */ + union hv_input_vtl input_vtl; + union { + u8 as_uint8; + struct { + u8 map_location_provided : 1; + u8 reserved : 7; + }; + } flags; + u64 requested_map_location; } __packed; struct hv_output_map_vp_state_page { @@ -365,7 +405,14 @@ struct hv_output_map_vp_state_page { struct hv_input_unmap_vp_state_page { u64 partition_id; u32 vp_index; - u32 type; /* enum hv_vp_state_page_type */ + u16 type; /* enum hv_vp_state_page_type */ + union hv_input_vtl input_vtl; + u8 reserved0; +} __packed; + +struct hv_x64_apic_eoi_message { + u32 vp_index; + u32 interrupt_vector; } __packed; struct hv_opaque_intercept_message { @@ -515,6 +562,13 @@ struct hv_synthetic_timers_state { u64 reserved[5]; } __packed; +struct hv_async_completion_message_payload { + u64 partition_id; + u32 status; + u32 completion_count; + u64 sub_status; +} __packed; + union hv_input_delete_vp { u64 as_uint64[2]; struct { @@ -649,6 +703,57 @@ struct hv_input_set_vp_state { union hv_input_set_vp_state_data data[]; } __packed; +union hv_x64_vp_execution_state { + u16 as_uint16; + struct { + u16 cpl:2; + u16 cr0_pe:1; + u16 cr0_am:1; + u16 efer_lma:1; + u16 debug_active:1; + u16 interruption_pending:1; + u16 vtl:4; + u16 enclave_mode:1; + u16 interrupt_shadow:1; + u16 virtualization_fault_active:1; + u16 reserved:2; + } __packed; +}; + +struct hv_x64_intercept_message_header { + u32 vp_index; + u8 instruction_length:4; + u8 cr8:4; /* Only set for exo partitions */ + u8 intercept_access_type; + union hv_x64_vp_execution_state execution_state; + struct hv_x64_segment_register cs_segment; + u64 rip; + u64 rflags; +} __packed; + +union hv_x64_memory_access_info { + u8 as_uint8; + struct { + u8 gva_valid:1; + u8 gva_gpa_valid:1; + u8 hypercall_output_pending:1; + u8 tlb_locked_no_overlay:1; + u8 reserved:4; + } __packed; +}; + +struct hv_x64_memory_intercept_message { + struct hv_x64_intercept_message_header header; + u32 cache_type; /* enum hv_cache_type */ + u8 instruction_byte_count; + union hv_x64_memory_access_info memory_access_info; + u8 
tpr_priority; + u8 reserved1; + u64 guest_virtual_address; + u64 guest_physical_address; + u8 instruction_bytes[16]; +} __packed; + /* * Dispatch state for the VP communicated by the hypervisor to the * VP-dispatching thread in the root on return from HVCALL_DISPATCH_VP. @@ -716,6 +821,7 @@ static_assert(sizeof(struct hv_vp_signal_pair_scheduler_message) == #define HV_DISPATCH_VP_FLAG_SKIP_VP_SPEC_FLUSH 0x8 #define HV_DISPATCH_VP_FLAG_SKIP_CALLER_SPEC_FLUSH 0x10 #define HV_DISPATCH_VP_FLAG_SKIP_CALLER_USER_SPEC_FLUSH 0x20 +#define HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION 0x40 struct hv_input_dispatch_vp { u64 partition_id; @@ -730,4 +836,18 @@ struct hv_output_dispatch_vp { u32 dispatch_event; /* enum hv_vp_dispatch_event */ } __packed; +struct hv_input_modify_sparse_spa_page_host_access { + u32 host_access : 2; + u32 reserved : 30; + u32 flags; + u64 partition_id; + u64 spa_page_list[]; +} __packed; + +/* hv_input_modify_sparse_spa_page_host_access flags */ +#define HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE 0x1 +#define HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED 0x2 +#define HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE 0x4 +#define HV_MODIFY_SPA_PAGE_HOST_ACCESS_HUGE_PAGE 0x8 + #endif /* _HV_HVHDK_H */ diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index f8a39d3e9ce6..42e7876455b5 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -36,6 +36,52 @@ enum hv_scheduler_type { HV_SCHEDULER_TYPE_MAX }; +/* HV_STATS_AREA_TYPE */ +enum hv_stats_area_type { + HV_STATS_AREA_SELF = 0, + HV_STATS_AREA_PARENT = 1, + HV_STATS_AREA_INTERNAL = 2, + HV_STATS_AREA_COUNT +}; + +enum hv_stats_object_type { + HV_STATS_OBJECT_HYPERVISOR = 0x00000001, + HV_STATS_OBJECT_LOGICAL_PROCESSOR = 0x00000002, + HV_STATS_OBJECT_PARTITION = 0x00010001, + HV_STATS_OBJECT_VP = 0x00010002 +}; + +union hv_stats_object_identity { + /* hv_stats_hypervisor */ + struct { + u8 reserved[15]; + u8 stats_area_type; + } __packed hv; + + /* hv_stats_logical_processor */ + struct { + u32 lp_index; + u8 reserved[11]; + u8 stats_area_type; + } __packed lp; + + /* hv_stats_partition */ + struct { + u64 partition_id; + u8 reserved[7]; + u8 stats_area_type; + } __packed partition; + + /* hv_stats_vp */ + struct { + u64 partition_id; + u32 vp_index; + u16 flags; + u8 reserved; + u8 stats_area_type; + } __packed vp; +}; + enum hv_partition_property_code { /* Privilege properties */ HV_PARTITION_PROPERTY_PRIVILEGE_FLAGS = 0x00010000, @@ -47,19 +93,45 @@ enum hv_partition_property_code { /* Compatibility properties */ HV_PARTITION_PROPERTY_PROCESSOR_XSAVE_FEATURES = 0x00060002, + HV_PARTITION_PROPERTY_XSAVE_STATES = 0x00060007, HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE = 0x00060008, HV_PARTITION_PROPERTY_PROCESSOR_CLOCK_FREQUENCY = 0x00060009, }; +enum hv_snp_status { + HV_SNP_STATUS_NONE = 0, + HV_SNP_STATUS_AVAILABLE = 1, + HV_SNP_STATUS_INCOMPATIBLE = 2, + HV_SNP_STATUS_PSP_UNAVAILABLE = 3, + HV_SNP_STATUS_PSP_INIT_FAILED = 4, + HV_SNP_STATUS_PSP_BAD_FW_VERSION = 5, + HV_SNP_STATUS_BAD_CONFIGURATION = 6, + HV_SNP_STATUS_PSP_FW_UPDATE_IN_PROGRESS = 7, + HV_SNP_STATUS_PSP_RB_INIT_FAILED = 8, + HV_SNP_STATUS_PSP_PLATFORM_STATUS_FAILED = 9, + HV_SNP_STATUS_PSP_INIT_LATE_FAILED = 10, +}; + enum hv_system_property { /* Add more values when needed */ HV_SYSTEM_PROPERTY_SCHEDULER_TYPE = 15, + HV_DYNAMIC_PROCESSOR_FEATURE_PROPERTY = 21, +}; + +enum hv_dynamic_processor_feature_property { + /* Add more values when needed */ + HV_X64_DYNAMIC_PROCESSOR_FEATURE_MAX_ENCRYPTED_PARTITIONS = 13, + 
HV_X64_DYNAMIC_PROCESSOR_FEATURE_SNP_STATUS = 16, }; struct hv_input_get_system_property { u32 property_id; /* enum hv_system_property */ union { u32 as_uint32; +#if IS_ENABLED(CONFIG_X86) + /* enum hv_dynamic_processor_feature_property */ + u32 hv_processor_feature; +#endif /* More fields to be filled in when needed */ }; } __packed; @@ -67,9 +139,28 @@ struct hv_input_get_system_property { struct hv_output_get_system_property { union { u32 scheduler_type; /* enum hv_scheduler_type */ +#if IS_ENABLED(CONFIG_X86) + u64 hv_processor_feature_value; +#endif }; } __packed; +struct hv_input_map_stats_page { + u32 type; /* enum hv_stats_object_type */ + u32 padding; + union hv_stats_object_identity identity; +} __packed; + +struct hv_output_map_stats_page { + u64 map_location; +} __packed; + +struct hv_input_unmap_stats_page { + u32 type; /* enum hv_stats_object_type */ + u32 padding; + union hv_stats_object_identity identity; +} __packed; + struct hv_proximity_domain_flags { u32 proximity_preferred : 1; u32 reserved : 30; From 621191d709b14882270dfd8ea5d7d6cdfebe2c35 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 14 Mar 2025 12:28:56 -0700 Subject: [PATCH 22/24] Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs Provide a set of IOCTLs for creating and managing child partitions when running as root partition on Hyper-V. The new driver is enabled via CONFIG_MSHV_ROOT. A brief overview of the interface: MSHV_CREATE_PARTITION is the entry point, returning a file descriptor representing a child partition. IOCTLs on this fd can be used to map memory, create VPs, etc. Creating a VP returns another file descriptor representing that VP which in turn has another set of corresponding IOCTLs for running the VP, getting/setting state, etc. MSHV_ROOT_HVCALL is a generic "passthrough" hypercall IOCTL which can be used for a number of partition or VP hypercalls. This is for hypercalls that do not affect any state in the kernel driver, such as getting and setting VP registers and partition properties, translating addresses, etc. It is "passthrough" because the binary input and output for the hypercall is only interpreted by the VMM - the kernel driver does nothing but insert the VP and partition id where necessary (which are always in the same place), and execute the hypercall. 
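[Editor's note, not part of this patch: a rough illustration of the userspace flow described above. The argument structs live in the new include/uapi/linux/mshv.h; the *_args variables and the MSHV_CREATE_VP name are placeholders here.]

	int mshv_fd = open("/dev/mshv", O_RDWR | O_CLOEXEC);
	int pt_fd = ioctl(mshv_fd, MSHV_CREATE_PARTITION, &create_partition_args);

	/* map guest memory, create VPs, etc. via IOCTLs on pt_fd */
	int vp_fd = ioctl(pt_fd, MSHV_CREATE_VP, &create_vp_args);

	/* generic passthrough hypercall, e.g. get/set VP registers */
	ioctl(vp_fd, MSHV_ROOT_HVCALL, &hvcall_args);
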
Co-developed-by: Anirudh Rayabharam Signed-off-by: Anirudh Rayabharam Co-developed-by: Jinank Jain Signed-off-by: Jinank Jain Co-developed-by: Mukesh Rathor Signed-off-by: Mukesh Rathor Co-developed-by: Muminul Islam Signed-off-by: Muminul Islam Co-developed-by: Praveen K Paladugu Signed-off-by: Praveen K Paladugu Co-developed-by: Stanislav Kinsburskii Signed-off-by: Stanislav Kinsburskii Co-developed-by: Wei Liu Signed-off-by: Nuno Das Neves Reviewed-by: Roman Kisel Link: https://lore.kernel.org/r/1741980536-3865-11-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1741980536-3865-11-git-send-email-nunodasneves@linux.microsoft.com> --- .../userspace-api/ioctl/ioctl-number.rst | 2 + drivers/hv/Kconfig | 1 + drivers/hv/Makefile | 5 +- drivers/hv/mshv.h | 30 + drivers/hv/mshv_common.c | 161 ++ drivers/hv/mshv_eventfd.c | 833 ++++++ drivers/hv/mshv_eventfd.h | 71 + drivers/hv/mshv_irq.c | 124 + drivers/hv/mshv_portid_table.c | 83 + drivers/hv/mshv_root.h | 311 +++ drivers/hv/mshv_root_hv_call.c | 849 ++++++ drivers/hv/mshv_root_main.c | 2307 +++++++++++++++++ drivers/hv/mshv_synic.c | 665 +++++ include/uapi/linux/mshv.h | 291 +++ 14 files changed, 5732 insertions(+), 1 deletion(-) create mode 100644 drivers/hv/mshv.h create mode 100644 drivers/hv/mshv_common.c create mode 100644 drivers/hv/mshv_eventfd.c create mode 100644 drivers/hv/mshv_eventfd.h create mode 100644 drivers/hv/mshv_irq.c create mode 100644 drivers/hv/mshv_portid_table.c create mode 100644 drivers/hv/mshv_root.h create mode 100644 drivers/hv/mshv_root_hv_call.c create mode 100644 drivers/hv/mshv_root_main.c create mode 100644 drivers/hv/mshv_synic.c create mode 100644 include/uapi/linux/mshv.h diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 6d1465315df3..66dcfaae698b 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -370,6 +370,8 @@ Code Seq# Include File Comments 0xB7 all uapi/linux/remoteproc_cdev.h 0xB7 all uapi/linux/nsfs.h > 0xB8 01-02 uapi/misc/mrvl_cn10k_dpi.h Marvell CN10K DPI driver +0xB8 all uapi/linux/mshv.h Microsoft Hyper-V /dev/mshv driver + 0xC0 00-0F linux/usb/iowarrior.h 0xCA 00-0F uapi/misc/cxl.h 0xCA 10-2F uapi/misc/ocxl.h diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 3118d5472fab..6c1416167bd2 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -64,6 +64,7 @@ config MSHV_ROOT # e.g. 
When withdrawing memory, the hypervisor gives back 4k pages in # no particular order, making it impossible to reassemble larger pages depends on PAGE_SIZE_4KB + select EVENTFD default n help Select this option to enable support for booting and running as root diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index 2b8dc954b350..976189c725dc 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_HYPERV) += hv_vmbus.o obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o +obj-$(CONFIG_MSHV_ROOT) += mshv_root.o CFLAGS_hv_trace.o = -I$(src) CFLAGS_hv_balloon.o = -I$(src) @@ -11,7 +12,9 @@ hv_vmbus-y := vmbus_drv.o \ channel_mgmt.o ring_buffer.o hv_trace.o hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o +mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \ + mshv_root_hv_call.o mshv_portid_table.o # Code that must be built-in obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o -obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o +obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_common.o diff --git a/drivers/hv/mshv.h b/drivers/hv/mshv.h new file mode 100644 index 000000000000..0340a67acd0a --- /dev/null +++ b/drivers/hv/mshv.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2023, Microsoft Corporation. + */ + +#ifndef _MSHV_H_ +#define _MSHV_H_ + +#include +#include +#include + +#define mshv_field_nonzero(STRUCT, MEMBER) \ + memchr_inv(&((STRUCT).MEMBER), \ + 0, sizeof_field(typeof(STRUCT), MEMBER)) + +int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers); + +int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers); + +int hv_call_get_partition_property(u64 partition_id, u64 property_code, + u64 *property_value); + +int mshv_do_pre_guest_mode_work(ulong th_flags); + +#endif /* _MSHV_H */ diff --git a/drivers/hv/mshv_common.c b/drivers/hv/mshv_common.c new file mode 100644 index 000000000000..2575e6d7a71f --- /dev/null +++ b/drivers/hv/mshv_common.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. + * + * This file contains functions that will be called from one or more modules. + * If any of these modules are configured to build, this file is built and just + * statically linked in. 
+ * + * Authors: Microsoft Linux virtualization team + */ + +#include +#include +#include +#include + +#include "mshv.h" + +#define HV_GET_REGISTER_BATCH_SIZE \ + (HV_HYP_PAGE_SIZE / sizeof(union hv_register_value)) +#define HV_SET_REGISTER_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_registers)) \ + / sizeof(struct hv_register_assoc)) + +int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers) +{ + struct hv_input_get_vp_registers *input_page; + union hv_register_value *output_page; + u16 completed = 0; + unsigned long remaining = count; + int rep_count, i; + u64 status = HV_STATUS_SUCCESS; + unsigned long flags; + + local_irq_save(flags); + + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); + + input_page->partition_id = partition_id; + input_page->vp_index = vp_index; + input_page->input_vtl.as_uint8 = input_vtl.as_uint8; + input_page->rsvd_z8 = 0; + input_page->rsvd_z16 = 0; + + while (remaining) { + rep_count = min(remaining, HV_GET_REGISTER_BATCH_SIZE); + for (i = 0; i < rep_count; ++i) + input_page->names[i] = registers[i].name; + + status = hv_do_rep_hypercall(HVCALL_GET_VP_REGISTERS, rep_count, + 0, input_page, output_page); + if (!hv_result_success(status)) + break; + + completed = hv_repcomp(status); + for (i = 0; i < completed; ++i) + registers[i].value = output_page[i]; + + registers += completed; + remaining -= completed; + } + local_irq_restore(flags); + + return hv_result_to_errno(status); +} +EXPORT_SYMBOL_GPL(hv_call_get_vp_registers); + +int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers) +{ + struct hv_input_set_vp_registers *input_page; + u16 completed = 0; + unsigned long remaining = count; + int rep_count; + u64 status = HV_STATUS_SUCCESS; + unsigned long flags; + + local_irq_save(flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->partition_id = partition_id; + input_page->vp_index = vp_index; + input_page->input_vtl.as_uint8 = input_vtl.as_uint8; + input_page->rsvd_z8 = 0; + input_page->rsvd_z16 = 0; + + while (remaining) { + rep_count = min(remaining, HV_SET_REGISTER_BATCH_SIZE); + memcpy(input_page->elements, registers, + sizeof(struct hv_register_assoc) * rep_count); + + status = hv_do_rep_hypercall(HVCALL_SET_VP_REGISTERS, rep_count, + 0, input_page, NULL); + if (!hv_result_success(status)) + break; + + completed = hv_repcomp(status); + registers += completed; + remaining -= completed; + } + + local_irq_restore(flags); + + return hv_result_to_errno(status); +} +EXPORT_SYMBOL_GPL(hv_call_set_vp_registers); + +int hv_call_get_partition_property(u64 partition_id, + u64 property_code, + u64 *property_value) +{ + u64 status; + unsigned long flags; + struct hv_input_get_partition_property *input; + struct hv_output_get_partition_property *output; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->property_code = property_code; + status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY, input, output); + + if (!hv_result_success(status)) { + local_irq_restore(flags); + return hv_result_to_errno(status); + } + *property_value = output->property_value; + + local_irq_restore(flags); + + return 0; +} 
+EXPORT_SYMBOL_GPL(hv_call_get_partition_property); + +/* + * Handle any pre-processing before going into the guest mode on this cpu, most + * notably call schedule(). Must be invoked with both preemption and + * interrupts enabled. + * + * Returns: 0 on success, -errno on error. + */ +int mshv_do_pre_guest_mode_work(ulong th_flags) +{ + if (th_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) + return -EINTR; + + if (th_flags & _TIF_NEED_RESCHED) + schedule(); + + if (th_flags & _TIF_NOTIFY_RESUME) + resume_user_mode_work(NULL); + + return 0; +} +EXPORT_SYMBOL_GPL(mshv_do_pre_guest_mode_work); diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c new file mode 100644 index 000000000000..8dd22be2ca0b --- /dev/null +++ b/drivers/hv/mshv_eventfd.c @@ -0,0 +1,833 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * eventfd support for mshv + * + * Heavily inspired from KVM implementation of irqfd/ioeventfd. The basic + * framework code is taken from the kvm implementation. + * + * All credits to kvm developers. + */ + +#include +#include +#include +#include +#include +#include +#include + +#if IS_ENABLED(CONFIG_X86_64) +#include +#endif +#include + +#include "mshv_eventfd.h" +#include "mshv.h" +#include "mshv_root.h" + +static struct workqueue_struct *irqfd_cleanup_wq; + +void mshv_register_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian) +{ + mutex_lock(&partition->pt_irq_lock); + hlist_add_head_rcu(&mian->link, &partition->irq_ack_notifier_list); + mutex_unlock(&partition->pt_irq_lock); +} + +void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian) +{ + mutex_lock(&partition->pt_irq_lock); + hlist_del_init_rcu(&mian->link); + mutex_unlock(&partition->pt_irq_lock); + synchronize_rcu(); +} + +bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi) +{ + struct mshv_irq_ack_notifier *mian; + bool acked = false; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mian, &partition->irq_ack_notifier_list, + link) { + if (mian->irq_ack_gsi == gsi) { + mian->irq_acked(mian); + acked = true; + } + } + rcu_read_unlock(); + + return acked; +} + +#if IS_ENABLED(CONFIG_ARM64) +static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type) +{ + return false; +} +#elif IS_ENABLED(CONFIG_X86_64) +static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type) +{ + return type == HV_X64_INTERRUPT_TYPE_EXTINT; +} +#endif + +static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian) +{ + struct mshv_irqfd_resampler *resampler; + struct mshv_partition *partition; + struct mshv_irqfd *irqfd; + int idx; + + resampler = container_of(mian, struct mshv_irqfd_resampler, + rsmplr_notifier); + partition = resampler->rsmplr_partn; + + idx = srcu_read_lock(&partition->pt_irq_srcu); + + hlist_for_each_entry_rcu(irqfd, &resampler->rsmplr_irqfd_list, + irqfd_resampler_hnode) { + if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type)) + hv_call_clear_virtual_interrupt(partition->pt_id); + + eventfd_signal(irqfd->irqfd_resamplefd); + } + + srcu_read_unlock(&partition->pt_irq_srcu, idx); +} + +#if IS_ENABLED(CONFIG_X86_64) +static bool +mshv_vp_irq_vector_injected(union hv_vp_register_page_interrupt_vectors iv, + u32 vector) +{ + int i; + + for (i = 0; i < iv.vector_count; i++) { + if (iv.vector[i] == vector) + return true; + } + + return false; +} + +static int mshv_vp_irq_try_set_vector(struct mshv_vp *vp, u32 vector) +{ + union 
hv_vp_register_page_interrupt_vectors iv, new_iv; + + iv = vp->vp_register_page->interrupt_vectors; + new_iv = iv; + + if (mshv_vp_irq_vector_injected(iv, vector)) + return 0; + + if (iv.vector_count >= HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT) + return -ENOSPC; + + new_iv.vector[new_iv.vector_count++] = vector; + + if (cmpxchg(&vp->vp_register_page->interrupt_vectors.as_uint64, + iv.as_uint64, new_iv.as_uint64) != iv.as_uint64) + return -EAGAIN; + + return 0; +} + +static int mshv_vp_irq_set_vector(struct mshv_vp *vp, u32 vector) +{ + int ret; + + do { + ret = mshv_vp_irq_try_set_vector(vp, vector); + } while (ret == -EAGAIN && !need_resched()); + + return ret; +} + +/* + * Try to raise irq for guest via shared vector array. hyp does the actual + * inject of the interrupt. + */ +static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd) +{ + struct mshv_partition *partition = irqfd->irqfd_partn; + struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq; + struct mshv_vp *vp; + + if (!(ms_hyperv.ext_features & + HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE)) + return -EOPNOTSUPP; + + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) + return -EOPNOTSUPP; + + if (irq->lapic_control.logical_dest_mode) + return -EOPNOTSUPP; + + vp = partition->pt_vp_array[irq->lapic_apic_id]; + + if (!vp->vp_register_page) + return -EOPNOTSUPP; + + if (mshv_vp_irq_set_vector(vp, irq->lapic_vector)) + return -EINVAL; + + if (vp->run.flags.root_sched_dispatched && + vp->vp_register_page->interrupt_vectors.as_uint64) + return -EBUSY; + + wake_up(&vp->run.vp_suspend_queue); + + return 0; +} +#else /* CONFIG_X86_64 */ +static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd) +{ + return -EOPNOTSUPP; +} +#endif + +static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd) +{ + struct mshv_partition *partition = irqfd->irqfd_partn; + struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq; + unsigned int seq; + int idx; + + WARN_ON(irqfd->irqfd_resampler && + !irq->lapic_control.level_triggered); + + idx = srcu_read_lock(&partition->pt_irq_srcu); + if (irqfd->irqfd_girq_ent.guest_irq_num) { + if (!irqfd->irqfd_girq_ent.girq_entry_valid) { + srcu_read_unlock(&partition->pt_irq_srcu, idx); + return; + } + + do { + seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc); + } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq)); + } + + hv_call_assert_virtual_interrupt(irqfd->irqfd_partn->pt_id, + irq->lapic_vector, irq->lapic_apic_id, + irq->lapic_control); + srcu_read_unlock(&partition->pt_irq_srcu, idx); +} + +static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd) +{ + struct mshv_irqfd_resampler *rp = irqfd->irqfd_resampler; + struct mshv_partition *pt = rp->rsmplr_partn; + + mutex_lock(&pt->irqfds_resampler_lock); + + hlist_del_rcu(&irqfd->irqfd_resampler_hnode); + synchronize_srcu(&pt->pt_irq_srcu); + + if (hlist_empty(&rp->rsmplr_irqfd_list)) { + hlist_del(&rp->rsmplr_hnode); + mshv_unregister_irq_ack_notifier(pt, &rp->rsmplr_notifier); + kfree(rp); + } + + mutex_unlock(&pt->irqfds_resampler_lock); +} + +/* + * Race-free decouple logic (ordering is critical) + */ +static void mshv_irqfd_shutdown(struct work_struct *work) +{ + struct mshv_irqfd *irqfd = + container_of(work, struct mshv_irqfd, irqfd_shutdown); + + /* + * Synchronize with the wait-queue and unhook ourselves to prevent + * further events. 
+ */ + remove_wait_queue(irqfd->irqfd_wqh, &irqfd->irqfd_wait); + + if (irqfd->irqfd_resampler) { + mshv_irqfd_resampler_shutdown(irqfd); + eventfd_ctx_put(irqfd->irqfd_resamplefd); + } + + /* + * It is now safe to release the object's resources + */ + eventfd_ctx_put(irqfd->irqfd_eventfd_ctx); + kfree(irqfd); +} + +/* assumes partition->pt_irqfds_lock is held */ +static bool mshv_irqfd_is_active(struct mshv_irqfd *irqfd) +{ + return !hlist_unhashed(&irqfd->irqfd_hnode); +} + +/* + * Mark the irqfd as inactive and schedule it for removal + * + * assumes partition->pt_irqfds_lock is held + */ +static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd) +{ + if (!mshv_irqfd_is_active(irqfd)) + return; + + hlist_del(&irqfd->irqfd_hnode); + + queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown); +} + +/* + * Called with wqh->lock held and interrupts disabled + */ +static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key) +{ + struct mshv_irqfd *irqfd = container_of(wait, struct mshv_irqfd, + irqfd_wait); + unsigned long flags = (unsigned long)key; + int idx; + unsigned int seq; + struct mshv_partition *pt = irqfd->irqfd_partn; + int ret = 0; + + if (flags & POLLIN) { + u64 cnt; + + eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt); + idx = srcu_read_lock(&pt->pt_irq_srcu); + do { + seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc); + } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq)); + + /* An event has been signaled, raise an interrupt */ + ret = mshv_try_assert_irq_fast(irqfd); + if (ret) + mshv_assert_irq_slow(irqfd); + + srcu_read_unlock(&pt->pt_irq_srcu, idx); + + ret = 1; + } + + if (flags & POLLHUP) { + /* The eventfd is closing, detach from the partition */ + unsigned long flags; + + spin_lock_irqsave(&pt->pt_irqfds_lock, flags); + + /* + * We must check if someone deactivated the irqfd before + * we could acquire the pt_irqfds_lock since the item is + * deactivated from the mshv side before it is unhooked from + * the wait-queue. If it is already deactivated, we can + * simply return knowing the other side will cleanup for us. 
+ * We cannot race against the irqfd going away since the + * other side is required to acquire wqh->lock, which we hold + */ + if (mshv_irqfd_is_active(irqfd)) + mshv_irqfd_deactivate(irqfd); + + spin_unlock_irqrestore(&pt->pt_irqfds_lock, flags); + } + + return ret; +} + +/* Must be called under pt_irqfds_lock */ +static void mshv_irqfd_update(struct mshv_partition *pt, + struct mshv_irqfd *irqfd) +{ + write_seqcount_begin(&irqfd->irqfd_irqe_sc); + irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt, + irqfd->irqfd_irqnum); + mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq); + write_seqcount_end(&irqfd->irqfd_irqe_sc); +} + +void mshv_irqfd_routing_update(struct mshv_partition *pt) +{ + struct mshv_irqfd *irqfd; + + spin_lock_irq(&pt->pt_irqfds_lock); + hlist_for_each_entry(irqfd, &pt->pt_irqfds_list, irqfd_hnode) + mshv_irqfd_update(pt, irqfd); + spin_unlock_irq(&pt->pt_irqfds_lock); +} + +static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh, + poll_table *polltbl) +{ + struct mshv_irqfd *irqfd = + container_of(polltbl, struct mshv_irqfd, irqfd_polltbl); + + irqfd->irqfd_wqh = wqh; + add_wait_queue_priority(wqh, &irqfd->irqfd_wait); +} + +static int mshv_irqfd_assign(struct mshv_partition *pt, + struct mshv_user_irqfd *args) +{ + struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; + struct mshv_irqfd *irqfd, *tmp; + unsigned int events; + struct fd f; + int ret; + int idx; + + irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); + if (!irqfd) + return -ENOMEM; + + irqfd->irqfd_partn = pt; + irqfd->irqfd_irqnum = args->gsi; + INIT_WORK(&irqfd->irqfd_shutdown, mshv_irqfd_shutdown); + seqcount_spinlock_init(&irqfd->irqfd_irqe_sc, &pt->pt_irqfds_lock); + + f = fdget(args->fd); + if (!fd_file(f)) { + ret = -EBADF; + goto out; + } + + eventfd = eventfd_ctx_fileget(fd_file(f)); + if (IS_ERR(eventfd)) { + ret = PTR_ERR(eventfd); + goto fail; + } + + irqfd->irqfd_eventfd_ctx = eventfd; + + if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE)) { + struct mshv_irqfd_resampler *rp; + + resamplefd = eventfd_ctx_fdget(args->resamplefd); + if (IS_ERR(resamplefd)) { + ret = PTR_ERR(resamplefd); + goto fail; + } + + irqfd->irqfd_resamplefd = resamplefd; + + mutex_lock(&pt->irqfds_resampler_lock); + + hlist_for_each_entry(rp, &pt->irqfds_resampler_list, + rsmplr_hnode) { + if (rp->rsmplr_notifier.irq_ack_gsi == + irqfd->irqfd_irqnum) { + irqfd->irqfd_resampler = rp; + break; + } + } + + if (!irqfd->irqfd_resampler) { + rp = kzalloc(sizeof(*rp), GFP_KERNEL_ACCOUNT); + if (!rp) { + ret = -ENOMEM; + mutex_unlock(&pt->irqfds_resampler_lock); + goto fail; + } + + rp->rsmplr_partn = pt; + INIT_HLIST_HEAD(&rp->rsmplr_irqfd_list); + rp->rsmplr_notifier.irq_ack_gsi = irqfd->irqfd_irqnum; + rp->rsmplr_notifier.irq_acked = + mshv_irqfd_resampler_ack; + + hlist_add_head(&rp->rsmplr_hnode, + &pt->irqfds_resampler_list); + mshv_register_irq_ack_notifier(pt, + &rp->rsmplr_notifier); + irqfd->irqfd_resampler = rp; + } + + hlist_add_head_rcu(&irqfd->irqfd_resampler_hnode, + &irqfd->irqfd_resampler->rsmplr_irqfd_list); + + mutex_unlock(&pt->irqfds_resampler_lock); + } + + /* + * Install our own custom wake-up handling so we are notified via + * a callback whenever someone signals the underlying eventfd + */ + init_waitqueue_func_entry(&irqfd->irqfd_wait, mshv_irqfd_wakeup); + init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc); + + spin_lock_irq(&pt->pt_irqfds_lock); + if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) && + !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) { + /* + 
* Resample Fd must be for level triggered interrupt + * Otherwise return with failure + */ + spin_unlock_irq(&pt->pt_irqfds_lock); + ret = -EINVAL; + goto fail; + } + ret = 0; + hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) { + if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx) + continue; + /* This fd is used for another irq already. */ + ret = -EBUSY; + spin_unlock_irq(&pt->pt_irqfds_lock); + goto fail; + } + + idx = srcu_read_lock(&pt->pt_irq_srcu); + mshv_irqfd_update(pt, irqfd); + hlist_add_head(&irqfd->irqfd_hnode, &pt->pt_irqfds_list); + spin_unlock_irq(&pt->pt_irqfds_lock); + + /* + * Check if there was an event already pending on the eventfd + * before we registered, and trigger it as if we didn't miss it. + */ + events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl); + + if (events & POLLIN) + mshv_assert_irq_slow(irqfd); + + srcu_read_unlock(&pt->pt_irq_srcu, idx); + /* + * do not drop the file until the irqfd is fully initialized, otherwise + * we might race against the POLLHUP + */ + fdput(f); + + return 0; + +fail: + if (irqfd->irqfd_resampler) + mshv_irqfd_resampler_shutdown(irqfd); + + if (resamplefd && !IS_ERR(resamplefd)) + eventfd_ctx_put(resamplefd); + + if (eventfd && !IS_ERR(eventfd)) + eventfd_ctx_put(eventfd); + + fdput(f); + +out: + kfree(irqfd); + return ret; +} + +/* + * shutdown any irqfd's that match fd+gsi + */ +static int mshv_irqfd_deassign(struct mshv_partition *pt, + struct mshv_user_irqfd *args) +{ + struct mshv_irqfd *irqfd; + struct hlist_node *n; + struct eventfd_ctx *eventfd; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, + irqfd_hnode) { + if (irqfd->irqfd_eventfd_ctx == eventfd && + irqfd->irqfd_irqnum == args->gsi) + + mshv_irqfd_deactivate(irqfd); + } + + eventfd_ctx_put(eventfd); + + /* + * Block until we know all outstanding shutdown jobs have completed + * so that we guarantee there will not be any more interrupts on this + * gsi once this deassign function returns. + */ + flush_workqueue(irqfd_cleanup_wq); + + return 0; +} + +int mshv_set_unset_irqfd(struct mshv_partition *pt, + struct mshv_user_irqfd *args) +{ + if (args->flags & ~MSHV_IRQFD_FLAGS_MASK) + return -EINVAL; + + if (args->flags & BIT(MSHV_IRQFD_BIT_DEASSIGN)) + return mshv_irqfd_deassign(pt, args); + + return mshv_irqfd_assign(pt, args); +} + +/* + * This function is called as the mshv VM fd is being released. + * Shutdown all irqfds that still remain open + */ +static void mshv_irqfd_release(struct mshv_partition *pt) +{ + struct mshv_irqfd *irqfd; + struct hlist_node *n; + + spin_lock_irq(&pt->pt_irqfds_lock); + + hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, irqfd_hnode) + mshv_irqfd_deactivate(irqfd); + + spin_unlock_irq(&pt->pt_irqfds_lock); + + /* + * Block until we know all outstanding shutdown jobs have completed + * since we do not take a mshv_partition* reference. + */ + flush_workqueue(irqfd_cleanup_wq); +} + +int mshv_irqfd_wq_init(void) +{ + irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", 0, 0); + if (!irqfd_cleanup_wq) + return -ENOMEM; + + return 0; +} + +void mshv_irqfd_wq_cleanup(void) +{ + destroy_workqueue(irqfd_cleanup_wq); +} + +/* + * -------------------------------------------------------------------- + * ioeventfd: translate a MMIO memory write to an eventfd signal. + * + * userspace can register a MMIO address with an eventfd for receiving + * notification when the memory has been touched. 
+ * -------------------------------------------------------------------- + */ + +static void ioeventfd_release(struct mshv_ioeventfd *p, u64 partition_id) +{ + if (p->iovntfd_doorbell_id > 0) + mshv_unregister_doorbell(partition_id, p->iovntfd_doorbell_id); + eventfd_ctx_put(p->iovntfd_eventfd); + kfree(p); +} + +/* MMIO writes trigger an event if the addr/val match */ +static void ioeventfd_mmio_write(int doorbell_id, void *data) +{ + struct mshv_partition *partition = (struct mshv_partition *)data; + struct mshv_ioeventfd *p; + + rcu_read_lock(); + hlist_for_each_entry_rcu(p, &partition->ioeventfds_list, iovntfd_hnode) + if (p->iovntfd_doorbell_id == doorbell_id) { + eventfd_signal(p->iovntfd_eventfd); + break; + } + + rcu_read_unlock(); +} + +static bool ioeventfd_check_collision(struct mshv_partition *pt, + struct mshv_ioeventfd *p) + __must_hold(&pt->mutex) +{ + struct mshv_ioeventfd *_p; + + hlist_for_each_entry(_p, &pt->ioeventfds_list, iovntfd_hnode) + if (_p->iovntfd_addr == p->iovntfd_addr && + _p->iovntfd_length == p->iovntfd_length && + (_p->iovntfd_wildcard || p->iovntfd_wildcard || + _p->iovntfd_datamatch == p->iovntfd_datamatch)) + return true; + + return false; +} + +static int mshv_assign_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args) + __must_hold(&pt->mutex) +{ + struct mshv_ioeventfd *p; + struct eventfd_ctx *eventfd; + u64 doorbell_flags = 0; + int ret; + + /* This mutex is currently protecting ioeventfd.items list */ + WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex)); + + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO)) + return -EOPNOTSUPP; + + /* must be natural-word sized */ + switch (args->len) { + case 0: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_ANY; + break; + case 1: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_BYTE; + break; + case 2: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_WORD; + break; + case 4: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_DWORD; + break; + case 8: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_QWORD; + break; + default: + return -EINVAL; + } + + /* check for range overflow */ + if (args->addr + args->len < args->addr) + return -EINVAL; + + /* check for extra flags that we don't understand */ + if (args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) + return -EINVAL; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + ret = -ENOMEM; + goto fail; + } + + p->iovntfd_addr = args->addr; + p->iovntfd_length = args->len; + p->iovntfd_eventfd = eventfd; + + /* The datamatch feature is optional, otherwise this is a wildcard */ + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)) { + p->iovntfd_datamatch = args->datamatch; + } else { + p->iovntfd_wildcard = true; + doorbell_flags |= HV_DOORBELL_FLAG_TRIGGER_ANY_VALUE; + } + + if (ioeventfd_check_collision(pt, p)) { + ret = -EEXIST; + goto unlock_fail; + } + + ret = mshv_register_doorbell(pt->pt_id, ioeventfd_mmio_write, + (void *)pt, p->iovntfd_addr, + p->iovntfd_datamatch, doorbell_flags); + if (ret < 0) + goto unlock_fail; + + p->iovntfd_doorbell_id = ret; + + hlist_add_head_rcu(&p->iovntfd_hnode, &pt->ioeventfds_list); + + return 0; + +unlock_fail: + kfree(p); + +fail: + eventfd_ctx_put(eventfd); + + return ret; +} + +static int mshv_deassign_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args) + __must_hold(&pt->mutex) +{ + struct mshv_ioeventfd *p; + struct eventfd_ctx *eventfd; + struct hlist_node *n; + int ret = -ENOENT; + + /* This mutex 
is currently protecting ioeventfd.items list */ + WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex)); + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + hlist_for_each_entry_safe(p, n, &pt->ioeventfds_list, iovntfd_hnode) { + bool wildcard = !(args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)); + + if (p->iovntfd_eventfd != eventfd || + p->iovntfd_addr != args->addr || + p->iovntfd_length != args->len || + p->iovntfd_wildcard != wildcard) + continue; + + if (!p->iovntfd_wildcard && + p->iovntfd_datamatch != args->datamatch) + continue; + + hlist_del_rcu(&p->iovntfd_hnode); + synchronize_rcu(); + ioeventfd_release(p, pt->pt_id); + ret = 0; + break; + } + + eventfd_ctx_put(eventfd); + + return ret; +} + +int mshv_set_unset_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args) + __must_hold(&pt->mutex) +{ + if ((args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) || + mshv_field_nonzero(*args, rsvd)) + return -EINVAL; + + /* PIO not yet implemented */ + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO)) + return -EOPNOTSUPP; + + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DEASSIGN)) + return mshv_deassign_ioeventfd(pt, args); + + return mshv_assign_ioeventfd(pt, args); +} + +void mshv_eventfd_init(struct mshv_partition *pt) +{ + spin_lock_init(&pt->pt_irqfds_lock); + INIT_HLIST_HEAD(&pt->pt_irqfds_list); + + INIT_HLIST_HEAD(&pt->irqfds_resampler_list); + mutex_init(&pt->irqfds_resampler_lock); + + INIT_HLIST_HEAD(&pt->ioeventfds_list); +} + +void mshv_eventfd_release(struct mshv_partition *pt) +{ + struct hlist_head items; + struct hlist_node *n; + struct mshv_ioeventfd *p; + + hlist_move_list(&pt->ioeventfds_list, &items); + synchronize_rcu(); + + hlist_for_each_entry_safe(p, n, &items, iovntfd_hnode) { + hlist_del(&p->iovntfd_hnode); + ioeventfd_release(p, pt->pt_id); + } + + mshv_irqfd_release(pt); +} diff --git a/drivers/hv/mshv_eventfd.h b/drivers/hv/mshv_eventfd.h new file mode 100644 index 000000000000..332e7670a344 --- /dev/null +++ b/drivers/hv/mshv_eventfd.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * irqfd: Allows an fd to be used to inject an interrupt to the guest. + * ioeventfd: Allow an fd to be used to receive a signal from the guest. + * All credit goes to kvm developers. + */ + +#ifndef __LINUX_MSHV_EVENTFD_H +#define __LINUX_MSHV_EVENTFD_H + +#include + +#include "mshv.h" +#include "mshv_root.h" + +/* struct to contain list of irqfds sharing an irq. 
Updates are protected by + * partition.irqfds.resampler_lock + */ +struct mshv_irqfd_resampler { + struct mshv_partition *rsmplr_partn; + struct hlist_head rsmplr_irqfd_list; + struct mshv_irq_ack_notifier rsmplr_notifier; + struct hlist_node rsmplr_hnode; +}; + +struct mshv_irqfd { + struct mshv_partition *irqfd_partn; + struct eventfd_ctx *irqfd_eventfd_ctx; + struct mshv_guest_irq_ent irqfd_girq_ent; + seqcount_spinlock_t irqfd_irqe_sc; + u32 irqfd_irqnum; + struct mshv_lapic_irq irqfd_lapic_irq; + struct hlist_node irqfd_hnode; + poll_table irqfd_polltbl; + wait_queue_head_t *irqfd_wqh; + wait_queue_entry_t irqfd_wait; + struct work_struct irqfd_shutdown; + struct mshv_irqfd_resampler *irqfd_resampler; + struct eventfd_ctx *irqfd_resamplefd; + struct hlist_node irqfd_resampler_hnode; +}; + +void mshv_eventfd_init(struct mshv_partition *partition); +void mshv_eventfd_release(struct mshv_partition *partition); + +void mshv_register_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian); +void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian); +bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi); + +int mshv_set_unset_irqfd(struct mshv_partition *partition, + struct mshv_user_irqfd *args); + +int mshv_irqfd_wq_init(void); +void mshv_irqfd_wq_cleanup(void); + +struct mshv_ioeventfd { + struct hlist_node iovntfd_hnode; + u64 iovntfd_addr; + int iovntfd_length; + struct eventfd_ctx *iovntfd_eventfd; + u64 iovntfd_datamatch; + int iovntfd_doorbell_id; + bool iovntfd_wildcard; +}; + +int mshv_set_unset_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args); + +#endif /* __LINUX_MSHV_EVENTFD_H */ diff --git a/drivers/hv/mshv_irq.c b/drivers/hv/mshv_irq.c new file mode 100644 index 000000000000..d0fb9ef734f4 --- /dev/null +++ b/drivers/hv/mshv_irq.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * Authors: Microsoft Linux virtualization team + */ + +#include +#include +#include +#include + +#include "mshv_eventfd.h" +#include "mshv.h" +#include "mshv_root.h" + +/* called from the ioctl code, user wants to update the guest irq table */ +int mshv_update_routing_table(struct mshv_partition *partition, + const struct mshv_user_irq_entry *ue, + unsigned int numents) +{ + struct mshv_girq_routing_table *new = NULL, *old; + u32 i, nr_rt_entries = 0; + int r = 0; + + if (numents == 0) + goto swap_routes; + + for (i = 0; i < numents; i++) { + if (ue[i].gsi >= MSHV_MAX_GUEST_IRQS) + return -EINVAL; + + if (ue[i].address_hi) + return -EINVAL; + + nr_rt_entries = max(nr_rt_entries, ue[i].gsi); + } + nr_rt_entries += 1; + + new = kzalloc(struct_size(new, mshv_girq_info_tbl, nr_rt_entries), + GFP_KERNEL_ACCOUNT); + if (!new) + return -ENOMEM; + + new->num_rt_entries = nr_rt_entries; + for (i = 0; i < numents; i++) { + struct mshv_guest_irq_ent *girq; + + girq = &new->mshv_girq_info_tbl[ue[i].gsi]; + + /* + * Allow only one to one mapping between GSI and MSI routing. 
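
A concrete illustration (hypothetical values, standard x86 MSI encoding) of one routing entry and of how mshv_copy_girq_info() further down decodes it:

	struct mshv_user_irq_entry ue = {
		.gsi        = 24,
		.address_lo = 0xfee01000,	/* destination APIC ID 1, physical mode */
		.address_hi = 0,		/* must be zero, rejected above otherwise */
		.data       = 0x0031,		/* vector 0x31, fixed delivery, edge */
	};

This decodes to lapic_vector = 0x31, lapic_apic_id = 1, interrupt_type = 0 (fixed), level_triggered = 0 and logical_dest_mode = 0.
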
+ */ + if (girq->guest_irq_num != 0) { + r = -EINVAL; + goto out; + } + + girq->guest_irq_num = ue[i].gsi; + girq->girq_addr_lo = ue[i].address_lo; + girq->girq_addr_hi = ue[i].address_hi; + girq->girq_irq_data = ue[i].data; + girq->girq_entry_valid = true; + } + +swap_routes: + mutex_lock(&partition->pt_irq_lock); + old = rcu_dereference_protected(partition->pt_girq_tbl, 1); + rcu_assign_pointer(partition->pt_girq_tbl, new); + mshv_irqfd_routing_update(partition); + mutex_unlock(&partition->pt_irq_lock); + + synchronize_srcu_expedited(&partition->pt_irq_srcu); + new = old; + +out: + kfree(new); + + return r; +} + +/* vm is going away, kfree the irq routing table */ +void mshv_free_routing_table(struct mshv_partition *partition) +{ + struct mshv_girq_routing_table *rt = + rcu_access_pointer(partition->pt_girq_tbl); + + kfree(rt); +} + +struct mshv_guest_irq_ent +mshv_ret_girq_entry(struct mshv_partition *partition, u32 irqnum) +{ + struct mshv_guest_irq_ent entry = { 0 }; + struct mshv_girq_routing_table *girq_tbl; + + girq_tbl = srcu_dereference_check(partition->pt_girq_tbl, + &partition->pt_irq_srcu, + lockdep_is_held(&partition->pt_irq_lock)); + if (!girq_tbl || irqnum >= girq_tbl->num_rt_entries) { + /* + * Premature register_irqfd, setting valid_entry = 0 + * would ignore this entry anyway + */ + entry.guest_irq_num = irqnum; + return entry; + } + + return girq_tbl->mshv_girq_info_tbl[irqnum]; +} + +void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent, + struct mshv_lapic_irq *lirq) +{ + memset(lirq, 0, sizeof(*lirq)); + if (!ent || !ent->girq_entry_valid) + return; + + lirq->lapic_vector = ent->girq_irq_data & 0xFF; + lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF; + lirq->lapic_control.interrupt_type = (ent->girq_irq_data & 0x700) >> 8; + lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1; + lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1; +} diff --git a/drivers/hv/mshv_portid_table.c b/drivers/hv/mshv_portid_table.c new file mode 100644 index 000000000000..c349af1f0aaa --- /dev/null +++ b/drivers/hv/mshv_portid_table.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include "mshv.h" +#include "mshv_root.h" + +/* + * Ports and connections are hypervisor struct used for inter-partition + * communication. Port represents the source and connection represents + * the destination. Partitions are responsible for managing the port and + * connection ids. 
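
A rough sketch of how the port-id helpers below are meant to be used, e.g. from a doorbell registration path; the callback, its data and HV_PORT_TYPE_DOORBELL are assumptions for illustration, not taken from this file:

	struct port_table_info *info;
	int port_id;

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	info->hv_port_type = HV_PORT_TYPE_DOORBELL;	/* assumed enum value */
	info->hv_port_doorbell.doorbell_cb = cb;	/* hypothetical callback */
	info->hv_port_doorbell.data = cb_data;

	port_id = mshv_portid_alloc(info);	/* this id names the port from now on */
	if (port_id < 0)
		kfree(info);

A consumer can later call mshv_portid_lookup(port_id, &copy) to retrieve a copy of the entry, and teardown calls mshv_portid_free(port_id), which defers the kfree past an RCU grace period.
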
+ * + */ + +#define PORTID_MIN 1 +#define PORTID_MAX INT_MAX + +static DEFINE_IDR(port_table_idr); + +void +mshv_port_table_fini(void) +{ + struct port_table_info *port_info; + unsigned long i, tmp; + + idr_lock(&port_table_idr); + if (!idr_is_empty(&port_table_idr)) { + idr_for_each_entry_ul(&port_table_idr, port_info, tmp, i) { + port_info = idr_remove(&port_table_idr, i); + kfree_rcu(port_info, portbl_rcu); + } + } + idr_unlock(&port_table_idr); +} + +int +mshv_portid_alloc(struct port_table_info *info) +{ + int ret = 0; + + idr_lock(&port_table_idr); + ret = idr_alloc(&port_table_idr, info, PORTID_MIN, + PORTID_MAX, GFP_KERNEL); + idr_unlock(&port_table_idr); + + return ret; +} + +void +mshv_portid_free(int port_id) +{ + struct port_table_info *info; + + idr_lock(&port_table_idr); + info = idr_remove(&port_table_idr, port_id); + WARN_ON(!info); + idr_unlock(&port_table_idr); + + synchronize_rcu(); + kfree(info); +} + +int +mshv_portid_lookup(int port_id, struct port_table_info *info) +{ + struct port_table_info *_info; + int ret = -ENOENT; + + rcu_read_lock(); + _info = idr_find(&port_table_idr, port_id); + rcu_read_unlock(); + + if (_info) { + *info = *_info; + ret = 0; + } + + return ret; +} diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h new file mode 100644 index 000000000000..e3931b0f1269 --- /dev/null +++ b/drivers/hv/mshv_root.h @@ -0,0 +1,311 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2023, Microsoft Corporation. + */ + +#ifndef _MSHV_ROOT_H_ +#define _MSHV_ROOT_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Hypervisor must be between these version numbers (inclusive) + * to guarantee compatibility + */ +#define MSHV_HV_MIN_VERSION (27744) +#define MSHV_HV_MAX_VERSION (27751) + +static_assert(HV_HYP_PAGE_SIZE == MSHV_HV_PAGE_SIZE); + +#define MSHV_MAX_VPS 256 + +#define MSHV_PARTITIONS_HASH_BITS 9 + +#define MSHV_PIN_PAGES_BATCH_SIZE (0x10000000ULL / HV_HYP_PAGE_SIZE) + +struct mshv_vp { + u32 vp_index; + struct mshv_partition *vp_partition; + struct mutex vp_mutex; + struct hv_vp_register_page *vp_register_page; + struct hv_message *vp_intercept_msg_page; + void *vp_ghcb_page; + struct hv_stats_page *vp_stats_pages[2]; + struct { + atomic64_t vp_signaled_count; + struct { + u64 intercept_suspend: 1; + u64 root_sched_blocked: 1; /* root scheduler only */ + u64 root_sched_dispatched: 1; /* root scheduler only */ + u64 reserved: 61; + } flags; + unsigned int kicked_by_hv; + wait_queue_head_t vp_suspend_queue; + } run; +}; + +#define vp_fmt(fmt) "p%lluvp%u: " fmt +#define vp_devprintk(level, v, fmt, ...) \ +do { \ + const struct mshv_vp *__vp = (v); \ + const struct mshv_partition *__pt = __vp->vp_partition; \ + dev_##level(__pt->pt_module_dev, vp_fmt(fmt), __pt->pt_id, \ + __vp->vp_index, ##__VA_ARGS__); \ +} while (0) +#define vp_emerg(v, fmt, ...) vp_devprintk(emerg, v, fmt, ##__VA_ARGS__) +#define vp_crit(v, fmt, ...) vp_devprintk(crit, v, fmt, ##__VA_ARGS__) +#define vp_alert(v, fmt, ...) vp_devprintk(alert, v, fmt, ##__VA_ARGS__) +#define vp_err(v, fmt, ...) vp_devprintk(err, v, fmt, ##__VA_ARGS__) +#define vp_warn(v, fmt, ...) vp_devprintk(warn, v, fmt, ##__VA_ARGS__) +#define vp_notice(v, fmt, ...) vp_devprintk(notice, v, fmt, ##__VA_ARGS__) +#define vp_info(v, fmt, ...) vp_devprintk(info, v, fmt, ##__VA_ARGS__) +#define vp_dbg(v, fmt, ...) 
vp_devprintk(dbg, v, fmt, ##__VA_ARGS__) + +struct mshv_mem_region { + struct hlist_node hnode; + u64 nr_pages; + u64 start_gfn; + u64 start_uaddr; + u32 hv_map_flags; + struct { + u64 large_pages: 1; /* 2MiB */ + u64 range_pinned: 1; + u64 reserved: 62; + } flags; + struct mshv_partition *partition; + struct page *pages[]; +}; + +struct mshv_irq_ack_notifier { + struct hlist_node link; + unsigned int irq_ack_gsi; + void (*irq_acked)(struct mshv_irq_ack_notifier *mian); +}; + +struct mshv_partition { + struct device *pt_module_dev; + + struct hlist_node pt_hnode; + u64 pt_id; + refcount_t pt_ref_count; + struct mutex pt_mutex; + struct hlist_head pt_mem_regions; // not ordered + + u32 pt_vp_count; + struct mshv_vp *pt_vp_array[MSHV_MAX_VPS]; + + struct mutex pt_irq_lock; + struct srcu_struct pt_irq_srcu; + struct hlist_head irq_ack_notifier_list; + + struct hlist_head pt_devices; + + /* + * MSHV does not support more than one async hypercall in flight + * for a single partition. Thus, it is okay to define per partition + * async hypercall status. + */ + struct completion async_hypercall; + u64 async_hypercall_status; + + spinlock_t pt_irqfds_lock; + struct hlist_head pt_irqfds_list; + struct mutex irqfds_resampler_lock; + struct hlist_head irqfds_resampler_list; + + struct hlist_head ioeventfds_list; + + struct mshv_girq_routing_table __rcu *pt_girq_tbl; + u64 isolation_type; + bool import_completed; + bool pt_initialized; +}; + +#define pt_fmt(fmt) "p%llu: " fmt +#define pt_devprintk(level, p, fmt, ...) \ +do { \ + const struct mshv_partition *__pt = (p); \ + dev_##level(__pt->pt_module_dev, pt_fmt(fmt), __pt->pt_id, \ + ##__VA_ARGS__); \ +} while (0) +#define pt_emerg(p, fmt, ...) pt_devprintk(emerg, p, fmt, ##__VA_ARGS__) +#define pt_crit(p, fmt, ...) pt_devprintk(crit, p, fmt, ##__VA_ARGS__) +#define pt_alert(p, fmt, ...) pt_devprintk(alert, p, fmt, ##__VA_ARGS__) +#define pt_err(p, fmt, ...) pt_devprintk(err, p, fmt, ##__VA_ARGS__) +#define pt_warn(p, fmt, ...) pt_devprintk(warn, p, fmt, ##__VA_ARGS__) +#define pt_notice(p, fmt, ...) pt_devprintk(notice, p, fmt, ##__VA_ARGS__) +#define pt_info(p, fmt, ...) pt_devprintk(info, p, fmt, ##__VA_ARGS__) +#define pt_dbg(p, fmt, ...) pt_devprintk(dbg, p, fmt, ##__VA_ARGS__) + +struct mshv_lapic_irq { + u32 lapic_vector; + u64 lapic_apic_id; + union hv_interrupt_control lapic_control; +}; + +#define MSHV_MAX_GUEST_IRQS 4096 + +/* representation of one guest irq entry, either msi or legacy */ +struct mshv_guest_irq_ent { + u32 girq_entry_valid; /* vfio looks at this */ + u32 guest_irq_num; /* a unique number for each irq */ + u32 girq_addr_lo; /* guest irq msi address info */ + u32 girq_addr_hi; + u32 girq_irq_data; /* idt vector in some cases */ +}; + +struct mshv_girq_routing_table { + u32 num_rt_entries; + struct mshv_guest_irq_ent mshv_girq_info_tbl[]; +}; + +struct hv_synic_pages { + struct hv_message_page *synic_message_page; + struct hv_synic_event_flags_page *synic_event_flags_page; + struct hv_synic_event_ring_page *synic_event_ring_page; +}; + +struct mshv_root { + struct hv_synic_pages __percpu *synic_pages; + spinlock_t pt_ht_lock; + DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS); +}; + +/* + * Callback for doorbell events. + * NOTE: This is called in interrupt context. Callback + * should defer slow and sleeping logic to later. 
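
A sketch of a conforming callback (all names hypothetical); ioeventfd_mmio_write() earlier in this patch is a real example that only signals an eventfd:

	static void example_doorbell_cb(int doorbell_id, void *data)
	{
		struct example_dev *dev = data;

		/* interrupt context: no sleeping or blocking locks in here */
		atomic_inc(&dev->doorbell_count);
		schedule_work(&dev->slow_path_work);	/* heavy lifting runs later */
	}
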
+ */ +typedef void (*doorbell_cb_t) (int doorbell_id, void *); + +/* + * port table information + */ +struct port_table_info { + struct rcu_head portbl_rcu; + enum hv_port_type hv_port_type; + union { + struct { + u64 reserved[2]; + } hv_port_message; + struct { + u64 reserved[2]; + } hv_port_event; + struct { + u64 reserved[2]; + } hv_port_monitor; + struct { + doorbell_cb_t doorbell_cb; + void *data; + } hv_port_doorbell; + }; +}; + +int mshv_update_routing_table(struct mshv_partition *partition, + const struct mshv_user_irq_entry *entries, + unsigned int numents); +void mshv_free_routing_table(struct mshv_partition *partition); + +struct mshv_guest_irq_ent mshv_ret_girq_entry(struct mshv_partition *partition, + u32 irq_num); + +void mshv_copy_girq_info(struct mshv_guest_irq_ent *src_irq, + struct mshv_lapic_irq *dest_irq); + +void mshv_irqfd_routing_update(struct mshv_partition *partition); + +void mshv_port_table_fini(void); +int mshv_portid_alloc(struct port_table_info *info); +int mshv_portid_lookup(int port_id, struct port_table_info *info); +void mshv_portid_free(int port_id); + +int mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb, + void *data, u64 gpa, u64 val, u64 flags); +void mshv_unregister_doorbell(u64 partition_id, int doorbell_portid); + +void mshv_isr(void); +int mshv_synic_init(unsigned int cpu); +int mshv_synic_cleanup(unsigned int cpu); + +static inline bool mshv_partition_encrypted(struct mshv_partition *partition) +{ + return partition->isolation_type == HV_PARTITION_ISOLATION_TYPE_SNP; +} + +struct mshv_partition *mshv_partition_get(struct mshv_partition *partition); +void mshv_partition_put(struct mshv_partition *partition); +struct mshv_partition *mshv_partition_find(u64 partition_id) __must_hold(RCU); + +/* hypercalls */ + +int hv_call_withdraw_memory(u64 count, int node, u64 partition_id); +int hv_call_create_partition(u64 flags, + struct hv_partition_creation_properties creation_properties, + union hv_partition_isolation_properties isolation_properties, + u64 *partition_id); +int hv_call_initialize_partition(u64 partition_id); +int hv_call_finalize_partition(u64 partition_id); +int hv_call_delete_partition(u64 partition_id); +int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs); +int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count, + u32 flags, struct page **pages); +int hv_call_unmap_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count, + u32 flags); +int hv_call_delete_vp(u64 partition_id, u32 vp_index); +int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector, + u64 dest_addr, + union hv_interrupt_control control); +int hv_call_clear_virtual_interrupt(u64 partition_id); +int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn, + union hv_gpa_page_access_state_flags state_flags, + int *written_total, + union hv_gpa_page_access_state *states); +int hv_call_get_vp_state(u32 vp_index, u64 partition_id, + struct hv_vp_state_data state_data, + /* Choose between pages and ret_output */ + u64 page_count, struct page **pages, + union hv_output_get_vp_state *ret_output); +int hv_call_set_vp_state(u32 vp_index, u64 partition_id, + /* Choose between pages and bytes */ + struct hv_vp_state_data state_data, u64 page_count, + struct page **pages, u32 num_bytes, u8 *bytes); +int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page); +int hv_call_unmap_vp_state_page(u64 partition_id, u32 
vp_index, u32 type, + union hv_input_vtl input_vtl); +int hv_call_create_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, struct hv_port_info *port_info, + u8 port_vtl, u8 min_connection_vtl, int node); +int hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id); +int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, + union hv_connection_id connection_id, + struct hv_connection_info *connection_info, + u8 connection_vtl, int node); +int hv_call_disconnect_port(u64 connection_partition_id, + union hv_connection_id connection_id); +int hv_call_notify_port_ring_empty(u32 sint_index); +int hv_call_map_stat_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr); +int hv_call_unmap_stat_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity); +int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, + u64 page_struct_count, u32 host_access, + u32 flags, u8 acquire); + +extern struct mshv_root mshv_root; +extern enum hv_scheduler_type hv_scheduler_type; +extern u8 * __percpu *hv_synic_eventring_tail; + +#endif /* _MSHV_ROOT_H_ */ diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c new file mode 100644 index 000000000000..a222a16107f6 --- /dev/null +++ b/drivers/hv/mshv_root_hv_call.c @@ -0,0 +1,849 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * Hypercall helper functions used by the mshv_root module. + * + * Authors: Microsoft Linux virtualization team + */ + +#include +#include +#include + +#include "mshv_root.h" + +/* Determined empirically */ +#define HV_INIT_PARTITION_DEPOSIT_PAGES 208 +#define HV_MAP_GPA_DEPOSIT_PAGES 256 +#define HV_UMAP_GPA_PAGES 512 + +#define HV_PAGE_COUNT_2M_ALIGNED(pg_count) (!((pg_count) & (0x200 - 1))) + +#define HV_WITHDRAW_BATCH_SIZE (HV_HYP_PAGE_SIZE / sizeof(u64)) +#define HV_MAP_GPA_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_map_gpa_pages)) \ + / sizeof(u64)) +#define HV_GET_VP_STATE_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_get_vp_state)) \ + / sizeof(u64)) +#define HV_SET_VP_STATE_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_state)) \ + / sizeof(u64)) +#define HV_GET_GPA_ACCESS_STATES_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(union hv_gpa_page_access_state)) \ + / sizeof(union hv_gpa_page_access_state)) +#define HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT \ + ((HV_HYP_PAGE_SIZE - \ + sizeof(struct hv_input_modify_sparse_spa_page_host_access)) / \ + sizeof(u64)) + +int hv_call_withdraw_memory(u64 count, int node, u64 partition_id) +{ + struct hv_input_withdraw_memory *input_page; + struct hv_output_withdraw_memory *output_page; + struct page *page; + u16 completed; + unsigned long remaining = count; + u64 status; + int i; + unsigned long flags; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + output_page = page_address(page); + + while (remaining) { + local_irq_save(flags); + + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input_page, 0, sizeof(*input_page)); + input_page->partition_id = partition_id; + status = hv_do_rep_hypercall(HVCALL_WITHDRAW_MEMORY, + min(remaining, HV_WITHDRAW_BATCH_SIZE), + 0, input_page, output_page); + + local_irq_restore(flags); + + completed = hv_repcomp(status); + + for (i = 0; i < completed; i++) + __free_page(pfn_to_page(output_page->gpa_page_list[i])); + + 
if (!hv_result_success(status)) { + if (hv_result(status) == HV_STATUS_NO_RESOURCES) + status = HV_STATUS_SUCCESS; + break; + } + + remaining -= completed; + } + free_page((unsigned long)output_page); + + return hv_result_to_errno(status); +} + +int hv_call_create_partition(u64 flags, + struct hv_partition_creation_properties creation_properties, + union hv_partition_isolation_properties isolation_properties, + u64 *partition_id) +{ + struct hv_input_create_partition *input; + struct hv_output_create_partition *output; + u64 status; + int ret; + unsigned long irq_flags; + + do { + local_irq_save(irq_flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->flags = flags; + input->compatibility_version = HV_COMPATIBILITY_21_H2; + + memcpy(&input->partition_creation_properties, &creation_properties, + sizeof(creation_properties)); + + memcpy(&input->isolation_properties, &isolation_properties, + sizeof(isolation_properties)); + + status = hv_do_hypercall(HVCALL_CREATE_PARTITION, + input, output); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (hv_result_success(status)) + *partition_id = output->partition_id; + local_irq_restore(irq_flags); + ret = hv_result_to_errno(status); + break; + } + local_irq_restore(irq_flags); + ret = hv_call_deposit_pages(NUMA_NO_NODE, + hv_current_partition_id, 1); + } while (!ret); + + return ret; +} + +int hv_call_initialize_partition(u64 partition_id) +{ + struct hv_input_initialize_partition input; + u64 status; + int ret; + + input.partition_id = partition_id; + + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, + HV_INIT_PARTITION_DEPOSIT_PAGES); + if (ret) + return ret; + + do { + status = hv_do_fast_hypercall8(HVCALL_INITIALIZE_PARTITION, + *(u64 *)&input); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_result_to_errno(status); + break; + } + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1); + } while (!ret); + + return ret; +} + +int hv_call_finalize_partition(u64 partition_id) +{ + struct hv_input_finalize_partition input; + u64 status; + + input.partition_id = partition_id; + status = hv_do_fast_hypercall8(HVCALL_FINALIZE_PARTITION, + *(u64 *)&input); + + return hv_result_to_errno(status); +} + +int hv_call_delete_partition(u64 partition_id) +{ + struct hv_input_delete_partition input; + u64 status; + + input.partition_id = partition_id; + status = hv_do_fast_hypercall8(HVCALL_DELETE_PARTITION, *(u64 *)&input); + + return hv_result_to_errno(status); +} + +/* Ask the hypervisor to map guest ram pages or the guest mmio space */ +static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count, + u32 flags, struct page **pages, u64 mmio_spa) +{ + struct hv_input_map_gpa_pages *input_page; + u64 status, *pfnlist; + unsigned long irq_flags, large_shift = 0; + int ret = 0, done = 0; + u64 page_count = page_struct_count; + + if (page_count == 0 || (pages && mmio_spa)) + return -EINVAL; + + if (flags & HV_MAP_GPA_LARGE_PAGE) { + if (mmio_spa) + return -EINVAL; + + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) + return -EINVAL; + + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; + page_count >>= large_shift; + } + + while (done < page_count) { + ulong i, completed, remain = page_count - done; + int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE); + + local_irq_save(irq_flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->target_partition_id = partition_id; + 
input_page->target_gpa_base = gfn + (done << large_shift); + input_page->map_flags = flags; + pfnlist = input_page->source_gpa_page_list; + + for (i = 0; i < rep_count; i++) + if (flags & HV_MAP_GPA_NO_ACCESS) { + pfnlist[i] = 0; + } else if (pages) { + u64 index = (done + i) << large_shift; + + if (index >= page_struct_count) { + ret = -EINVAL; + break; + } + pfnlist[i] = page_to_pfn(pages[index]); + } else { + pfnlist[i] = mmio_spa + done + i; + } + if (ret) + break; + + status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0, + input_page, NULL); + local_irq_restore(irq_flags); + + completed = hv_repcomp(status); + + if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, + HV_MAP_GPA_DEPOSIT_PAGES); + if (ret) + break; + + } else if (!hv_result_success(status)) { + ret = hv_result_to_errno(status); + break; + } + + done += completed; + } + + if (ret && done) { + u32 unmap_flags = 0; + + if (flags & HV_MAP_GPA_LARGE_PAGE) + unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; + hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags); + } + + return ret; +} + +/* Ask the hypervisor to map guest ram pages */ +int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count, + u32 flags, struct page **pages) +{ + return hv_do_map_gpa_hcall(partition_id, gpa_target, page_count, + flags, pages, 0); +} + +/* Ask the hypervisor to map guest mmio space */ +int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs) +{ + int i; + u32 flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE | + HV_MAP_GPA_NOT_CACHED; + + for (i = 0; i < numpgs; i++) + if (page_is_ram(mmio_spa + i)) + return -EINVAL; + + return hv_do_map_gpa_hcall(partition_id, gfn, numpgs, flags, NULL, + mmio_spa); +} + +int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k, + u32 flags) +{ + struct hv_input_unmap_gpa_pages *input_page; + u64 status, page_count = page_count_4k; + unsigned long irq_flags, large_shift = 0; + int ret = 0, done = 0; + + if (page_count == 0) + return -EINVAL; + + if (flags & HV_UNMAP_GPA_LARGE_PAGE) { + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) + return -EINVAL; + + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; + page_count >>= large_shift; + } + + while (done < page_count) { + ulong completed, remain = page_count - done; + int rep_count = min(remain, HV_UMAP_GPA_PAGES); + + local_irq_save(irq_flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->target_partition_id = partition_id; + input_page->target_gpa_base = gfn + (done << large_shift); + input_page->unmap_flags = flags; + status = hv_do_rep_hypercall(HVCALL_UNMAP_GPA_PAGES, rep_count, + 0, input_page, NULL); + local_irq_restore(irq_flags); + + completed = hv_repcomp(status); + if (!hv_result_success(status)) { + ret = hv_result_to_errno(status); + break; + } + + done += completed; + } + + return ret; +} + +int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn, + union hv_gpa_page_access_state_flags state_flags, + int *written_total, + union hv_gpa_page_access_state *states) +{ + struct hv_input_get_gpa_pages_access_state *input_page; + union hv_gpa_page_access_state *output_page; + int completed = 0; + unsigned long remaining = count; + int rep_count, i; + u64 status = 0; + unsigned long flags; + + *written_total = 0; + while (remaining) { + local_irq_save(flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); + + 
input_page->partition_id = partition_id; + input_page->hv_gpa_page_number = gpa_base_pfn + *written_total; + input_page->flags = state_flags; + rep_count = min(remaining, HV_GET_GPA_ACCESS_STATES_BATCH_SIZE); + + status = hv_do_rep_hypercall(HVCALL_GET_GPA_PAGES_ACCESS_STATES, rep_count, + 0, input_page, output_page); + if (!hv_result_success(status)) { + local_irq_restore(flags); + break; + } + completed = hv_repcomp(status); + for (i = 0; i < completed; ++i) + states[i].as_uint8 = output_page[i].as_uint8; + + local_irq_restore(flags); + states += completed; + *written_total += completed; + remaining -= completed; + } + + return hv_result_to_errno(status); +} + +int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector, + u64 dest_addr, + union hv_interrupt_control control) +{ + struct hv_input_assert_virtual_interrupt *input; + unsigned long flags; + u64 status; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->vector = vector; + input->dest_addr = dest_addr; + input->control = control; + status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL); + local_irq_restore(flags); + + return hv_result_to_errno(status); +} + +int hv_call_delete_vp(u64 partition_id, u32 vp_index) +{ + union hv_input_delete_vp input = {}; + u64 status; + + input.partition_id = partition_id; + input.vp_index = vp_index; + + status = hv_do_fast_hypercall16(HVCALL_DELETE_VP, + input.as_uint64[0], input.as_uint64[1]); + + return hv_result_to_errno(status); +} +EXPORT_SYMBOL_GPL(hv_call_delete_vp); + +int hv_call_get_vp_state(u32 vp_index, u64 partition_id, + struct hv_vp_state_data state_data, + /* Choose between pages and ret_output */ + u64 page_count, struct page **pages, + union hv_output_get_vp_state *ret_output) +{ + struct hv_input_get_vp_state *input; + union hv_output_get_vp_state *output; + u64 status; + int i; + u64 control; + unsigned long flags; + int ret = 0; + + if (page_count > HV_GET_VP_STATE_BATCH_SIZE) + return -EINVAL; + + if (!page_count && !ret_output) + return -EINVAL; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + memset(input, 0, sizeof(*input)); + memset(output, 0, sizeof(*output)); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->state_data = state_data; + for (i = 0; i < page_count; i++) + input->output_data_pfns[i] = page_to_pfn(pages[i]); + + control = (HVCALL_GET_VP_STATE) | + (page_count << HV_HYPERCALL_VARHEAD_OFFSET); + + status = hv_do_hypercall(control, input, output); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (hv_result_success(status) && ret_output) + memcpy(ret_output, output, sizeof(*output)); + + local_irq_restore(flags); + ret = hv_result_to_errno(status); + break; + } + local_irq_restore(flags); + + ret = hv_call_deposit_pages(NUMA_NO_NODE, + partition_id, 1); + } while (!ret); + + return ret; +} + +int hv_call_set_vp_state(u32 vp_index, u64 partition_id, + /* Choose between pages and bytes */ + struct hv_vp_state_data state_data, u64 page_count, + struct page **pages, u32 num_bytes, u8 *bytes) +{ + struct hv_input_set_vp_state *input; + u64 status; + int i; + u64 control; + unsigned long flags; + int ret = 0; + u16 varhead_sz; + + if (page_count > HV_SET_VP_STATE_BATCH_SIZE) + return -EINVAL; + if (sizeof(*input) + num_bytes > HV_HYP_PAGE_SIZE) + return -EINVAL; + + if (num_bytes) + /* round up to 8 and 
divide by 8 */ + varhead_sz = (num_bytes + 7) >> 3; + else if (page_count) + varhead_sz = page_count; + else + return -EINVAL; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->state_data = state_data; + if (num_bytes) { + memcpy((u8 *)input->data, bytes, num_bytes); + } else { + for (i = 0; i < page_count; i++) + input->data[i].pfns = page_to_pfn(pages[i]); + } + + control = (HVCALL_SET_VP_STATE) | + (varhead_sz << HV_HYPERCALL_VARHEAD_OFFSET); + + status = hv_do_hypercall(control, input, NULL); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + local_irq_restore(flags); + ret = hv_result_to_errno(status); + break; + } + local_irq_restore(flags); + + ret = hv_call_deposit_pages(NUMA_NO_NODE, + partition_id, 1); + } while (!ret); + + return ret; +} + +int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page) +{ + struct hv_input_map_vp_state_page *input; + struct hv_output_map_vp_state_page *output; + u64 status; + int ret; + unsigned long flags; + + do { + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->type = type; + input->input_vtl = input_vtl; + + status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, output); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (hv_result_success(status)) + *state_page = pfn_to_page(output->map_location); + local_irq_restore(flags); + ret = hv_result_to_errno(status); + break; + } + + local_irq_restore(flags); + + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1); + } while (!ret); + + return ret; +} + +int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl) +{ + unsigned long flags; + u64 status; + struct hv_input_unmap_vp_state_page *input; + + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, sizeof(*input)); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->type = type; + input->input_vtl = input_vtl; + + status = hv_do_hypercall(HVCALL_UNMAP_VP_STATE_PAGE, input, NULL); + + local_irq_restore(flags); + + return hv_result_to_errno(status); +} + +int +hv_call_clear_virtual_interrupt(u64 partition_id) +{ + int status; + + status = hv_do_fast_hypercall8(HVCALL_CLEAR_VIRTUAL_INTERRUPT, + partition_id); + + return hv_result_to_errno(status); +} + +int +hv_call_create_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, + struct hv_port_info *port_info, + u8 port_vtl, u8 min_connection_vtl, int node) +{ + struct hv_input_create_port *input; + unsigned long flags; + int ret = 0; + int status; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + + input->port_partition_id = port_partition_id; + input->port_id = port_id; + input->connection_partition_id = connection_partition_id; + input->port_info = *port_info; + input->port_vtl = port_vtl; + input->min_connection_vtl = min_connection_vtl; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); + status = hv_do_hypercall(HVCALL_CREATE_PORT, input, NULL); + local_irq_restore(flags); + if (hv_result_success(status)) + break; + + if (hv_result(status) != 
HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_result_to_errno(status); + break; + } + ret = hv_call_deposit_pages(NUMA_NO_NODE, port_partition_id, 1); + + } while (!ret); + + return ret; +} + +int +hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id) +{ + union hv_input_delete_port input = { 0 }; + int status; + + input.port_partition_id = port_partition_id; + input.port_id = port_id; + status = hv_do_fast_hypercall16(HVCALL_DELETE_PORT, + input.as_uint64[0], + input.as_uint64[1]); + + return hv_result_to_errno(status); +} + +int +hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, + union hv_connection_id connection_id, + struct hv_connection_info *connection_info, + u8 connection_vtl, int node) +{ + struct hv_input_connect_port *input; + unsigned long flags; + int ret = 0, status; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->port_partition_id = port_partition_id; + input->port_id = port_id; + input->connection_partition_id = connection_partition_id; + input->connection_id = connection_id; + input->connection_info = *connection_info; + input->connection_vtl = connection_vtl; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); + status = hv_do_hypercall(HVCALL_CONNECT_PORT, input, NULL); + + local_irq_restore(flags); + if (hv_result_success(status)) + break; + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_result_to_errno(status); + break; + } + ret = hv_call_deposit_pages(NUMA_NO_NODE, + connection_partition_id, 1); + } while (!ret); + + return ret; +} + +int +hv_call_disconnect_port(u64 connection_partition_id, + union hv_connection_id connection_id) +{ + union hv_input_disconnect_port input = { 0 }; + int status; + + input.connection_partition_id = connection_partition_id; + input.connection_id = connection_id; + input.is_doorbell = 1; + status = hv_do_fast_hypercall16(HVCALL_DISCONNECT_PORT, + input.as_uint64[0], + input.as_uint64[1]); + + return hv_result_to_errno(status); +} + +int +hv_call_notify_port_ring_empty(u32 sint_index) +{ + union hv_input_notify_port_ring_empty input = { 0 }; + int status; + + input.sint_index = sint_index; + status = hv_do_fast_hypercall8(HVCALL_NOTIFY_PORT_RING_EMPTY, + input.as_uint64); + + return hv_result_to_errno(status); +} + +int hv_call_map_stat_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr) +{ + unsigned long flags; + struct hv_input_map_stats_page *input; + struct hv_output_map_stats_page *output; + u64 status, pfn; + int ret = 0; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->type = type; + input->identity = *identity; + + status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE, input, output); + pfn = output->map_location; + + local_irq_restore(flags); + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_result_to_errno(status); + if (hv_result_success(status)) + break; + return ret; + } + + ret = hv_call_deposit_pages(NUMA_NO_NODE, + hv_current_partition_id, 1); + if (ret) + return ret; + } while (!ret); + + *addr = page_address(pfn_to_page(pfn)); + + return ret; +} + +int hv_call_unmap_stat_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity) +{ + unsigned long flags; + struct hv_input_unmap_stats_page *input; + u64 status; + + 
local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, sizeof(*input)); + input->type = type; + input->identity = *identity; + + status = hv_do_hypercall(HVCALL_UNMAP_STATS_PAGE, input, NULL); + local_irq_restore(flags); + + return hv_result_to_errno(status); +} + +int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, + u64 page_struct_count, u32 host_access, + u32 flags, u8 acquire) +{ + struct hv_input_modify_sparse_spa_page_host_access *input_page; + u64 status; + int done = 0; + unsigned long irq_flags, large_shift = 0; + u64 page_count = page_struct_count; + u16 code = acquire ? HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS : + HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS; + + if (page_count == 0) + return -EINVAL; + + if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE) { + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) + return -EINVAL; + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; + page_count >>= large_shift; + } + + while (done < page_count) { + ulong i, completed, remain = page_count - done; + int rep_count = min(remain, + HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT); + + local_irq_save(irq_flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input_page, 0, sizeof(*input_page)); + /* Only set the partition id if you are making the pages + * exclusive + */ + if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE) + input_page->partition_id = partition_id; + input_page->flags = flags; + input_page->host_access = host_access; + + for (i = 0; i < rep_count; i++) { + u64 index = (done + i) << large_shift; + + if (index >= page_struct_count) + return -EINVAL; + + input_page->spa_page_list[i] = + page_to_pfn(pages[index]); + } + + status = hv_do_rep_hypercall(code, rep_count, 0, input_page, + NULL); + local_irq_restore(irq_flags); + + completed = hv_repcomp(status); + + if (!hv_result_success(status)) + return hv_result_to_errno(status); + + done += completed; + } + + return 0; +} diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c new file mode 100644 index 000000000000..72df774e410a --- /dev/null +++ b/drivers/hv/mshv_root_main.c @@ -0,0 +1,2307 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. + * + * The main part of the mshv_root module, providing APIs to create + * and manage guest partitions. 
+ * + * Authors: Microsoft Linux virtualization team + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mshv_eventfd.h" +#include "mshv.h" +#include "mshv_root.h" + +MODULE_AUTHOR("Microsoft"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv"); + +/* TODO move this to mshyperv.h when needed outside driver */ +static inline bool hv_parent_partition(void) +{ + return hv_root_partition(); +} + +/* TODO move this to another file when debugfs code is added */ +enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */ +#if defined(CONFIG_X86) + VpRootDispatchThreadBlocked = 201, +#elif defined(CONFIG_ARM64) + VpRootDispatchThreadBlocked = 94, +#endif + VpStatsMaxCounter +}; + +struct hv_stats_page { + union { + u64 vp_cntrs[VpStatsMaxCounter]; /* VP counters */ + u8 data[HV_HYP_PAGE_SIZE]; + }; +} __packed; + +struct mshv_root mshv_root; + +enum hv_scheduler_type hv_scheduler_type; + +/* Once we implement the fast extended hypercall ABI they can go away. */ +static void * __percpu *root_scheduler_input; +static void * __percpu *root_scheduler_output; + +static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +static int mshv_dev_open(struct inode *inode, struct file *filp); +static int mshv_dev_release(struct inode *inode, struct file *filp); +static int mshv_vp_release(struct inode *inode, struct file *filp); +static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +static int mshv_partition_release(struct inode *inode, struct file *filp); +static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma); +static vm_fault_t mshv_vp_fault(struct vm_fault *vmf); +static int mshv_init_async_handler(struct mshv_partition *partition); +static void mshv_async_hvcall_handler(void *data, u64 *status); + +static const union hv_input_vtl input_vtl_zero; +static const union hv_input_vtl input_vtl_normal = { + .target_vtl = HV_NORMAL_VTL, + .use_target_vtl = 1, +}; + +static const struct vm_operations_struct mshv_vp_vm_ops = { + .fault = mshv_vp_fault, +}; + +static const struct file_operations mshv_vp_fops = { + .owner = THIS_MODULE, + .release = mshv_vp_release, + .unlocked_ioctl = mshv_vp_ioctl, + .llseek = noop_llseek, + .mmap = mshv_vp_mmap, +}; + +static const struct file_operations mshv_partition_fops = { + .owner = THIS_MODULE, + .release = mshv_partition_release, + .unlocked_ioctl = mshv_partition_ioctl, + .llseek = noop_llseek, +}; + +static const struct file_operations mshv_dev_fops = { + .owner = THIS_MODULE, + .open = mshv_dev_open, + .release = mshv_dev_release, + .unlocked_ioctl = mshv_dev_ioctl, + .llseek = noop_llseek, +}; + +static struct miscdevice mshv_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "mshv", + .fops = &mshv_dev_fops, + .mode = 0600, +}; + +/* + * Only allow hypercalls that have a u64 partition id as the first member of + * the input structure. + * These are sorted by value. 
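
The reason for the restriction shows up below: after copying in the user's input page, the handler overwrites its first 8 bytes with the partition's own id. As an abridged sketch (not the authoritative definition), every allowed call's input starts like this:

	struct hv_input_get_partition_property {
		u64 partition_id;	/* always first; rewritten to partition->pt_id */
		u32 property_code;	/* remaining fields pass through as-is */
		u32 padding;
	} __packed;
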
+ */ +static u16 mshv_passthru_hvcalls[] = { + HVCALL_GET_PARTITION_PROPERTY, + HVCALL_SET_PARTITION_PROPERTY, + HVCALL_INSTALL_INTERCEPT, + HVCALL_GET_VP_REGISTERS, + HVCALL_SET_VP_REGISTERS, + HVCALL_TRANSLATE_VIRTUAL_ADDRESS, + HVCALL_CLEAR_VIRTUAL_INTERRUPT, + HVCALL_REGISTER_INTERCEPT_RESULT, + HVCALL_ASSERT_VIRTUAL_INTERRUPT, + HVCALL_GET_GPA_PAGES_ACCESS_STATES, + HVCALL_SIGNAL_EVENT_DIRECT, + HVCALL_POST_MESSAGE_DIRECT, + HVCALL_GET_VP_CPUID_VALUES, +}; + +static bool mshv_hvcall_is_async(u16 code) +{ + switch (code) { + case HVCALL_SET_PARTITION_PROPERTY: + return true; + default: + break; + } + return false; +} + +static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, + bool partition_locked, + void __user *user_args) +{ + u64 status; + int ret = 0, i; + bool is_async; + struct mshv_root_hvcall args; + struct page *page; + unsigned int pages_order; + void *input_pg = NULL; + void *output_pg = NULL; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) || + mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE) + return -EINVAL; + + if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE)) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i) + if (args.code == mshv_passthru_hvcalls[i]) + break; + + if (i >= ARRAY_SIZE(mshv_passthru_hvcalls)) + return -EINVAL; + + is_async = mshv_hvcall_is_async(args.code); + if (is_async) { + /* async hypercalls can only be called from partition fd */ + if (!partition_locked) + return -EINVAL; + ret = mshv_init_async_handler(partition); + if (ret) + return ret; + } + + pages_order = args.out_ptr ? 1 : 0; + page = alloc_pages(GFP_KERNEL, pages_order); + if (!page) + return -ENOMEM; + input_pg = page_address(page); + + if (args.out_ptr) + output_pg = (char *)input_pg + PAGE_SIZE; + else + output_pg = NULL; + + if (copy_from_user(input_pg, (void __user *)args.in_ptr, + args.in_sz)) { + ret = -EFAULT; + goto free_pages_out; + } + + /* + * NOTE: This only works because all the allowed hypercalls' input + * structs begin with a u64 partition_id field. + */ + *(u64 *)input_pg = partition->pt_id; + + if (args.reps) + status = hv_do_rep_hypercall(args.code, args.reps, 0, + input_pg, output_pg); + else + status = hv_do_hypercall(args.code, input_pg, output_pg); + + if (hv_result(status) == HV_STATUS_CALL_PENDING) { + if (is_async) { + mshv_async_hvcall_handler(partition, &status); + } else { /* Paranoia check. This shouldn't happen! */ + ret = -EBADFD; + goto free_pages_out; + } + } + + if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1); + if (!ret) + ret = -EAGAIN; + } else if (!hv_result_success(status)) { + ret = hv_result_to_errno(status); + } + + /* + * Always return the status and output data regardless of result. + * The VMM may need it to determine how to proceed. E.g. the status may + * contain the number of reps completed if a rep hypercall partially + * succeeded. + */ + args.status = hv_result(status); + args.reps = args.reps ? 
hv_repcomp(status) : 0; + if (copy_to_user(user_args, &args, sizeof(args))) + ret = -EFAULT; + + if (output_pg && + copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz)) + ret = -EFAULT; + +free_pages_out: + free_pages((unsigned long)input_pg, pages_order); + + return ret; +} + +static inline bool is_ghcb_mapping_available(void) +{ +#if IS_ENABLED(CONFIG_X86_64) + return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE; +#else + return 0; +#endif +} + +static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, + struct hv_register_assoc *registers) +{ + return hv_call_get_vp_registers(vp_index, partition_id, + count, input_vtl_zero, registers); +} + +static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, + struct hv_register_assoc *registers) +{ + return hv_call_set_vp_registers(vp_index, partition_id, + count, input_vtl_zero, registers); +} + +/* + * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by + * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend, + * done by the hypervisor. + * "Intercept" suspend leads to asynchronous message delivery to dom0 which + * should be awaited to keep the VP loop consistent (i.e. no message pending + * upon VP resume). + * VP intercept suspend can't be done when the VP is explicitly suspended + * already, and thus can be only two possible race scenarios: + * 1. implicit suspend bit set -> explicit suspend bit set -> message sent + * 2. implicit suspend bit set -> message sent -> explicit suspend bit set + * Checking for implicit suspend bit set after explicit suspend request has + * succeeded in either case allows us to reliably identify, if there is a + * message to receive and deliver to VMM. + */ +static int +mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight) +{ + struct hv_register_assoc explicit_suspend = { + .name = HV_REGISTER_EXPLICIT_SUSPEND + }; + struct hv_register_assoc intercept_suspend = { + .name = HV_REGISTER_INTERCEPT_SUSPEND + }; + union hv_explicit_suspend_register *es = + &explicit_suspend.value.explicit_suspend; + union hv_intercept_suspend_register *is = + &intercept_suspend.value.intercept_suspend; + int ret; + + es->suspended = 1; + + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &explicit_suspend); + if (ret) { + vp_err(vp, "Failed to explicitly suspend vCPU\n"); + return ret; + } + + ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &intercept_suspend); + if (ret) { + vp_err(vp, "Failed to get intercept suspend state\n"); + return ret; + } + + *message_in_flight = is->suspended; + + return 0; +} + +/* + * This function is used when VPs are scheduled by the hypervisor's + * scheduler. + * + * Caller has to make sure the registers contain cleared + * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers + * exactly in this order (the hypervisor clears them sequentially) to avoid + * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND + * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the + * opposite order. 
+ */ +static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp) +{ + long ret; + struct hv_register_assoc suspend_regs[2] = { + { .name = HV_REGISTER_INTERCEPT_SUSPEND }, + { .name = HV_REGISTER_EXPLICIT_SUSPEND } + }; + size_t count = ARRAY_SIZE(suspend_regs); + + /* Resume VP execution */ + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + count, suspend_regs); + if (ret) { + vp_err(vp, "Failed to resume vp execution. %lx\n", ret); + return ret; + } + + ret = wait_event_interruptible(vp->run.vp_suspend_queue, + vp->run.kicked_by_hv == 1); + if (ret) { + bool message_in_flight; + + /* + * Otherwise the waiting was interrupted by a signal: suspend + * the vCPU explicitly and copy message in flight (if any). + */ + ret = mshv_suspend_vp(vp, &message_in_flight); + if (ret) + return ret; + + /* Return if no message in flight */ + if (!message_in_flight) + return -EINTR; + + /* Wait for the message in flight. */ + wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1); + } + + /* + * Reset the flag to make the wait_event call above work + * next time. + */ + vp->run.kicked_by_hv = 0; + + return 0; +} + +static int +mshv_vp_dispatch(struct mshv_vp *vp, u32 flags, + struct hv_output_dispatch_vp *res) +{ + struct hv_input_dispatch_vp *input; + struct hv_output_dispatch_vp *output; + u64 status; + + preempt_disable(); + input = *this_cpu_ptr(root_scheduler_input); + output = *this_cpu_ptr(root_scheduler_output); + + memset(input, 0, sizeof(*input)); + memset(output, 0, sizeof(*output)); + + input->partition_id = vp->vp_partition->pt_id; + input->vp_index = vp->vp_index; + input->time_slice = 0; /* Run forever until something happens */ + input->spec_ctrl = 0; /* TODO: set sensible flags */ + input->flags = flags; + + vp->run.flags.root_sched_dispatched = 1; + status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output); + vp->run.flags.root_sched_dispatched = 0; + + *res = *output; + preempt_enable(); + + if (!hv_result_success(status)) + vp_err(vp, "%s: status %s\n", __func__, + hv_result_to_string(status)); + + return hv_result_to_errno(status); +} + +static int +mshv_vp_clear_explicit_suspend(struct mshv_vp *vp) +{ + struct hv_register_assoc explicit_suspend = { + .name = HV_REGISTER_EXPLICIT_SUSPEND, + .value.explicit_suspend.suspended = 0, + }; + int ret; + + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &explicit_suspend); + + if (ret) + vp_err(vp, "Failed to unsuspend\n"); + + return ret; +} + +#if IS_ENABLED(CONFIG_X86_64) +static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) +{ + if (!vp->vp_register_page) + return 0; + return vp->vp_register_page->interrupt_vectors.as_uint64; +} +#else +static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) +{ + return 0; +} +#endif + +static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp) +{ + struct hv_stats_page **stats = vp->vp_stats_pages; + u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->vp_cntrs; + u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->vp_cntrs; + + if (self_vp_cntrs[VpRootDispatchThreadBlocked]) + return self_vp_cntrs[VpRootDispatchThreadBlocked]; + return parent_vp_cntrs[VpRootDispatchThreadBlocked]; +} + +static int +mshv_vp_wait_for_hv_kick(struct mshv_vp *vp) +{ + int ret; + + ret = wait_event_interruptible(vp->run.vp_suspend_queue, + (vp->run.kicked_by_hv == 1 && + !mshv_vp_dispatch_thread_blocked(vp)) || + mshv_vp_interrupt_pending(vp)); + if (ret) + return -EINTR; + + vp->run.flags.root_sched_blocked = 0; + vp->run.kicked_by_hv = 0; + + return 0; +} + 
+static int mshv_pre_guest_mode_work(struct mshv_vp *vp) +{ + const ulong work_flags = _TIF_NOTIFY_SIGNAL | _TIF_SIGPENDING | + _TIF_NEED_RESCHED | _TIF_NOTIFY_RESUME; + ulong th_flags; + + th_flags = read_thread_flags(); + while (th_flags & work_flags) { + int ret; + + /* nb: following will call schedule */ + ret = mshv_do_pre_guest_mode_work(th_flags); + + if (ret) + return ret; + + th_flags = read_thread_flags(); + } + + return 0; +} + +/* Must be called with interrupts enabled */ +static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) +{ + long ret; + + if (vp->run.flags.root_sched_blocked) { + /* + * Dispatch state of this VP is blocked. Need to wait + * for the hypervisor to clear the blocked state before + * dispatching it. + */ + ret = mshv_vp_wait_for_hv_kick(vp); + if (ret) + return ret; + } + + do { + u32 flags = 0; + struct hv_output_dispatch_vp output; + + ret = mshv_pre_guest_mode_work(vp); + if (ret) + break; + + if (vp->run.flags.intercept_suspend) + flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND; + + if (mshv_vp_interrupt_pending(vp)) + flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION; + + ret = mshv_vp_dispatch(vp, flags, &output); + if (ret) + break; + + vp->run.flags.intercept_suspend = 0; + + if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) { + if (output.dispatch_event == + HV_VP_DISPATCH_EVENT_SUSPEND) { + /* + * TODO: remove the warning once VP canceling + * is supported + */ + WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count), + "%s: vp#%d: unexpected explicit suspend\n", + __func__, vp->vp_index); + /* + * Need to clear explicit suspend before + * dispatching. + * Explicit suspend is either: + * - set right after the first VP dispatch or + * - set explicitly via hypercall + * Since the latter case is not yet supported, + * simply clear it here. + */ + ret = mshv_vp_clear_explicit_suspend(vp); + if (ret) + break; + + ret = mshv_vp_wait_for_hv_kick(vp); + if (ret) + break; + } else { + vp->run.flags.root_sched_blocked = 1; + ret = mshv_vp_wait_for_hv_kick(vp); + if (ret) + break; + } + } else { + /* HV_VP_DISPATCH_STATE_READY */ + if (output.dispatch_event == + HV_VP_DISPATCH_EVENT_INTERCEPT) + vp->run.flags.intercept_suspend = 1; + } + } while (!vp->run.flags.intercept_suspend); + + return ret; +} + +static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ, + "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ"); + +static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg) +{ + long rc; + + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + rc = mshv_run_vp_with_root_scheduler(vp); + else + rc = mshv_run_vp_with_hyp_scheduler(vp); + + if (rc) + return rc; + + if (copy_to_user(ret_msg, vp->vp_intercept_msg_page, + sizeof(struct hv_message))) + rc = -EFAULT; + + return rc; +} + +static int +mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp, + struct hv_vp_state_data state_data, + unsigned long user_pfn, size_t page_count, + bool is_set) +{ + int completed, ret = 0; + unsigned long check; + struct page **pages; + + if (page_count > INT_MAX) + return -EINVAL; + /* + * Check the arithmetic for wraparound/overflow. 
+ * The last page address in the buffer is: + * (user_pfn + (page_count - 1)) * PAGE_SIZE + */ + if (check_add_overflow(user_pfn, (page_count - 1), &check)) + return -EOVERFLOW; + if (check_mul_overflow(check, PAGE_SIZE, &check)) + return -EOVERFLOW; + + /* Pin user pages so hypervisor can copy directly to them */ + pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + for (completed = 0; completed < page_count; completed += ret) { + unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE; + int remaining = page_count - completed; + + ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE, + &pages[completed]); + if (ret < 0) { + vp_err(vp, "%s: Failed to pin user pages error %i\n", + __func__, ret); + goto unpin_pages; + } + } + + if (is_set) + ret = hv_call_set_vp_state(vp->vp_index, + vp->vp_partition->pt_id, + state_data, page_count, pages, + 0, NULL); + else + ret = hv_call_get_vp_state(vp->vp_index, + vp->vp_partition->pt_id, + state_data, page_count, pages, + NULL); + +unpin_pages: + unpin_user_pages(pages, completed); + kfree(pages); + return ret; +} + +static long +mshv_vp_ioctl_get_set_state(struct mshv_vp *vp, + struct mshv_get_set_vp_state __user *user_args, + bool is_set) +{ + struct mshv_get_set_vp_state args; + long ret = 0; + union hv_output_get_vp_state vp_state; + u32 data_sz; + struct hv_vp_state_data state_data = {}; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) || + !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) || + !PAGE_ALIGNED(args.buf_ptr)) + return -EINVAL; + + if (!access_ok((void __user *)args.buf_ptr, args.buf_sz)) + return -EFAULT; + + switch (args.type) { + case MSHV_VP_STATE_LAPIC: + state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE; + data_sz = HV_HYP_PAGE_SIZE; + break; + case MSHV_VP_STATE_XSAVE: + { + u64 data_sz_64; + + ret = hv_call_get_partition_property(vp->vp_partition->pt_id, + HV_PARTITION_PROPERTY_XSAVE_STATES, + &state_data.xsave.states.as_uint64); + if (ret) + return ret; + + ret = hv_call_get_partition_property(vp->vp_partition->pt_id, + HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE, + &data_sz_64); + if (ret) + return ret; + + data_sz = (u32)data_sz_64; + state_data.xsave.flags = 0; + /* Always request legacy states */ + state_data.xsave.states.legacy_x87 = 1; + state_data.xsave.states.legacy_sse = 1; + state_data.type = HV_GET_SET_VP_STATE_XSAVE; + break; + } + case MSHV_VP_STATE_SIMP: + state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE; + data_sz = HV_HYP_PAGE_SIZE; + break; + case MSHV_VP_STATE_SIEFP: + state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE; + data_sz = HV_HYP_PAGE_SIZE; + break; + case MSHV_VP_STATE_SYNTHETIC_TIMERS: + state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS; + data_sz = sizeof(vp_state.synthetic_timers_state); + break; + default: + return -EINVAL; + } + + if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz))) + return -EFAULT; + + if (data_sz > args.buf_sz) + return -EINVAL; + + /* If the data is transmitted via pfns, delegate to helper */ + if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) { + unsigned long user_pfn = PFN_DOWN(args.buf_ptr); + size_t page_count = PFN_DOWN(args.buf_sz); + + return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn, + page_count, is_set); + } + + /* Paranoia check - this shouldn't happen! 
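+	 * data_sz for the remaining (non-PFN) state types must fit in the
+	 * on-stack vp_state union used as the copy buffer below.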
*/ + if (data_sz > sizeof(vp_state)) { + vp_err(vp, "Invalid vp state data size!\n"); + return -EINVAL; + } + + if (is_set) { + if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz)) + return -EFAULT; + + return hv_call_set_vp_state(vp->vp_index, + vp->vp_partition->pt_id, + state_data, 0, NULL, + sizeof(vp_state), (u8 *)&vp_state); + } + + ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id, + state_data, 0, NULL, &vp_state); + if (ret) + return ret; + + if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz)) + return -EFAULT; + + return 0; +} + +static long +mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct mshv_vp *vp = filp->private_data; + long r = -ENOTTY; + + if (mutex_lock_killable(&vp->vp_mutex)) + return -EINTR; + + switch (ioctl) { + case MSHV_RUN_VP: + r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg); + break; + case MSHV_GET_VP_STATE: + r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false); + break; + case MSHV_SET_VP_STATE: + r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true); + break; + case MSHV_ROOT_HVCALL: + r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false, + (void __user *)arg); + break; + default: + vp_warn(vp, "Invalid ioctl: %#x\n", ioctl); + break; + } + mutex_unlock(&vp->vp_mutex); + + return r; +} + +static vm_fault_t mshv_vp_fault(struct vm_fault *vmf) +{ + struct mshv_vp *vp = vmf->vma->vm_file->private_data; + + switch (vmf->vma->vm_pgoff) { + case MSHV_VP_MMAP_OFFSET_REGISTERS: + vmf->page = virt_to_page(vp->vp_register_page); + break; + case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: + vmf->page = virt_to_page(vp->vp_intercept_msg_page); + break; + case MSHV_VP_MMAP_OFFSET_GHCB: + vmf->page = virt_to_page(vp->vp_ghcb_page); + break; + default: + return VM_FAULT_SIGBUS; + } + + get_page(vmf->page); + + return 0; +} + +static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct mshv_vp *vp = file->private_data; + + switch (vma->vm_pgoff) { + case MSHV_VP_MMAP_OFFSET_REGISTERS: + if (!vp->vp_register_page) + return -ENODEV; + break; + case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: + if (!vp->vp_intercept_msg_page) + return -ENODEV; + break; + case MSHV_VP_MMAP_OFFSET_GHCB: + if (!vp->vp_ghcb_page) + return -ENODEV; + break; + default: + return -EINVAL; + } + + vma->vm_ops = &mshv_vp_vm_ops; + return 0; +} + +static int +mshv_vp_release(struct inode *inode, struct file *filp) +{ + struct mshv_vp *vp = filp->private_data; + + /* Rest of VP cleanup happens in destroy_partition() */ + mshv_partition_put(vp->vp_partition); + return 0; +} + +static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index) +{ + union hv_stats_object_identity identity = { + .vp.partition_id = partition_id, + .vp.vp_index = vp_index, + }; + + identity.vp.stats_area_type = HV_STATS_AREA_SELF; + hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); + + identity.vp.stats_area_type = HV_STATS_AREA_PARENT; + hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); +} + +static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, + void *stats_pages[]) +{ + union hv_stats_object_identity identity = { + .vp.partition_id = partition_id, + .vp.vp_index = vp_index, + }; + int err; + + identity.vp.stats_area_type = HV_STATS_AREA_SELF; + err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, + &stats_pages[HV_STATS_AREA_SELF]); + if (err) + return err; + + identity.vp.stats_area_type = HV_STATS_AREA_PARENT; + err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, + 
&stats_pages[HV_STATS_AREA_PARENT]); + if (err) + goto unmap_self; + + return 0; + +unmap_self: + identity.vp.stats_area_type = HV_STATS_AREA_SELF; + hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); + return err; +} + +static long +mshv_partition_ioctl_create_vp(struct mshv_partition *partition, + void __user *arg) +{ + struct mshv_create_vp args; + struct mshv_vp *vp; + struct page *intercept_message_page, *register_page, *ghcb_page; + void *stats_pages[2]; + long ret; + + if (copy_from_user(&args, arg, sizeof(args))) + return -EFAULT; + + if (args.vp_index >= MSHV_MAX_VPS) + return -EINVAL; + + if (partition->pt_vp_array[args.vp_index]) + return -EEXIST; + + ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index, + 0 /* Only valid for root partition VPs */); + if (ret) + return ret; + + ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + input_vtl_zero, + &intercept_message_page); + if (ret) + goto destroy_vp; + + if (!mshv_partition_encrypted(partition)) { + ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_REGISTERS, + input_vtl_zero, + ®ister_page); + if (ret) + goto unmap_intercept_message_page; + } + + if (mshv_partition_encrypted(partition) && + is_ghcb_mapping_available()) { + ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_GHCB, + input_vtl_normal, + &ghcb_page); + if (ret) + goto unmap_register_page; + } + + if (hv_parent_partition()) { + ret = mshv_vp_stats_map(partition->pt_id, args.vp_index, + stats_pages); + if (ret) + goto unmap_ghcb_page; + } + + vp = kzalloc(sizeof(*vp), GFP_KERNEL); + if (!vp) + goto unmap_stats_pages; + + vp->vp_partition = mshv_partition_get(partition); + if (!vp->vp_partition) { + ret = -EBADF; + goto free_vp; + } + + mutex_init(&vp->vp_mutex); + init_waitqueue_head(&vp->run.vp_suspend_queue); + atomic64_set(&vp->run.vp_signaled_count, 0); + + vp->vp_index = args.vp_index; + vp->vp_intercept_msg_page = page_to_virt(intercept_message_page); + if (!mshv_partition_encrypted(partition)) + vp->vp_register_page = page_to_virt(register_page); + + if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) + vp->vp_ghcb_page = page_to_virt(ghcb_page); + + if (hv_parent_partition()) + memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); + + /* + * Keep anon_inode_getfd last: it installs fd in the file struct and + * thus makes the state accessible in user space. 
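+	 * No failure paths may follow a successful anon_inode_getfd(), since
+	 * user space may already be using the fd by then.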
+ */ + ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp, + O_RDWR | O_CLOEXEC); + if (ret < 0) + goto put_partition; + + /* already exclusive with the partition mutex for all ioctls */ + partition->pt_vp_count++; + partition->pt_vp_array[args.vp_index] = vp; + + return ret; + +put_partition: + mshv_partition_put(partition); +free_vp: + kfree(vp); +unmap_stats_pages: + if (hv_parent_partition()) + mshv_vp_stats_unmap(partition->pt_id, args.vp_index); +unmap_ghcb_page: + if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) { + hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_GHCB, + input_vtl_normal); + } +unmap_register_page: + if (!mshv_partition_encrypted(partition)) { + hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_REGISTERS, + input_vtl_zero); + } +unmap_intercept_message_page: + hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + input_vtl_zero); +destroy_vp: + hv_call_delete_vp(partition->pt_id, args.vp_index); + return ret; +} + +static int mshv_init_async_handler(struct mshv_partition *partition) +{ + if (completion_done(&partition->async_hypercall)) { + pt_err(partition, + "Cannot issue async hypercall while another one in progress!\n"); + return -EPERM; + } + + reinit_completion(&partition->async_hypercall); + return 0; +} + +static void mshv_async_hvcall_handler(void *data, u64 *status) +{ + struct mshv_partition *partition = data; + + wait_for_completion(&partition->async_hypercall); + pt_dbg(partition, "Async hypercall completed!\n"); + + *status = partition->async_hypercall_status; +} + +static int +mshv_partition_region_share(struct mshv_mem_region *region) +{ + u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED; + + if (region->flags.large_pages) + flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; + + return hv_call_modify_spa_host_access(region->partition->pt_id, + region->pages, region->nr_pages, + HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE, + flags, true); +} + +static int +mshv_partition_region_unshare(struct mshv_mem_region *region) +{ + u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE; + + if (region->flags.large_pages) + flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; + + return hv_call_modify_spa_host_access(region->partition->pt_id, + region->pages, region->nr_pages, + 0, + flags, false); +} + +static int +mshv_region_remap_pages(struct mshv_mem_region *region, u32 map_flags, + u64 page_offset, u64 page_count) +{ + if (page_offset + page_count > region->nr_pages) + return -EINVAL; + + if (region->flags.large_pages) + map_flags |= HV_MAP_GPA_LARGE_PAGE; + + /* ask the hypervisor to map guest ram */ + return hv_call_map_gpa_pages(region->partition->pt_id, + region->start_gfn + page_offset, + page_count, map_flags, + region->pages + page_offset); +} + +static int +mshv_region_map(struct mshv_mem_region *region) +{ + u32 map_flags = region->hv_map_flags; + + return mshv_region_remap_pages(region, map_flags, + 0, region->nr_pages); +} + +static void +mshv_region_evict_pages(struct mshv_mem_region *region, + u64 page_offset, u64 page_count) +{ + if (region->flags.range_pinned) + unpin_user_pages(region->pages + page_offset, page_count); + + memset(region->pages + page_offset, 0, + page_count * sizeof(struct page *)); +} + +static void +mshv_region_evict(struct mshv_mem_region *region) +{ + mshv_region_evict_pages(region, 0, region->nr_pages); +} + +static int +mshv_region_populate_pages(struct mshv_mem_region *region, + 
u64 page_offset, u64 page_count) +{ + u64 done_count, nr_pages; + struct page **pages; + __u64 userspace_addr; + int ret; + + if (page_offset + page_count > region->nr_pages) + return -EINVAL; + + for (done_count = 0; done_count < page_count; done_count += ret) { + pages = region->pages + page_offset + done_count; + userspace_addr = region->start_uaddr + + (page_offset + done_count) * + HV_HYP_PAGE_SIZE; + nr_pages = min(page_count - done_count, + MSHV_PIN_PAGES_BATCH_SIZE); + + /* + * Pinning assuming 4k pages works for large pages too. + * All page structs within the large page are returned. + * + * Pin requests are batched because pin_user_pages_fast + * with the FOLL_LONGTERM flag does a large temporary + * allocation of contiguous memory. + */ + if (region->flags.range_pinned) + ret = pin_user_pages_fast(userspace_addr, + nr_pages, + FOLL_WRITE | FOLL_LONGTERM, + pages); + else + ret = -EOPNOTSUPP; + + if (ret < 0) + goto release_pages; + } + + if (PageHuge(region->pages[page_offset])) + region->flags.large_pages = true; + + return 0; + +release_pages: + mshv_region_evict_pages(region, page_offset, done_count); + return ret; +} + +static int +mshv_region_populate(struct mshv_mem_region *region) +{ + return mshv_region_populate_pages(region, 0, region->nr_pages); +} + +static struct mshv_mem_region * +mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) +{ + struct mshv_mem_region *region; + + hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { + if (gfn >= region->start_gfn && + gfn < region->start_gfn + region->nr_pages) + return region; + } + + return NULL; +} + +static struct mshv_mem_region * +mshv_partition_region_by_uaddr(struct mshv_partition *partition, u64 uaddr) +{ + struct mshv_mem_region *region; + + hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { + if (uaddr >= region->start_uaddr && + uaddr < region->start_uaddr + + (region->nr_pages << HV_HYP_PAGE_SHIFT)) + return region; + } + + return NULL; +} + +/* + * NB: caller checks and makes sure mem->size is page aligned + * Returns: 0 with regionpp updated on success, or -errno + */ +static int mshv_partition_create_region(struct mshv_partition *partition, + struct mshv_user_mem_region *mem, + struct mshv_mem_region **regionpp, + bool is_mmio) +{ + struct mshv_mem_region *region; + u64 nr_pages = HVPFN_DOWN(mem->size); + + /* Reject overlapping regions */ + if (mshv_partition_region_by_gfn(partition, mem->guest_pfn) || + mshv_partition_region_by_gfn(partition, mem->guest_pfn + nr_pages - 1) || + mshv_partition_region_by_uaddr(partition, mem->userspace_addr) || + mshv_partition_region_by_uaddr(partition, mem->userspace_addr + mem->size - 1)) + return -EEXIST; + + region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages); + if (!region) + return -ENOMEM; + + region->nr_pages = nr_pages; + region->start_gfn = mem->guest_pfn; + region->start_uaddr = mem->userspace_addr; + region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE; + if (mem->flags & BIT(MSHV_SET_MEM_BIT_WRITABLE)) + region->hv_map_flags |= HV_MAP_GPA_WRITABLE; + if (mem->flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) + region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; + + /* Note: large_pages flag populated when we pin the pages */ + if (!is_mmio) + region->flags.range_pinned = true; + + region->partition = partition; + + *regionpp = region; + + return 0; +} + +/* + * Map guest ram. 
if snp, make sure to release that from the host first + * Side Effects: In case of failure, pages are unpinned when feasible. + */ +static int +mshv_partition_mem_region_map(struct mshv_mem_region *region) +{ + struct mshv_partition *partition = region->partition; + int ret; + + ret = mshv_region_populate(region); + if (ret) { + pt_err(partition, "Failed to populate memory region: %d\n", + ret); + goto err_out; + } + + /* + * For an SNP partition it is a requirement that for every memory region + * that we are going to map for this partition we should make sure that + * host access to that region is released. This is ensured by doing an + * additional hypercall which will update the SLAT to release host + * access to guest memory regions. + */ + if (mshv_partition_encrypted(partition)) { + ret = mshv_partition_region_unshare(region); + if (ret) { + pt_err(partition, + "Failed to unshare memory region (guest_pfn: %llu): %d\n", + region->start_gfn, ret); + goto evict_region; + } + } + + ret = mshv_region_map(region); + if (ret && mshv_partition_encrypted(partition)) { + int shrc; + + shrc = mshv_partition_region_share(region); + if (!shrc) + goto evict_region; + + pt_err(partition, + "Failed to share memory region (guest_pfn: %llu): %d\n", + region->start_gfn, shrc); + /* + * Don't unpin if marking shared failed because pages are no + * longer mapped in the host, ie root, anymore. + */ + goto err_out; + } + + return 0; + +evict_region: + mshv_region_evict(region); +err_out: + return ret; +} + +/* + * This maps two things: guest RAM and for pci passthru mmio space. + * + * mmio: + * - vfio overloads vm_pgoff to store the mmio start pfn/spa. + * - Two things need to happen for mapping mmio range: + * 1. mapped in the uaddr so VMM can access it. + * 2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it. + * + * This function takes care of the second. The first one is managed by vfio, + * and hence is taken care of via vfio_pci_mmap_fault(). + */ +static long +mshv_map_user_memory(struct mshv_partition *partition, + struct mshv_user_mem_region mem) +{ + struct mshv_mem_region *region; + struct vm_area_struct *vma; + bool is_mmio; + ulong mmio_pfn; + long ret; + + if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) || + !access_ok((const void *)mem.userspace_addr, mem.size)) + return -EINVAL; + + mmap_read_lock(current->mm); + vma = vma_lookup(current->mm, mem.userspace_addr); + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0; + mmio_pfn = is_mmio ? 
vma->vm_pgoff : 0; + mmap_read_unlock(current->mm); + + if (!vma) + return -EINVAL; + + ret = mshv_partition_create_region(partition, &mem, ®ion, + is_mmio); + if (ret) + return ret; + + if (is_mmio) + ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn, + mmio_pfn, HVPFN_DOWN(mem.size)); + else + ret = mshv_partition_mem_region_map(region); + + if (ret) + goto errout; + + /* Install the new region */ + hlist_add_head(®ion->hnode, &partition->pt_mem_regions); + + return 0; + +errout: + vfree(region); + return ret; +} + +/* Called for unmapping both the guest ram and the mmio space */ +static long +mshv_unmap_user_memory(struct mshv_partition *partition, + struct mshv_user_mem_region mem) +{ + struct mshv_mem_region *region; + u32 unmap_flags = 0; + + if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))) + return -EINVAL; + + region = mshv_partition_region_by_gfn(partition, mem.guest_pfn); + if (!region) + return -EINVAL; + + /* Paranoia check */ + if (region->start_uaddr != mem.userspace_addr || + region->start_gfn != mem.guest_pfn || + region->nr_pages != HVPFN_DOWN(mem.size)) + return -EINVAL; + + hlist_del(®ion->hnode); + + if (region->flags.large_pages) + unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; + + /* ignore unmap failures and continue as process may be exiting */ + hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn, + region->nr_pages, unmap_flags); + + mshv_region_evict(region); + + vfree(region); + return 0; +} + +static long +mshv_partition_ioctl_set_memory(struct mshv_partition *partition, + struct mshv_user_mem_region __user *user_mem) +{ + struct mshv_user_mem_region mem; + + if (copy_from_user(&mem, user_mem, sizeof(mem))) + return -EFAULT; + + if (!mem.size || + !PAGE_ALIGNED(mem.size) || + !PAGE_ALIGNED(mem.userspace_addr) || + (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) || + mshv_field_nonzero(mem, rsvd)) + return -EINVAL; + + if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)) + return mshv_unmap_user_memory(partition, mem); + + return mshv_map_user_memory(partition, mem); +} + +static long +mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_user_ioeventfd args; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + return mshv_set_unset_ioeventfd(partition, &args); +} + +static long +mshv_partition_ioctl_irqfd(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_user_irqfd args; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + return mshv_set_unset_irqfd(partition, &args); +} + +static long +mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_gpap_access_bitmap args; + union hv_gpa_page_access_state *states; + long ret, i; + union hv_gpa_page_access_state_flags hv_flags = {}; + u8 hv_type_mask; + ulong bitmap_buf_sz, states_buf_sz; + int written = 0; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT || + args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT || + mshv_field_nonzero(args, rsvd) || !args.page_count || + !args.bitmap_ptr) + return -EINVAL; + + if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz)) + return -E2BIG; + + /* Num bytes needed to store bitmap; one bit per page rounded up */ + bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8); + + /* Sanity check */ + if (bitmap_buf_sz > states_buf_sz) + return -EBADFD; + + switch (args.access_type) { + case 
MSHV_GPAP_ACCESS_TYPE_ACCESSED: + hv_type_mask = 1; + if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { + hv_flags.clear_accessed = 1; + /* not accessed implies not dirty */ + hv_flags.clear_dirty = 1; + } else { /* MSHV_GPAP_ACCESS_OP_SET */ + hv_flags.set_accessed = 1; + } + break; + case MSHV_GPAP_ACCESS_TYPE_DIRTY: + hv_type_mask = 2; + if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { + hv_flags.clear_dirty = 1; + } else { /* MSHV_GPAP_ACCESS_OP_SET */ + hv_flags.set_dirty = 1; + /* dirty implies accessed */ + hv_flags.set_accessed = 1; + } + break; + } + + states = vzalloc(states_buf_sz); + if (!states) + return -ENOMEM; + + ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count, + args.gpap_base, hv_flags, &written, + states); + if (ret) + goto free_return; + + /* + * Overwrite states buffer with bitmap - the bits in hv_type_mask + * correspond to bitfields in hv_gpa_page_access_state + */ + for (i = 0; i < written; ++i) + __assign_bit(i, (ulong *)states, + states[i].as_uint8 & hv_type_mask); + + /* zero the unused bits in the last byte(s) of the returned bitmap */ + for (i = written; i < bitmap_buf_sz * 8; ++i) + __clear_bit(i, (ulong *)states); + + if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz)) + ret = -EFAULT; + +free_return: + vfree(states); + return ret; +} + +static long +mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_user_irq_entry *entries = NULL; + struct mshv_user_irq_table args; + long ret; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.nr > MSHV_MAX_GUEST_IRQS || + mshv_field_nonzero(args, rsvd)) + return -EINVAL; + + if (args.nr) { + struct mshv_user_irq_table __user *urouting = user_args; + + entries = vmemdup_user(urouting->entries, + array_size(sizeof(*entries), + args.nr)); + if (IS_ERR(entries)) + return PTR_ERR(entries); + } + ret = mshv_update_routing_table(partition, entries, args.nr); + kvfree(entries); + + return ret; +} + +static long +mshv_partition_ioctl_initialize(struct mshv_partition *partition) +{ + long ret; + + if (partition->pt_initialized) + return 0; + + ret = hv_call_initialize_partition(partition->pt_id); + if (ret) + goto withdraw_mem; + + partition->pt_initialized = true; + + return 0; + +withdraw_mem: + hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); + + return ret; +} + +static long +mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct mshv_partition *partition = filp->private_data; + long ret; + void __user *uarg = (void __user *)arg; + + if (mutex_lock_killable(&partition->pt_mutex)) + return -EINTR; + + switch (ioctl) { + case MSHV_INITIALIZE_PARTITION: + ret = mshv_partition_ioctl_initialize(partition); + break; + case MSHV_SET_GUEST_MEMORY: + ret = mshv_partition_ioctl_set_memory(partition, uarg); + break; + case MSHV_CREATE_VP: + ret = mshv_partition_ioctl_create_vp(partition, uarg); + break; + case MSHV_IRQFD: + ret = mshv_partition_ioctl_irqfd(partition, uarg); + break; + case MSHV_IOEVENTFD: + ret = mshv_partition_ioctl_ioeventfd(partition, uarg); + break; + case MSHV_SET_MSI_ROUTING: + ret = mshv_partition_ioctl_set_msi_routing(partition, uarg); + break; + case MSHV_GET_GPAP_ACCESS_BITMAP: + ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition, + uarg); + break; + case MSHV_ROOT_HVCALL: + ret = mshv_ioctl_passthru_hvcall(partition, true, uarg); + break; + default: + ret = -ENOTTY; + } + + mutex_unlock(&partition->pt_mutex); 
+ return ret; +} + +static int +disable_vp_dispatch(struct mshv_vp *vp) +{ + int ret; + struct hv_register_assoc dispatch_suspend = { + .name = HV_REGISTER_DISPATCH_SUSPEND, + .value.dispatch_suspend.suspended = 1, + }; + + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &dispatch_suspend); + if (ret) + vp_err(vp, "failed to suspend\n"); + + return ret; +} + +static int +get_vp_signaled_count(struct mshv_vp *vp, u64 *count) +{ + int ret; + struct hv_register_assoc root_signal_count = { + .name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT, + }; + + ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &root_signal_count); + + if (ret) { + vp_err(vp, "Failed to get root signal count"); + *count = 0; + return ret; + } + + *count = root_signal_count.value.reg64; + + return ret; +} + +static void +drain_vp_signals(struct mshv_vp *vp) +{ + u64 hv_signal_count; + u64 vp_signal_count; + + get_vp_signaled_count(vp, &hv_signal_count); + + vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); + + /* + * There should be at most 1 outstanding notification, but be extra + * careful anyway. + */ + while (hv_signal_count != vp_signal_count) { + WARN_ON(hv_signal_count - vp_signal_count != 1); + + if (wait_event_interruptible(vp->run.vp_suspend_queue, + vp->run.kicked_by_hv == 1)) + break; + vp->run.kicked_by_hv = 0; + vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); + } +} + +static void drain_all_vps(const struct mshv_partition *partition) +{ + int i; + struct mshv_vp *vp; + + /* + * VPs are reachable from ISR. It is safe to not take the partition + * lock because nobody else can enter this function and drop the + * partition from the list. + */ + for (i = 0; i < MSHV_MAX_VPS; i++) { + vp = partition->pt_vp_array[i]; + if (!vp) + continue; + /* + * Disable dispatching of the VP in the hypervisor. After this + * the hypervisor guarantees it won't generate any signals for + * the VP and the hypervisor's VP signal count won't change. + */ + disable_vp_dispatch(vp); + drain_vp_signals(vp); + } +} + +static void +remove_partition(struct mshv_partition *partition) +{ + spin_lock(&mshv_root.pt_ht_lock); + hlist_del_rcu(&partition->pt_hnode); + spin_unlock(&mshv_root.pt_ht_lock); + + synchronize_rcu(); +} + +/* + * Tear down a partition and remove it from the list. + * Partition's refcount must be 0 + */ +static void destroy_partition(struct mshv_partition *partition) +{ + struct mshv_vp *vp; + struct mshv_mem_region *region; + int i, ret; + struct hlist_node *n; + + if (refcount_read(&partition->pt_ref_count)) { + pt_err(partition, + "Attempt to destroy partition but refcount > 0\n"); + return; + } + + if (partition->pt_initialized) { + /* + * We only need to drain signals for root scheduler. This should be + * done before removing the partition from the partition list. 
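+		 * Draining relies on mshv_isr() still being able to look up the
+		 * partition and kick its VPs, which requires the partition to
+		 * remain on the partition list.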
+ */ + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + drain_all_vps(partition); + + /* Remove vps */ + for (i = 0; i < MSHV_MAX_VPS; ++i) { + vp = partition->pt_vp_array[i]; + if (!vp) + continue; + + if (hv_parent_partition()) + mshv_vp_stats_unmap(partition->pt_id, vp->vp_index); + + if (vp->vp_register_page) { + (void)hv_call_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_REGISTERS, + input_vtl_zero); + vp->vp_register_page = NULL; + } + + (void)hv_call_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + input_vtl_zero); + vp->vp_intercept_msg_page = NULL; + + if (vp->vp_ghcb_page) { + (void)hv_call_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_GHCB, + input_vtl_normal); + vp->vp_ghcb_page = NULL; + } + + kfree(vp); + + partition->pt_vp_array[i] = NULL; + } + + /* Deallocates and unmaps everything including vcpus, GPA mappings etc */ + hv_call_finalize_partition(partition->pt_id); + + partition->pt_initialized = false; + } + + remove_partition(partition); + + /* Remove regions, regain access to the memory and unpin the pages */ + hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions, + hnode) { + hlist_del(®ion->hnode); + + if (mshv_partition_encrypted(partition)) { + ret = mshv_partition_region_share(region); + if (ret) { + pt_err(partition, + "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n", + ret); + return; + } + } + + mshv_region_evict(region); + + vfree(region); + } + + /* Withdraw and free all pages we deposited */ + hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); + hv_call_delete_partition(partition->pt_id); + + mshv_free_routing_table(partition); + kfree(partition); +} + +struct +mshv_partition *mshv_partition_get(struct mshv_partition *partition) +{ + if (refcount_inc_not_zero(&partition->pt_ref_count)) + return partition; + return NULL; +} + +struct +mshv_partition *mshv_partition_find(u64 partition_id) + __must_hold(RCU) +{ + struct mshv_partition *p; + + hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode, + partition_id) + if (p->pt_id == partition_id) + return p; + + return NULL; +} + +void +mshv_partition_put(struct mshv_partition *partition) +{ + if (refcount_dec_and_test(&partition->pt_ref_count)) + destroy_partition(partition); +} + +static int +mshv_partition_release(struct inode *inode, struct file *filp) +{ + struct mshv_partition *partition = filp->private_data; + + mshv_eventfd_release(partition); + + cleanup_srcu_struct(&partition->pt_irq_srcu); + + mshv_partition_put(partition); + + return 0; +} + +static int +add_partition(struct mshv_partition *partition) +{ + spin_lock(&mshv_root.pt_ht_lock); + + hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode, + partition->pt_id); + + spin_unlock(&mshv_root.pt_ht_lock); + + return 0; +} + +static long +mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) +{ + struct mshv_create_partition args; + u64 creation_flags; + struct hv_partition_creation_properties creation_properties = {}; + union hv_partition_isolation_properties isolation_properties = {}; + struct mshv_partition *partition; + struct file *file; + int fd; + long ret; + + if (copy_from_user(&args, user_arg, sizeof(args))) + return -EFAULT; + + if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || + args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) + return -EINVAL; + + /* Only support EXO partitions */ + creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | + 
HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; + + if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC)) + creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; + if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC)) + creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; + if (args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES)) + creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; + + switch (args.pt_isolation) { + case MSHV_PT_ISOLATION_NONE: + isolation_properties.isolation_type = + HV_PARTITION_ISOLATION_TYPE_NONE; + break; + } + + partition = kzalloc(sizeof(*partition), GFP_KERNEL); + if (!partition) + return -ENOMEM; + + partition->pt_module_dev = module_dev; + partition->isolation_type = isolation_properties.isolation_type; + + refcount_set(&partition->pt_ref_count, 1); + + mutex_init(&partition->pt_mutex); + + mutex_init(&partition->pt_irq_lock); + + init_completion(&partition->async_hypercall); + + INIT_HLIST_HEAD(&partition->irq_ack_notifier_list); + + INIT_HLIST_HEAD(&partition->pt_devices); + + INIT_HLIST_HEAD(&partition->pt_mem_regions); + + mshv_eventfd_init(partition); + + ret = init_srcu_struct(&partition->pt_irq_srcu); + if (ret) + goto free_partition; + + ret = hv_call_create_partition(creation_flags, + creation_properties, + isolation_properties, + &partition->pt_id); + if (ret) + goto cleanup_irq_srcu; + + ret = add_partition(partition); + if (ret) + goto delete_partition; + + ret = mshv_init_async_handler(partition); + if (ret) + goto remove_partition; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + ret = fd; + goto remove_partition; + } + + file = anon_inode_getfile("mshv_partition", &mshv_partition_fops, + partition, O_RDWR); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto put_fd; + } + + fd_install(fd, file); + + return fd; + +put_fd: + put_unused_fd(fd); +remove_partition: + remove_partition(partition); +delete_partition: + hv_call_delete_partition(partition->pt_id); +cleanup_irq_srcu: + cleanup_srcu_struct(&partition->pt_irq_srcu); +free_partition: + kfree(partition); + + return ret; +} + +static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct miscdevice *misc = filp->private_data; + + switch (ioctl) { + case MSHV_CREATE_PARTITION: + return mshv_ioctl_create_partition((void __user *)arg, + misc->this_device); + } + + return -ENOTTY; +} + +static int +mshv_dev_open(struct inode *inode, struct file *filp) +{ + return 0; +} + +static int +mshv_dev_release(struct inode *inode, struct file *filp) +{ + return 0; +} + +static int mshv_cpuhp_online; +static int mshv_root_sched_online; + +static const char *scheduler_type_to_string(enum hv_scheduler_type type) +{ + switch (type) { + case HV_SCHEDULER_TYPE_LP: + return "classic scheduler without SMT"; + case HV_SCHEDULER_TYPE_LP_SMT: + return "classic scheduler with SMT"; + case HV_SCHEDULER_TYPE_CORE_SMT: + return "core scheduler"; + case HV_SCHEDULER_TYPE_ROOT: + return "root scheduler"; + default: + return "unknown scheduler"; + }; +} + +/* TODO move this to hv_common.c when needed outside */ +static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out) +{ + struct hv_input_get_system_property *input; + struct hv_output_get_system_property *output; + unsigned long flags; + u64 status; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + memset(output, 0, sizeof(*output)); + input->property_id = 
HV_SYSTEM_PROPERTY_SCHEDULER_TYPE; + + status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output); + if (!hv_result_success(status)) { + local_irq_restore(flags); + pr_err("%s: %s\n", __func__, hv_result_to_string(status)); + return hv_result_to_errno(status); + } + + *out = output->scheduler_type; + local_irq_restore(flags); + + return 0; +} + +/* Retrieve and stash the supported scheduler type */ +static int __init mshv_retrieve_scheduler_type(struct device *dev) +{ + int ret; + + ret = hv_retrieve_scheduler_type(&hv_scheduler_type); + if (ret) + return ret; + + dev_info(dev, "Hypervisor using %s\n", + scheduler_type_to_string(hv_scheduler_type)); + + switch (hv_scheduler_type) { + case HV_SCHEDULER_TYPE_CORE_SMT: + case HV_SCHEDULER_TYPE_LP_SMT: + case HV_SCHEDULER_TYPE_ROOT: + case HV_SCHEDULER_TYPE_LP: + /* Supported scheduler, nothing to do */ + break; + default: + dev_err(dev, "unsupported scheduler 0x%x, bailing.\n", + hv_scheduler_type); + return -EOPNOTSUPP; + } + + return 0; +} + +static int mshv_root_scheduler_init(unsigned int cpu) +{ + void **inputarg, **outputarg, *p; + + inputarg = (void **)this_cpu_ptr(root_scheduler_input); + outputarg = (void **)this_cpu_ptr(root_scheduler_output); + + /* Allocate two consecutive pages. One for input, one for output. */ + p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL); + if (!p) + return -ENOMEM; + + *inputarg = p; + *outputarg = (char *)p + HV_HYP_PAGE_SIZE; + + return 0; +} + +static int mshv_root_scheduler_cleanup(unsigned int cpu) +{ + void *p, **inputarg, **outputarg; + + inputarg = (void **)this_cpu_ptr(root_scheduler_input); + outputarg = (void **)this_cpu_ptr(root_scheduler_output); + + p = *inputarg; + + *inputarg = NULL; + *outputarg = NULL; + + kfree(p); + + return 0; +} + +/* Must be called after retrieving the scheduler type */ +static int +root_scheduler_init(struct device *dev) +{ + int ret; + + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) + return 0; + + root_scheduler_input = alloc_percpu(void *); + root_scheduler_output = alloc_percpu(void *); + + if (!root_scheduler_input || !root_scheduler_output) { + dev_err(dev, "Failed to allocate root scheduler buffers\n"); + ret = -ENOMEM; + goto out; + } + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched", + mshv_root_scheduler_init, + mshv_root_scheduler_cleanup); + + if (ret < 0) { + dev_err(dev, "Failed to setup root scheduler state: %i\n", ret); + goto out; + } + + mshv_root_sched_online = ret; + + return 0; + +out: + free_percpu(root_scheduler_input); + free_percpu(root_scheduler_output); + return ret; +} + +static void +root_scheduler_deinit(void) +{ + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) + return; + + cpuhp_remove_state(mshv_root_sched_online); + free_percpu(root_scheduler_input); + free_percpu(root_scheduler_output); +} + +static int mshv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + cpuhp_remove_state(mshv_cpuhp_online); + return 0; +} + +struct notifier_block mshv_reboot_nb = { + .notifier_call = mshv_reboot_notify, +}; + +static void mshv_root_partition_exit(void) +{ + unregister_reboot_notifier(&mshv_reboot_nb); + root_scheduler_deinit(); +} + +static int __init mshv_root_partition_init(struct device *dev) +{ + int err; + + if (mshv_retrieve_scheduler_type(dev)) + return -ENODEV; + + err = root_scheduler_init(dev); + if (err) + return err; + + err = register_reboot_notifier(&mshv_reboot_nb); + if (err) + goto root_sched_deinit; + + return 0; + +root_sched_deinit: + root_scheduler_deinit(); + 
return err; +} + +static int __init mshv_parent_partition_init(void) +{ + int ret; + struct device *dev; + union hv_hypervisor_version_info version_info; + + if (!hv_root_partition() || is_kdump_kernel()) + return -ENODEV; + + if (hv_get_hypervisor_version(&version_info)) + return -ENODEV; + + ret = misc_register(&mshv_dev); + if (ret) + return ret; + + dev = mshv_dev.this_device; + + if (version_info.build_number < MSHV_HV_MIN_VERSION || + version_info.build_number > MSHV_HV_MAX_VERSION) { + dev_err(dev, "Running on unvalidated Hyper-V version\n"); + dev_err(dev, "Versions: current: %u min: %u max: %u\n", + version_info.build_number, MSHV_HV_MIN_VERSION, + MSHV_HV_MAX_VERSION); + } + + mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages); + if (!mshv_root.synic_pages) { + dev_err(dev, "Failed to allocate percpu synic page\n"); + ret = -ENOMEM; + goto device_deregister; + } + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic", + mshv_synic_init, + mshv_synic_cleanup); + if (ret < 0) { + dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret); + goto free_synic_pages; + } + + mshv_cpuhp_online = ret; + + ret = mshv_root_partition_init(dev); + if (ret) + goto remove_cpu_state; + + ret = mshv_irqfd_wq_init(); + if (ret) + goto exit_partition; + + spin_lock_init(&mshv_root.pt_ht_lock); + hash_init(mshv_root.pt_htable); + + hv_setup_mshv_handler(mshv_isr); + + return 0; + +exit_partition: + if (hv_root_partition()) + mshv_root_partition_exit(); +remove_cpu_state: + cpuhp_remove_state(mshv_cpuhp_online); +free_synic_pages: + free_percpu(mshv_root.synic_pages); +device_deregister: + misc_deregister(&mshv_dev); + return ret; +} + +static void __exit mshv_parent_partition_exit(void) +{ + hv_setup_mshv_handler(NULL); + mshv_port_table_fini(); + misc_deregister(&mshv_dev); + mshv_irqfd_wq_cleanup(); + if (hv_root_partition()) + mshv_root_partition_exit(); + cpuhp_remove_state(mshv_cpuhp_online); + free_percpu(mshv_root.synic_pages); +} + +module_init(mshv_parent_partition_init); +module_exit(mshv_parent_partition_exit); diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c new file mode 100644 index 000000000000..e6b6381b7c36 --- /dev/null +++ b/drivers/hv/mshv_synic.c @@ -0,0 +1,665 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * mshv_root module's main interrupt handler and associated functionality. + * + * Authors: Microsoft Linux virtualization team + */ + +#include +#include +#include +#include +#include +#include + +#include "mshv_eventfd.h" +#include "mshv.h" + +static u32 synic_event_ring_get_queued_port(u32 sint_index) +{ + struct hv_synic_event_ring_page **event_ring_page; + volatile struct hv_synic_event_ring *ring; + struct hv_synic_pages *spages; + u8 **synic_eventring_tail; + u32 message; + u8 tail; + + spages = this_cpu_ptr(mshv_root.synic_pages); + event_ring_page = &spages->synic_event_ring_page; + synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail); + + if (unlikely(!*synic_eventring_tail)) { + pr_debug("Missing synic event ring tail!\n"); + return 0; + } + tail = (*synic_eventring_tail)[sint_index]; + + if (unlikely(!*event_ring_page)) { + pr_debug("Missing synic event ring page!\n"); + return 0; + } + + ring = &(*event_ring_page)->sint_event_ring[sint_index]; + + /* + * Get the message. + */ + message = ring->data[tail]; + + if (!message) { + if (ring->ring_full) { + /* + * Ring is marked full, but we would have consumed all + * the messages. 
Notify the hypervisor that ring is now + * empty and check again. + */ + ring->ring_full = 0; + hv_call_notify_port_ring_empty(sint_index); + message = ring->data[tail]; + } + + if (!message) { + ring->signal_masked = 0; + /* + * Unmask the signal and sync with hypervisor + * before one last check for any message. + */ + mb(); + message = ring->data[tail]; + + /* + * Ok, lets bail out. + */ + if (!message) + return 0; + } + + ring->signal_masked = 1; + } + + /* + * Clear the message in the ring buffer. + */ + ring->data[tail] = 0; + + if (++tail == HV_SYNIC_EVENT_RING_MESSAGE_COUNT) + tail = 0; + + (*synic_eventring_tail)[sint_index] = tail; + + return message; +} + +static bool +mshv_doorbell_isr(struct hv_message *msg) +{ + struct hv_notification_message_payload *notification; + u32 port; + + if (msg->header.message_type != HVMSG_SYNIC_SINT_INTERCEPT) + return false; + + notification = (struct hv_notification_message_payload *)msg->u.payload; + if (notification->sint_index != HV_SYNIC_DOORBELL_SINT_INDEX) + return false; + + while ((port = synic_event_ring_get_queued_port(HV_SYNIC_DOORBELL_SINT_INDEX))) { + struct port_table_info ptinfo = { 0 }; + + if (mshv_portid_lookup(port, &ptinfo)) { + pr_debug("Failed to get port info from port_table!\n"); + continue; + } + + if (ptinfo.hv_port_type != HV_PORT_TYPE_DOORBELL) { + pr_debug("Not a doorbell port!, port: %d, port_type: %d\n", + port, ptinfo.hv_port_type); + continue; + } + + /* Invoke the callback */ + ptinfo.hv_port_doorbell.doorbell_cb(port, + ptinfo.hv_port_doorbell.data); + } + + return true; +} + +static bool mshv_async_call_completion_isr(struct hv_message *msg) +{ + bool handled = false; + struct hv_async_completion_message_payload *async_msg; + struct mshv_partition *partition; + u64 partition_id; + + if (msg->header.message_type != HVMSG_ASYNC_CALL_COMPLETION) + goto out; + + async_msg = + (struct hv_async_completion_message_payload *)msg->u.payload; + + partition_id = async_msg->partition_id; + + /* + * Hold this lock for the rest of the isr, because the partition could + * be released anytime. + * e.g. the MSHV_RUN_VP thread could wake on another cpu; it could + * release the partition unless we hold this! 
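+	 * The rcu_read_lock() below pairs with the synchronize_rcu() in
+	 * remove_partition().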
+ */ + rcu_read_lock(); + + partition = mshv_partition_find(partition_id); + + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", partition_id); + goto unlock_out; + } + + partition->async_hypercall_status = async_msg->status; + complete(&partition->async_hypercall); + + handled = true; + +unlock_out: + rcu_read_unlock(); +out: + return handled; +} + +static void kick_vp(struct mshv_vp *vp) +{ + atomic64_inc(&vp->run.vp_signaled_count); + vp->run.kicked_by_hv = 1; + wake_up(&vp->run.vp_suspend_queue); +} + +static void +handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg) +{ + int bank_idx, vps_signaled = 0, bank_mask_size; + struct mshv_partition *partition; + const struct hv_vpset *vpset; + const u64 *bank_contents; + u64 partition_id = msg->partition_id; + + if (msg->vp_bitset.bitset.format != HV_GENERIC_SET_SPARSE_4K) { + pr_debug("scheduler message format is not HV_GENERIC_SET_SPARSE_4K"); + return; + } + + if (msg->vp_count == 0) { + pr_debug("scheduler message with no VP specified"); + return; + } + + rcu_read_lock(); + + partition = mshv_partition_find(partition_id); + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", partition_id); + goto unlock_out; + } + + vpset = &msg->vp_bitset.bitset; + + bank_idx = -1; + bank_contents = vpset->bank_contents; + bank_mask_size = sizeof(vpset->valid_bank_mask) * BITS_PER_BYTE; + + while (true) { + int vp_bank_idx = -1; + int vp_bank_size = sizeof(*bank_contents) * BITS_PER_BYTE; + int vp_index; + + bank_idx = find_next_bit((unsigned long *)&vpset->valid_bank_mask, + bank_mask_size, bank_idx + 1); + if (bank_idx == bank_mask_size) + break; + + while (true) { + struct mshv_vp *vp; + + vp_bank_idx = find_next_bit((unsigned long *)bank_contents, + vp_bank_size, vp_bank_idx + 1); + if (vp_bank_idx == vp_bank_size) + break; + + vp_index = (bank_idx * vp_bank_size) + vp_bank_idx; + + /* This shouldn't happen, but just in case. */ + if (unlikely(vp_index >= MSHV_MAX_VPS)) { + pr_debug("VP index %u out of bounds\n", + vp_index); + goto unlock_out; + } + + vp = partition->pt_vp_array[vp_index]; + if (unlikely(!vp)) { + pr_debug("failed to find VP %u\n", vp_index); + goto unlock_out; + } + + kick_vp(vp); + vps_signaled++; + } + + bank_contents++; + } + +unlock_out: + rcu_read_unlock(); + + if (vps_signaled != msg->vp_count) + pr_debug("asked to signal %u VPs but only did %u\n", + msg->vp_count, vps_signaled); +} + +static void +handle_pair_message(const struct hv_vp_signal_pair_scheduler_message *msg) +{ + struct mshv_partition *partition = NULL; + struct mshv_vp *vp; + int idx; + + rcu_read_lock(); + + for (idx = 0; idx < msg->vp_count; idx++) { + u64 partition_id = msg->partition_ids[idx]; + u32 vp_index = msg->vp_indexes[idx]; + + if (idx == 0 || partition->pt_id != partition_id) { + partition = mshv_partition_find(partition_id); + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", + partition_id); + break; + } + } + + /* This shouldn't happen, but just in case. 
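+		 * vp_index comes straight from the hypervisor message, so bound
+		 * check it before indexing pt_vp_array.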
*/ + if (unlikely(vp_index >= MSHV_MAX_VPS)) { + pr_debug("VP index %u out of bounds\n", vp_index); + break; + } + + vp = partition->pt_vp_array[vp_index]; + if (!vp) { + pr_debug("failed to find VP %u\n", vp_index); + break; + } + + kick_vp(vp); + } + + rcu_read_unlock(); +} + +static bool +mshv_scheduler_isr(struct hv_message *msg) +{ + if (msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_BITSET && + msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_PAIR) + return false; + + if (msg->header.message_type == HVMSG_SCHEDULER_VP_SIGNAL_BITSET) + handle_bitset_message((struct hv_vp_signal_bitset_scheduler_message *) + msg->u.payload); + else + handle_pair_message((struct hv_vp_signal_pair_scheduler_message *) + msg->u.payload); + + return true; +} + +static bool +mshv_intercept_isr(struct hv_message *msg) +{ + struct mshv_partition *partition; + bool handled = false; + struct mshv_vp *vp; + u64 partition_id; + u32 vp_index; + + partition_id = msg->header.sender; + + rcu_read_lock(); + + partition = mshv_partition_find(partition_id); + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", + partition_id); + goto unlock_out; + } + + if (msg->header.message_type == HVMSG_X64_APIC_EOI) { + /* + * Check if this gsi is registered in the + * ack_notifier list and invoke the callback + * if registered. + */ + + /* + * If there is a notifier, the ack callback is supposed + * to handle the VMEXIT. So we need not pass this message + * to vcpu thread. + */ + struct hv_x64_apic_eoi_message *eoi_msg = + (struct hv_x64_apic_eoi_message *)&msg->u.payload[0]; + + if (mshv_notify_acked_gsi(partition, eoi_msg->interrupt_vector)) { + handled = true; + goto unlock_out; + } + } + + /* + * We should get an opaque intercept message here for all intercept + * messages, since we're using the mapped VP intercept message page. + * + * The intercept message will have been placed in intercept message + * page at this point. + * + * Make sure the message type matches our expectation. + */ + if (msg->header.message_type != HVMSG_OPAQUE_INTERCEPT) { + pr_debug("wrong message type %d", msg->header.message_type); + goto unlock_out; + } + + /* + * Since we directly index the vp, and it has to exist for us to be here + * (because the vp is only deleted when the partition is), no additional + * locking is needed here + */ + vp_index = + ((struct hv_opaque_intercept_message *)msg->u.payload)->vp_index; + vp = partition->pt_vp_array[vp_index]; + if (unlikely(!vp)) { + pr_debug("failed to find VP %u\n", vp_index); + goto unlock_out; + } + + kick_vp(vp); + + handled = true; + +unlock_out: + rcu_read_unlock(); + + return handled; +} + +void mshv_isr(void) +{ + struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); + struct hv_message_page **msg_page = &spages->synic_message_page; + struct hv_message *msg; + bool handled; + + if (unlikely(!(*msg_page))) { + pr_debug("Missing synic page!\n"); + return; + } + + msg = &((*msg_page)->sint_message[HV_SYNIC_INTERCEPTION_SINT_INDEX]); + + /* + * If the type isn't set, there isn't really a message; + * it may be some other hyperv interrupt + */ + if (msg->header.message_type == HVMSG_NONE) + return; + + handled = mshv_doorbell_isr(msg); + + if (!handled) + handled = mshv_scheduler_isr(msg); + + if (!handled) + handled = mshv_async_call_completion_isr(msg); + + if (!handled) + handled = mshv_intercept_isr(msg); + + if (handled) { + /* + * Acknowledge message with hypervisor if another message is + * pending. 
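+		 * Writing HV_MSR_EOM tells the hypervisor the message slot is
+		 * free, so it can deliver the next queued message.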
+ */ + msg->header.message_type = HVMSG_NONE; + /* + * Ensure the write is complete so the hypervisor will deliver + * the next message if available. + */ + mb(); + if (msg->header.message_flags.msg_pending) + hv_set_non_nested_msr(HV_MSR_EOM, 0); + +#ifdef HYPERVISOR_CALLBACK_VECTOR + add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR); +#endif + } else { + pr_warn_once("%s: unknown message type 0x%x\n", __func__, + msg->header.message_type); + } +} + +int mshv_synic_init(unsigned int cpu) +{ + union hv_synic_simp simp; + union hv_synic_siefp siefp; + union hv_synic_sirbp sirbp; +#ifdef HYPERVISOR_CALLBACK_VECTOR + union hv_synic_sint sint; +#endif + union hv_synic_scontrol sctrl; + struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); + struct hv_message_page **msg_page = &spages->synic_message_page; + struct hv_synic_event_flags_page **event_flags_page = + &spages->synic_event_flags_page; + struct hv_synic_event_ring_page **event_ring_page = + &spages->synic_event_ring_page; + + /* Setup the Synic's message page */ + simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP); + simp.simp_enabled = true; + *msg_page = memremap(simp.base_simp_gpa << HV_HYP_PAGE_SHIFT, + HV_HYP_PAGE_SIZE, + MEMREMAP_WB); + + if (!(*msg_page)) + return -EFAULT; + + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); + + /* Setup the Synic's event flags page */ + siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP); + siefp.siefp_enabled = true; + *event_flags_page = memremap(siefp.base_siefp_gpa << PAGE_SHIFT, + PAGE_SIZE, MEMREMAP_WB); + + if (!(*event_flags_page)) + goto cleanup; + + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); + + /* Setup the Synic's event ring page */ + sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP); + sirbp.sirbp_enabled = true; + *event_ring_page = memremap(sirbp.base_sirbp_gpa << PAGE_SHIFT, + PAGE_SIZE, MEMREMAP_WB); + + if (!(*event_ring_page)) + goto cleanup; + + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); + +#ifdef HYPERVISOR_CALLBACK_VECTOR + /* Enable intercepts */ + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = false; + sint.auto_eoi = hv_recommend_using_aeoi(); + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, + sint.as_uint64); + + /* Doorbell SINT */ + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = false; + sint.as_intercept = 1; + sint.auto_eoi = hv_recommend_using_aeoi(); + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX, + sint.as_uint64); +#endif + + /* Enable global synic bit */ + sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL); + sctrl.enable = 1; + hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64); + + return 0; + +cleanup: + if (*event_ring_page) { + sirbp.sirbp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); + memunmap(*event_ring_page); + } + if (*event_flags_page) { + siefp.siefp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); + memunmap(*event_flags_page); + } + if (*msg_page) { + simp.simp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); + memunmap(*msg_page); + } + + return -EFAULT; +} + +int mshv_synic_cleanup(unsigned int cpu) +{ + union hv_synic_sint sint; + union hv_synic_simp simp; + union hv_synic_siefp siefp; + union hv_synic_sirbp sirbp; + union hv_synic_scontrol sctrl; + struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); + struct hv_message_page **msg_page = &spages->synic_message_page; + struct 
hv_synic_event_flags_page **event_flags_page = + &spages->synic_event_flags_page; + struct hv_synic_event_ring_page **event_ring_page = + &spages->synic_event_ring_page; + + /* Disable the interrupt */ + sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX); + sint.masked = true; + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, + sint.as_uint64); + + /* Disable Doorbell SINT */ + sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX); + sint.masked = true; + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX, + sint.as_uint64); + + /* Disable Synic's event ring page */ + sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP); + sirbp.sirbp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); + memunmap(*event_ring_page); + + /* Disable Synic's event flags page */ + siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP); + siefp.siefp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); + memunmap(*event_flags_page); + + /* Disable Synic's message page */ + simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP); + simp.simp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); + memunmap(*msg_page); + + /* Disable global synic bit */ + sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL); + sctrl.enable = 0; + hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64); + + return 0; +} + +int +mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb, void *data, + u64 gpa, u64 val, u64 flags) +{ + struct hv_connection_info connection_info = { 0 }; + union hv_connection_id connection_id = { 0 }; + struct port_table_info *port_table_info; + struct hv_port_info port_info = { 0 }; + union hv_port_id port_id = { 0 }; + int ret; + + port_table_info = kmalloc(sizeof(*port_table_info), GFP_KERNEL); + if (!port_table_info) + return -ENOMEM; + + port_table_info->hv_port_type = HV_PORT_TYPE_DOORBELL; + port_table_info->hv_port_doorbell.doorbell_cb = doorbell_cb; + port_table_info->hv_port_doorbell.data = data; + ret = mshv_portid_alloc(port_table_info); + if (ret < 0) { + kfree(port_table_info); + return ret; + } + + port_id.u.id = ret; + port_info.port_type = HV_PORT_TYPE_DOORBELL; + port_info.doorbell_port_info.target_sint = HV_SYNIC_DOORBELL_SINT_INDEX; + port_info.doorbell_port_info.target_vp = HV_ANY_VP; + ret = hv_call_create_port(hv_current_partition_id, port_id, partition_id, + &port_info, + 0, 0, NUMA_NO_NODE); + + if (ret < 0) { + mshv_portid_free(port_id.u.id); + return ret; + } + + connection_id.u.id = port_id.u.id; + connection_info.port_type = HV_PORT_TYPE_DOORBELL; + connection_info.doorbell_connection_info.gpa = gpa; + connection_info.doorbell_connection_info.trigger_value = val; + connection_info.doorbell_connection_info.flags = flags; + + ret = hv_call_connect_port(hv_current_partition_id, port_id, partition_id, + connection_id, &connection_info, 0, NUMA_NO_NODE); + if (ret < 0) { + hv_call_delete_port(hv_current_partition_id, port_id); + mshv_portid_free(port_id.u.id); + return ret; + } + + // lets use the port_id as the doorbell_id + return port_id.u.id; +} + +void +mshv_unregister_doorbell(u64 partition_id, int doorbell_portid) +{ + union hv_port_id port_id = { 0 }; + union hv_connection_id connection_id = { 0 }; + + connection_id.u.id = doorbell_portid; + hv_call_disconnect_port(partition_id, connection_id); + + port_id.u.id = doorbell_portid; + hv_call_delete_port(hv_current_partition_id, port_id); + + 
mshv_portid_free(doorbell_portid); +} diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h new file mode 100644 index 000000000000..876bfe4e4227 --- /dev/null +++ b/include/uapi/linux/mshv.h @@ -0,0 +1,291 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Userspace interfaces for /dev/mshv* devices and derived fds + * + * This file is divided into sections containing data structures and IOCTLs for + * a particular set of related devices or derived file descriptors. + * + * The IOCTL definitions are at the end of each section. They are grouped by + * device/fd, so that new IOCTLs can easily be added with a monotonically + * increasing number. + */ +#ifndef _UAPI_LINUX_MSHV_H +#define _UAPI_LINUX_MSHV_H + +#include + +#define MSHV_IOCTL 0xB8 + +/* + ******************************************* + * Entry point to main VMM APIs: /dev/mshv * + ******************************************* + */ + +enum { + MSHV_PT_BIT_LAPIC, + MSHV_PT_BIT_X2APIC, + MSHV_PT_BIT_GPA_SUPER_PAGES, + MSHV_PT_BIT_COUNT, +}; + +#define MSHV_PT_FLAGS_MASK ((1 << MSHV_PT_BIT_COUNT) - 1) + +enum { + MSHV_PT_ISOLATION_NONE, + MSHV_PT_ISOLATION_COUNT, +}; + +/** + * struct mshv_create_partition - arguments for MSHV_CREATE_PARTITION + * @pt_flags: Bitmask of 1 << MSHV_PT_BIT_* + * @pt_isolation: MSHV_PT_ISOLATION_* + * + * Returns a file descriptor to act as a handle to a guest partition. + * At this point the partition is not yet initialized in the hypervisor. + * Some operations must be done with the partition in this state, e.g. setting + * so-called "early" partition properties. The partition can then be + * initialized with MSHV_INITIALIZE_PARTITION. + */ +struct mshv_create_partition { + __u64 pt_flags; + __u64 pt_isolation; +}; + +/* /dev/mshv */ +#define MSHV_CREATE_PARTITION _IOW(MSHV_IOCTL, 0x00, struct mshv_create_partition) + +/* + ************************ + * Child partition APIs * + ************************ + */ + +struct mshv_create_vp { + __u32 vp_index; +}; + +enum { + MSHV_SET_MEM_BIT_WRITABLE, + MSHV_SET_MEM_BIT_EXECUTABLE, + MSHV_SET_MEM_BIT_UNMAP, + MSHV_SET_MEM_BIT_COUNT +}; + +#define MSHV_SET_MEM_FLAGS_MASK ((1 << MSHV_SET_MEM_BIT_COUNT) - 1) + +/* The hypervisor's "native" page size */ +#define MSHV_HV_PAGE_SIZE 0x1000 + +/** + * struct mshv_user_mem_region - arguments for MSHV_SET_GUEST_MEMORY + * @size: Size of the memory region (bytes). Must be aligned to + * MSHV_HV_PAGE_SIZE + * @guest_pfn: Base guest page number to map + * @userspace_addr: Base address of userspace memory. Must be aligned to + * MSHV_HV_PAGE_SIZE + * @flags: Bitmask of 1 << MSHV_SET_MEM_BIT_*. If (1 << MSHV_SET_MEM_BIT_UNMAP) + * is set, ignore other bits. + * @rsvd: MBZ + * + * Map or unmap a region of userspace memory to Guest Physical Addresses (GPA). + * Mappings can't overlap in GPA space or userspace. + * To unmap, these fields must match an existing mapping. 
+ */ +struct mshv_user_mem_region { + __u64 size; + __u64 guest_pfn; + __u64 userspace_addr; + __u8 flags; + __u8 rsvd[7]; +}; + +enum { + MSHV_IRQFD_BIT_DEASSIGN, + MSHV_IRQFD_BIT_RESAMPLE, + MSHV_IRQFD_BIT_COUNT, +}; + +#define MSHV_IRQFD_FLAGS_MASK ((1 << MSHV_IRQFD_BIT_COUNT) - 1) + +struct mshv_user_irqfd { + __s32 fd; + __s32 resamplefd; + __u32 gsi; + __u32 flags; +}; + +enum { + MSHV_IOEVENTFD_BIT_DATAMATCH, + MSHV_IOEVENTFD_BIT_PIO, + MSHV_IOEVENTFD_BIT_DEASSIGN, + MSHV_IOEVENTFD_BIT_COUNT, +}; + +#define MSHV_IOEVENTFD_FLAGS_MASK ((1 << MSHV_IOEVENTFD_BIT_COUNT) - 1) + +struct mshv_user_ioeventfd { + __u64 datamatch; + __u64 addr; /* legal pio/mmio address */ + __u32 len; /* 1, 2, 4, or 8 bytes */ + __s32 fd; + __u32 flags; + __u8 rsvd[4]; +}; + +struct mshv_user_irq_entry { + __u32 gsi; + __u32 address_lo; + __u32 address_hi; + __u32 data; +}; + +struct mshv_user_irq_table { + __u32 nr; + __u32 rsvd; /* MBZ */ + struct mshv_user_irq_entry entries[]; +}; + +enum { + MSHV_GPAP_ACCESS_TYPE_ACCESSED, + MSHV_GPAP_ACCESS_TYPE_DIRTY, + MSHV_GPAP_ACCESS_TYPE_COUNT /* Count of enum members */ +}; + +enum { + MSHV_GPAP_ACCESS_OP_NOOP, + MSHV_GPAP_ACCESS_OP_CLEAR, + MSHV_GPAP_ACCESS_OP_SET, + MSHV_GPAP_ACCESS_OP_COUNT /* Count of enum members */ +}; + +/** + * struct mshv_gpap_access_bitmap - arguments for MSHV_GET_GPAP_ACCESS_BITMAP + * @access_type: MSHV_GPAP_ACCESS_TYPE_* - The type of access to record in the + * bitmap + * @access_op: MSHV_GPAP_ACCESS_OP_* - Allows an optional clear or set of all + * the access states in the range, after retrieving the current + * states. + * @rsvd: MBZ + * @page_count: Number of pages + * @gpap_base: Base gpa page number + * @bitmap_ptr: Output buffer for bitmap, at least (page_count + 7) / 8 bytes + * + * Retrieve a bitmap of either ACCESSED or DIRTY bits for a given range of guest + * memory, and optionally clear or set the bits. + */ +struct mshv_gpap_access_bitmap { + __u8 access_type; + __u8 access_op; + __u8 rsvd[6]; + __u64 page_count; + __u64 gpap_base; + __u64 bitmap_ptr; +}; + +/** + * struct mshv_root_hvcall - arguments for MSHV_ROOT_HVCALL + * @code: Hypercall code (HVCALL_*) + * @reps: in: Rep count ('repcount') + * out: Reps completed ('repcomp'). MBZ unless rep hvcall + * @in_sz: Size of input incl rep data. <= MSHV_HV_PAGE_SIZE + * @out_sz: Size of output buffer. <= MSHV_HV_PAGE_SIZE. MBZ if out_ptr is 0 + * @status: in: MBZ + * out: HV_STATUS_* from hypercall + * @rsvd: MBZ + * @in_ptr: Input data buffer (struct hv_input_*). If used with partition or + * vp fd, partition id field is populated by kernel. 
+ * @out_ptr: Output data buffer (optional) + */ +struct mshv_root_hvcall { + __u16 code; + __u16 reps; + __u16 in_sz; + __u16 out_sz; + __u16 status; + __u8 rsvd[6]; + __u64 in_ptr; + __u64 out_ptr; +}; + +/* Partition fds created with MSHV_CREATE_PARTITION */ +#define MSHV_INITIALIZE_PARTITION _IO(MSHV_IOCTL, 0x00) +#define MSHV_CREATE_VP _IOW(MSHV_IOCTL, 0x01, struct mshv_create_vp) +#define MSHV_SET_GUEST_MEMORY _IOW(MSHV_IOCTL, 0x02, struct mshv_user_mem_region) +#define MSHV_IRQFD _IOW(MSHV_IOCTL, 0x03, struct mshv_user_irqfd) +#define MSHV_IOEVENTFD _IOW(MSHV_IOCTL, 0x04, struct mshv_user_ioeventfd) +#define MSHV_SET_MSI_ROUTING _IOW(MSHV_IOCTL, 0x05, struct mshv_user_irq_table) +#define MSHV_GET_GPAP_ACCESS_BITMAP _IOWR(MSHV_IOCTL, 0x06, struct mshv_gpap_access_bitmap) +/* Generic hypercall */ +#define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall) + +/* + ******************************** + * VP APIs for child partitions * + ******************************** + */ + +#define MSHV_RUN_VP_BUF_SZ 256 + +/* + * VP state pages may be mapped to userspace via mmap(). + * To specify which state page, use MSHV_VP_MMAP_OFFSET_ values multiplied by + * the system page size. + * e.g. + * long page_size = sysconf(_SC_PAGE_SIZE); + * void *reg_page = mmap(NULL, MSHV_HV_PAGE_SIZE, PROT_READ|PROT_WRITE, + * MAP_SHARED, vp_fd, + * MSHV_VP_MMAP_OFFSET_REGISTERS * page_size); + */ +enum { + MSHV_VP_MMAP_OFFSET_REGISTERS, + MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE, + MSHV_VP_MMAP_OFFSET_GHCB, + MSHV_VP_MMAP_OFFSET_COUNT +}; + +/** + * struct mshv_run_vp - argument for MSHV_RUN_VP + * @msg_buf: On success, the intercept message is copied here. It can be + * interpreted using the relevant hypervisor definitions. + */ +struct mshv_run_vp { + __u8 msg_buf[MSHV_RUN_VP_BUF_SZ]; +}; + +enum { + MSHV_VP_STATE_LAPIC, /* Local interrupt controller state (either arch) */ + MSHV_VP_STATE_XSAVE, /* XSAVE data in compacted form (x86_64) */ + MSHV_VP_STATE_SIMP, + MSHV_VP_STATE_SIEFP, + MSHV_VP_STATE_SYNTHETIC_TIMERS, + MSHV_VP_STATE_COUNT, +}; + +/** + * struct mshv_get_set_vp_state - arguments for MSHV_[GET,SET]_VP_STATE + * @type: MSHV_VP_STATE_* + * @rsvd: MBZ + * @buf_sz: in: 4k page-aligned size of buffer + * out: Actual size of data (on EINVAL, check this to see if buffer + * was too small) + * @buf_ptr: 4k page-aligned data buffer + */ +struct mshv_get_set_vp_state { + __u8 type; + __u8 rsvd[3]; + __u32 buf_sz; + __u64 buf_ptr; +}; + +/* VP fds created with MSHV_CREATE_VP */ +#define MSHV_RUN_VP _IOR(MSHV_IOCTL, 0x00, struct mshv_run_vp) +#define MSHV_GET_VP_STATE _IOWR(MSHV_IOCTL, 0x01, struct mshv_get_set_vp_state) +#define MSHV_SET_VP_STATE _IOWR(MSHV_IOCTL, 0x02, struct mshv_get_set_vp_state) +/* + * Generic hypercall + * Defined above in partition IOCTLs, avoid redefining it here + * #define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall) + */ + +#endif From 999ad14259a0d45cb3b616e2e95a7c8b622a7ecd Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Tue, 18 Mar 2025 14:49:19 -0700 Subject: [PATCH 23/24] x86/hyperv: Add comments about hv_vpset and var size hypercall input args Current code varies in how the size of the variable size input header for hypercalls is calculated when the input contains struct hv_vpset. Surprisingly, this variation is correct, as different hypercalls make different choices for what portion of struct hv_vpset is treated as part of the variable size input header. 
The Hyper-V TLFS is silent on these details, but the behavior has been confirmed with Hyper-V developers. To avoid future confusion about these differences, add comments to struct hv_vpset, and to hypercall call sites with input that contains a struct hv_vpset. The comments describe the overall situation and the calculation that should be used at each particular call site. No functional change as only comments are updated. Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/20250318214919.958953-1-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20250318214919.958953-1-mhklinux@outlook.com> --- arch/x86/hyperv/hv_apic.c | 5 +++++ arch/x86/hyperv/mmu.c | 4 ++++ include/hyperv/hvgdk_mini.h | 9 ++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index f022d5f64fb6..6d91ac5f9836 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -145,6 +145,11 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, ipi_arg->vp_set.format = HV_GENERIC_SET_ALL; } + /* + * For this hypercall, Hyper-V treats the valid_bank_mask field + * of ipi_arg->vp_set as part of the fixed size input header. + * So the variable input header size is equal to nr_bank. + */ status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, ipi_arg, NULL); diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index cc8c3bd0e7c2..80d9350612d2 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -205,6 +205,10 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, /* * We can flush not more than max_gvas with one hypercall. Flush the * whole address space if we were asked to do more. + * + * For these hypercalls, Hyper-V treats the valid_bank_mask field + * of flush->hv_vp_set as part of the fixed size input header. + * So the variable input header size is equal to nr_bank. */ max_gvas = (PAGE_SIZE - sizeof(*flush) - nr_bank * diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 735329859f21..abf0bd76e370 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -205,7 +205,14 @@ union hv_reference_tsc_msr { /* The number of vCPUs in one sparse bank */ #define HV_VCPUS_PER_SPARSE_BANK (64) -/* Some of Hyper-V structs do not use hv_vpset where linux uses them */ +/* + * Some of Hyper-V structs do not use hv_vpset where linux uses them. + * + * struct hv_vpset is usually used as part of hypercall input. The portion + * that counts as "fixed size input header" vs. "variable size input header" + * varies per hypercall. See comments at relevant hypercall call sites as to + * how the "valid_bank_mask" field should be accounted. 
+ */ struct hv_vpset { /* HV_VP_SET */ u64 format; u64 valid_bank_mask; From 628cc040b3a2980df6032766e8ef0688e981ab95 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Fri, 21 Mar 2025 22:40:14 +0000 Subject: [PATCH 24/24] x86/hyperv: fix an indentation issue in mshyperv.h Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202503220640.hjiacW2C-lkp@intel.com/ Signed-off-by: Wei Liu --- arch/x86/include/asm/mshyperv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 7dfca93ef048..07aadf0e839f 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -154,7 +154,7 @@ static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1) : "cc", "edi", "esi"); } #endif - return hv_status; + return hv_status; } static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
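To make the hv_vpset accounting described above concrete, here is a minimal sketch, not taken from any patch in this series, modeled on the HVCALL_SEND_IPI_EX path where valid_bank_mask counts as fixed-size header, so the variable-size header is exactly the nr_bank sparse bank entries. Helper names such as hyperv_pcpu_input_arg and cpumask_to_vpset mirror the existing Hyper-V support code; the setup and irq handling done by the real callers are omitted.

/*
 * Sketch of an IPI-style sender, modeled on __send_ipi_mask_ex(); assumes
 * the usual Hyper-V percpu input page and that the caller has disabled
 * interrupts. Illustrative only, not the upstream implementation.
 */
static bool send_ipi_ex_sketch(const struct cpumask *mask, int vector)
{
	struct hv_send_ipi_ex *ipi_arg;
	u64 status;
	int nr_bank;

	ipi_arg = *this_cpu_ptr(hyperv_pcpu_input_arg);
	memset(ipi_arg, 0, sizeof(*ipi_arg));
	ipi_arg->vector = vector;
	ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K;

	/*
	 * cpumask_to_vpset() fills valid_bank_mask and the sparse bank
	 * array, returning the number of banks (negative on failure).
	 */
	nr_bank = cpumask_to_vpset(&ipi_arg->vp_set, mask);
	if (nr_bank < 0)
		return false;

	/*
	 * For this hypercall valid_bank_mask is part of the fixed-size
	 * header, so the variable-size header is just the nr_bank
	 * 8-byte bank entries.
	 */
	status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
				     ipi_arg, NULL);
	return hv_result_success(status);
}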
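For the /dev/mshv interface introduced earlier in the series, the following hypothetical userspace sketch follows the call order documented in include/uapi/linux/mshv.h: create a partition, initialize it, map guest memory, create a VP, and run it until the first intercept. It uses only the ioctls and structures defined in that header; the flag choices, memory size, and absence of error handling are illustrative and not requirements of the driver.

/* Hypothetical VMM skeleton built on the uAPI above; error checking and
 * real guest setup are omitted for brevity.
 */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/mshv.h>

int main(void)
{
	int mshv_fd = open("/dev/mshv", O_RDWR | O_CLOEXEC);

	struct mshv_create_partition pt_args = {
		.pt_flags = 0,		/* 1 << MSHV_PT_BIT_* as needed */
		.pt_isolation = MSHV_PT_ISOLATION_NONE,
	};
	int pt_fd = ioctl(mshv_fd, MSHV_CREATE_PARTITION, &pt_args);

	/* Any "early" partition properties would be set here, then: */
	ioctl(pt_fd, MSHV_INITIALIZE_PARTITION);

	/* Back 2 MiB of guest physical memory, starting at GPA 0 */
	size_t len = 0x200000;
	void *mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	struct mshv_user_mem_region region = {
		.size = len,
		.guest_pfn = 0,
		.userspace_addr = (uintptr_t)mem,
		.flags = (1 << MSHV_SET_MEM_BIT_WRITABLE) |
			 (1 << MSHV_SET_MEM_BIT_EXECUTABLE),
	};
	ioctl(pt_fd, MSHV_SET_GUEST_MEMORY, &region);

	struct mshv_create_vp vp_args = { .vp_index = 0 };
	int vp_fd = ioctl(pt_fd, MSHV_CREATE_VP, &vp_args);

	/* Blocks until the VP hits an intercept; the intercept message is
	 * then available in run.msg_buf for the VMM to decode.
	 */
	struct mshv_run_vp run;
	memset(&run, 0, sizeof(run));
	ioctl(vp_fd, MSHV_RUN_VP, &run);

	return 0;
}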