Merge tag 'kvm-x86-irqs-6.17' of https://github.com/kvm-x86/linux into HEAD

KVM IRQ changes for 6.17

 - Rework irqbypass to track/match producers and consumers via an xarray
   instead of a linked list.  Using a linked list leads to O(n^2) insertion
   times, which is hugely problematic for use cases that create large numbers
   of VMs.  Such use cases typically don't actually use irqbypass, but
   eliminating the pointless registration is a future problem to solve as it
   likely requires new uAPI.
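   (A rough sketch of the xarray approach follows this list.)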

 - Track irqbypass's "token" as "struct eventfd_ctx *" instead of a "void *",
   to avoid making a simple concept unnecessarily difficult to understand.

 - Add CONFIG_KVM_IOAPIC for x86 to allow disabling support for I/O APIC, PIC,
   and PIT emulation at compile time.

 - Drop x86's irq_comm.c, and move a pile of IRQ related code into irq.c.

 - Fix a variety of flaws and bugs in the AVIC device posted IRQ code.

 - Inhibit AVIC if a vCPU's ID is too big (relative to what hardware
   supports) instead of rejecting vCPU creation.

 - Extend enable_ipiv module param support to SVM, by simply leaving IsRunning
   clear in the vCPU's physical ID table entry.

 - Disable IPI virtualization, via enable_ipiv, if the CPU is affected by
   erratum #1235, to allow (safely) enabling AVIC on such CPUs.

 - Dedup x86's device posted IRQ code, as the vast majority of functionality
   can be shared verbatim between SVM and VMX.

 - Harden the device posted IRQ code against bugs and runtime errors.

 - Use vcpu_idx, not vcpu_id, for GA log tag/metadata, to make lookups O(1)
   instead of O(n).

 - Generate GA Log interrupts if and only if the target vCPU is blocking, i.e.
   only if KVM needs a notification in order to wake the vCPU.

 - Decouple device posted IRQs from VFIO device assignment, as binding a VM to
   a VFIO group is not a requirement for enabling device posted IRQs.

 - Clean up and document/comment the irqfd assignment code.

 - Disallow binding multiple irqfds to an eventfd with a priority waiter, i.e.
   ensure an eventfd is bound to at most one irqfd throughout the entire host,
   and add a selftest to verify eventfd:irqfd bindings are globally unique.
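
As a rough illustration of the xarray-based tracking described in the first bullet (this is not the actual virt/lib/irqbypass.c code; the struct layouts and the connect() helper are placeholders), registration becomes a keyed insert plus a direct lookup instead of an O(n) list walk:

#include <linux/xarray.h>
#include <linux/mutex.h>
#include <linux/eventfd.h>

/* Illustrative only; real producers/consumers carry more state. */
struct bypass_producer { struct eventfd_ctx *token; };
struct bypass_consumer { struct eventfd_ctx *token; };

/* Hypothetical pairing helper, stands in for the real connect path. */
static void connect(struct bypass_producer *prod, struct bypass_consumer *cons);

static DEFINE_XARRAY(producers);    /* keyed by the eventfd_ctx pointer */
static DEFINE_XARRAY(consumers);
static DEFINE_MUTEX(bypass_lock);

static int register_producer(struct bypass_producer *prod)
{
    unsigned long key = (unsigned long)prod->token;
    struct bypass_consumer *cons;
    int r;

    mutex_lock(&bypass_lock);

    /* Effectively O(1); returns -EBUSY if the token is already registered. */
    r = xa_insert(&producers, key, prod, GFP_KERNEL);
    if (!r) {
        /* The matching consumer, if any, is found by direct lookup, not a scan. */
        cons = xa_load(&consumers, key);
        if (cons)
            connect(prod, cons);
    }

    mutex_unlock(&bypass_lock);
    return r;
}
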
pull/1309/head
Paolo Bonzini 2025-07-28 11:03:04 -04:00
commit f02b1bcc73
56 changed files with 1847 additions and 1765 deletions


@ -2765,19 +2765,15 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq);
}
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
struct kvm_kernel_irq_routing_entry *new)
void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
struct kvm_kernel_irq_routing_entry *old,
struct kvm_kernel_irq_routing_entry *new)
{
if (old->type != KVM_IRQ_ROUTING_MSI ||
new->type != KVM_IRQ_ROUTING_MSI)
return true;
if (old->type == KVM_IRQ_ROUTING_MSI &&
new->type == KVM_IRQ_ROUTING_MSI &&
!memcmp(&old->msi, &new->msi, sizeof(new->msi)))
return;
return memcmp(&old->msi, &new->msi, sizeof(new->msi));
}
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
/*
* Remapping the vLPI requires taking the its_lock mutex to resolve
* the new translation. We're in spinlock land at this point, so no
@ -2785,7 +2781,7 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
*
* Unmap the vLPI and fall back to software LPI injection.
*/
return kvm_vgic_v4_unset_forwarding(kvm, host_irq);
return kvm_vgic_v4_unset_forwarding(irqfd->kvm, irqfd->producer->irq);
}
void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)


@ -758,7 +758,7 @@ static void its_free_ite(struct kvm *kvm, struct its_ite *ite)
if (irq) {
scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
if (irq->hw)
WARN_ON(its_unmap_vlpi(ite->irq->host_irq));
its_unmap_vlpi(ite->irq->host_irq);
irq->hw = false;
}


@ -527,28 +527,26 @@ static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq)
return NULL;
}
int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
{
struct vgic_irq *irq;
unsigned long flags;
int ret = 0;
if (!vgic_supports_direct_msis(kvm))
return 0;
return;
irq = __vgic_host_irq_get_vlpi(kvm, host_irq);
if (!irq)
return 0;
return;
raw_spin_lock_irqsave(&irq->irq_lock, flags);
WARN_ON(irq->hw && irq->host_irq != host_irq);
if (irq->hw) {
atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
irq->hw = false;
ret = its_unmap_vlpi(host_irq);
its_unmap_vlpi(host_irq);
}
raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(kvm, irq);
return ret;
}


@ -26,7 +26,22 @@ enum {
IRQ_REMAP_X2APIC_MODE,
};
struct vcpu_data {
/*
* This is mainly used to communicate information back-and-forth
* between SVM and IOMMU for setting up and tearing down posted
* interrupt
*/
struct amd_iommu_pi_data {
u64 vapic_addr; /* Physical address of the vCPU's vAPIC. */
u32 ga_tag;
u32 vector; /* Guest vector of the interrupt */
int cpu;
bool ga_log_intr;
bool is_guest_mode;
void *ir_data;
};
struct intel_iommu_pi_data {
u64 pi_desc_addr; /* Physical address of PI Descriptor */
u32 vector; /* Guest vector of the interrupt */
};
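
As a rough sketch of how the deduplicated x86 device posted IRQ path hands these vendor-specific payloads to the IOMMU driver (the field names come from the structs above; the function name and the vcpu_to_pi_desc() helper are illustrative assumptions, not necessarily the code in this series):

/* Sketch only: build the Intel payload and pass it to the IRTE code. */
static int example_vmx_pi_update_irte(unsigned int host_irq,
                                      struct kvm_vcpu *vcpu, u32 vector)
{
    struct intel_iommu_pi_data pi_data = {
        .pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)),  /* assumed helper */
        .vector       = vector,
    };

    /*
     * irq_set_vcpu_affinity() forwards the opaque payload to the IRQ
     * chip's ->irq_set_vcpu_affinity() callback, i.e. to the IOMMU
     * driver that owns the IRTE for host_irq.
     */
    return irq_set_vcpu_affinity(host_irq, &pi_data);
}
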


@ -112,7 +112,7 @@ KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
KVM_X86_OP_OPTIONAL(vcpu_blocking)
KVM_X86_OP_OPTIONAL(vcpu_unblocking)
KVM_X86_OP_OPTIONAL(pi_update_irte)
KVM_X86_OP_OPTIONAL(pi_start_assignment)
KVM_X86_OP_OPTIONAL(pi_start_bypass)
KVM_X86_OP_OPTIONAL(apicv_pre_state_restore)
KVM_X86_OP_OPTIONAL(apicv_post_state_restore)
KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)


@ -297,6 +297,7 @@ enum x86_intercept_stage;
*/
#define KVM_APIC_PV_EOI_PENDING 1
struct kvm_kernel_irqfd;
struct kvm_kernel_irq_routing_entry;
/*
@ -1320,6 +1321,12 @@ enum kvm_apicv_inhibit {
*/
APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
/*
* AVIC is disabled because the vCPU's APIC ID is beyond the max
* supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable.
*/
APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG,
NR_APICV_INHIBIT_REASONS,
};
@ -1338,7 +1345,8 @@ enum kvm_apicv_inhibit {
__APICV_INHIBIT_REASON(IRQWIN), \
__APICV_INHIBIT_REASON(PIT_REINJ), \
__APICV_INHIBIT_REASON(SEV), \
__APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED)
__APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \
__APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG)
struct kvm_arch {
unsigned long n_used_mmu_pages;
@ -1381,9 +1389,13 @@ struct kvm_arch {
atomic_t noncoherent_dma_count;
#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
atomic_t assigned_device_count;
unsigned long nr_possible_bypass_irqs;
#ifdef CONFIG_KVM_IOAPIC
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
#endif
atomic_t vapics_in_nmi_mode;
struct mutex apic_map_lock;
struct kvm_apic_map __rcu *apic_map;
@ -1403,7 +1415,6 @@ struct kvm_arch {
bool pause_in_guest;
bool cstate_in_guest;
unsigned long irq_sources_bitmap;
s64 kvmclock_offset;
/*
@ -1432,9 +1443,6 @@ struct kvm_arch {
struct delayed_work kvmclock_update_work;
struct delayed_work kvmclock_sync_work;
/* reads protected by irq_srcu, writes by irq_lock */
struct hlist_head mask_notifier_list;
#ifdef CONFIG_KVM_HYPERV
struct kvm_hv hyperv;
#endif
@ -1853,9 +1861,10 @@ struct kvm_x86_ops {
void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void (*pi_start_assignment)(struct kvm *kvm);
int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
unsigned int host_irq, uint32_t guest_irq,
struct kvm_vcpu *vcpu, u32 vector);
void (*pi_start_bypass)(struct kvm *kvm);
void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu);
void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
@ -1950,6 +1959,7 @@ struct kvm_arch_async_pf {
extern u32 __read_mostly kvm_nr_uret_msrs;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern bool __read_mostly enable_apicv;
extern bool __read_mostly enable_ipiv;
extern bool __read_mostly enable_device_posted_irqs;
extern struct kvm_x86_ops kvm_x86_ops;
@ -2044,19 +2054,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
struct kvm_irq_mask_notifier {
void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
int irq;
struct hlist_node link;
};
void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
struct kvm_irq_mask_notifier *kimn);
void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
struct kvm_irq_mask_notifier *kimn);
void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
bool mask);
extern bool tdp_enabled;
u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
@ -2215,9 +2212,6 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
return !!(*irq_state);
}
int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
@ -2394,9 +2388,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq);
static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
{
/* We can only post Fixed and LowPrio IRQs */


@ -252,16 +252,21 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
/*
* GA_LOG_INTR is a synthetic flag that's never propagated to hardware-visible
* tables. GA_LOG_INTR is set if the vCPU needs device posted IRQs to generate
* GA log interrupts to wake the vCPU (because it's blocking or about to block).
*/
#define AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR BIT_ULL(61)
#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK GENMASK_ULL(51, 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
#define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFFULL)
#define AVIC_DOORBELL_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
@ -290,8 +295,6 @@ enum avic_ipi_failure_cause {
static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID);
static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID);
#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0)
#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3)
#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
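
A hedged illustration of how the physical ID entry bits above interact with the enable_ipiv and GA log changes from the commit message (composition is simplified; this is not the avic.c implementation):

/* Sketch: compose a physical ID table entry from the masks defined above. */
static u64 example_avic_physical_id_entry(u64 backing_page_pa, int host_cpu,
                                          bool running, bool blocking)
{
    u64 entry = AVIC_PHYSICAL_ID_ENTRY_VALID_MASK |
                (backing_page_pa & AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
                (host_cpu & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);

    /*
     * With enable_ipiv=0 on SVM, IsRunning is simply never set, so IPIs
     * aimed at this vCPU fall back to the unaccelerated (VM-Exit) path
     * even though AVIC itself stays enabled.
     */
    if (enable_ipiv && running)
        entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

    /*
     * GA_LOG_INTR is KVM-internal bookkeeping: set only while the vCPU is
     * blocking and needs a GA log interrupt to be woken, and never written
     * to the hardware-visible table.
     */
    if (blocking)
        entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;

    return entry;
}
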


@ -166,6 +166,16 @@ config KVM_AMD_SEV
Encrypted State (SEV-ES), and Secure Encrypted Virtualization with
Secure Nested Paging (SEV-SNP) technologies on AMD processors.
config KVM_IOAPIC
bool "I/O APIC, PIC, and PIT emulation"
default y
depends on KVM
help
Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e.
for full in-kernel APIC emulation.
If unsure, say Y.
config KVM_SMM
bool "System Management Mode emulation"
default y


@ -5,12 +5,11 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror
include $(srctree)/virt/kvm/Makefile.kvm
kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
debugfs.o mmu/mmu.o mmu/page_track.o \
mmu/spte.o
kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \
debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o
kvm-$(CONFIG_KVM_HYPERV) += hyperv.o
kvm-$(CONFIG_KVM_XEN) += xen.o
kvm-$(CONFIG_KVM_SMM) += smm.o


@ -497,15 +497,19 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
return ret;
}
int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
int irq_source_id, int level, bool line_status)
{
struct kvm_vcpu_hv_synic *synic;
synic = synic_get(kvm, vpidx);
if (!level)
return -1;
synic = synic_get(kvm, e->hv_sint.vcpu);
if (!synic)
return -EINVAL;
return synic_set_irq(synic, sint);
return synic_set_irq(synic, e->hv_sint.sint);
}
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)


@ -103,7 +103,8 @@ static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
void kvm_hv_irq_routing_update(struct kvm *kvm);
int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
int irq_source_id, int level, bool line_status);
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);


@ -248,8 +248,8 @@ static void pit_do_work(struct kthread_work *work)
if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
return;
kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false);
kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false);
kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 1, false);
kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 0, false);
/*
* Provides NMI watchdog support via Virtual Wire mode.
@ -288,7 +288,7 @@ static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
atomic_set(&pit->pit_state.irq_ack, 1);
}
void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
static void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
{
struct kvm_kpit_state *ps = &pit->pit_state;
struct kvm *kvm = pit->kvm;
@ -400,8 +400,8 @@ static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
}
}
void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
int hpet_legacy_start)
static void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
int hpet_legacy_start)
{
u8 saved_mode;
@ -649,6 +649,79 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
kvm_pit_reset_reinject(pit);
}
int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
mutex_lock(&kps->lock);
memcpy(ps, &kps->channels, sizeof(*ps));
mutex_unlock(&kps->lock);
return 0;
}
int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
int i;
struct kvm_pit *pit = kvm->arch.vpit;
mutex_lock(&pit->pit_state.lock);
memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
for (i = 0; i < 3; i++)
kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
mutex_unlock(&pit->pit_state.lock);
return 0;
}
int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
sizeof(ps->channels));
ps->flags = kvm->arch.vpit->pit_state.flags;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
memset(&ps->reserved, 0, sizeof(ps->reserved));
return 0;
}
int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
int start = 0;
int i;
u32 prev_legacy, cur_legacy;
struct kvm_pit *pit = kvm->arch.vpit;
mutex_lock(&pit->pit_state.lock);
prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
if (!prev_legacy && cur_legacy)
start = 1;
memcpy(&pit->pit_state.channels, &ps->channels,
sizeof(pit->pit_state.channels));
pit->pit_state.flags = ps->flags;
for (i = 0; i < 3; i++)
kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
start && i == 0);
mutex_unlock(&pit->pit_state.lock);
return 0;
}
int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control)
{
struct kvm_pit *pit = kvm->arch.vpit;
/* pit->pit_state.lock was overloaded to prevent userspace from getting
* an inconsistent state after running multiple KVM_REINJECT_CONTROL
* ioctls in parallel. Use a separate lock if that ioctl isn't rare.
*/
mutex_lock(&pit->pit_state.lock);
kvm_pit_set_reinject(pit, control->pit_reinject);
mutex_unlock(&pit->pit_state.lock);
return 0;
}
static const struct kvm_io_device_ops pit_dev_ops = {
.read = pit_ioport_read,
.write = pit_ioport_write,
@ -671,10 +744,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
if (!pit)
return NULL;
pit->irq_source_id = kvm_request_irq_source_id(kvm);
if (pit->irq_source_id < 0)
goto fail_request;
mutex_init(&pit->pit_state.lock);
pid = get_pid(task_tgid(current));
@ -726,8 +795,6 @@ fail_register_pit:
kvm_pit_set_reinject(pit, false);
kthread_destroy_worker(pit->worker);
fail_kthread:
kvm_free_irq_source_id(kvm, pit->irq_source_id);
fail_request:
kfree(pit);
return NULL;
}
@ -744,7 +811,6 @@ void kvm_free_pit(struct kvm *kvm)
kvm_pit_set_reinject(pit, false);
hrtimer_cancel(&pit->pit_state.timer);
kthread_destroy_worker(pit->worker);
kvm_free_irq_source_id(kvm, pit->irq_source_id);
kfree(pit);
}
}


@ -6,6 +6,11 @@
#include <kvm/iodev.h>
#include <uapi/asm/kvm.h>
#include "ioapic.h"
#ifdef CONFIG_KVM_IOAPIC
struct kvm_kpit_channel_state {
u32 count; /* can be 65536 */
u16 latched_count;
@ -42,7 +47,6 @@ struct kvm_pit {
struct kvm_io_device speaker_dev;
struct kvm *kvm;
struct kvm_kpit_state pit_state;
int irq_source_id;
struct kvm_irq_mask_notifier mask_notifier;
struct kthread_worker *worker;
struct kthread_work expired;
@ -55,11 +59,14 @@ struct kvm_pit {
#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
#define KVM_PIT_CHANNEL_MASK 0x3
int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps);
int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps);
int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control);
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
void kvm_free_pit(struct kvm *kvm);
void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
int hpet_legacy_start);
void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject);
#endif /* CONFIG_KVM_IOAPIC */
#endif


@ -31,6 +31,8 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include "ioapic.h"
#include "irq.h"
#include <linux/kvm_host.h>
@ -185,8 +187,11 @@ void kvm_pic_update_irq(struct kvm_pic *s)
pic_unlock(s);
}
int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
int irq_source_id, int level, bool line_status)
{
struct kvm_pic *s = kvm->arch.vpic;
int irq = e->irqchip.pin;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
@ -203,16 +208,6 @@ int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
return ret;
}
void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
{
int i;
pic_lock(s);
for (i = 0; i < PIC_NUM_PINS; i++)
__clear_bit(irq_source_id, &s->irq_states[i]);
pic_unlock(s);
}
/*
* acknowledge interrupt 'irq'
*/


@ -41,11 +41,11 @@
#include <asm/processor.h>
#include <asm/page.h>
#include <asm/current.h>
#include <trace/events/kvm.h>
#include "ioapic.h"
#include "lapic.h"
#include "irq.h"
#include "trace.h"
static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
bool line_status);
@ -310,6 +310,42 @@ void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
kvm_make_scan_ioapic_request(kvm);
}
void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
struct kvm_irq_mask_notifier *kimn)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
mutex_lock(&kvm->irq_lock);
kimn->irq = irq;
hlist_add_head_rcu(&kimn->link, &ioapic->mask_notifier_list);
mutex_unlock(&kvm->irq_lock);
}
void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
struct kvm_irq_mask_notifier *kimn)
{
mutex_lock(&kvm->irq_lock);
hlist_del_rcu(&kimn->link);
mutex_unlock(&kvm->irq_lock);
synchronize_srcu(&kvm->irq_srcu);
}
void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
bool mask)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
struct kvm_irq_mask_notifier *kimn;
int idx, gsi;
idx = srcu_read_lock(&kvm->irq_srcu);
gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
if (gsi != -1)
hlist_for_each_entry_rcu(kimn, &ioapic->mask_notifier_list, link)
if (kimn->irq == gsi)
kimn->func(kimn, mask);
srcu_read_unlock(&kvm->irq_srcu, idx);
}
static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
{
unsigned index;
@ -479,9 +515,11 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
return ret;
}
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
int level, bool line_status)
int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
int irq_source_id, int level, bool line_status)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
int irq = e->irqchip.pin;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
@ -496,16 +534,6 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
return ret;
}
void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
{
int i;
spin_lock(&ioapic->lock);
for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
__clear_bit(irq_source_id, &ioapic->irq_states[i]);
spin_unlock(&ioapic->lock);
}
static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
{
int i;
@ -718,6 +746,7 @@ int kvm_ioapic_init(struct kvm *kvm)
return -ENOMEM;
spin_lock_init(&ioapic->lock);
INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
INIT_HLIST_HEAD(&ioapic->mask_notifier_list);
kvm->arch.vioapic = ioapic;
kvm_ioapic_reset(ioapic);
kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);


@ -86,8 +86,24 @@ struct kvm_ioapic {
struct delayed_work eoi_inject;
u32 irq_eoi[IOAPIC_NUM_PINS];
u32 irr_delivered;
/* reads protected by irq_srcu, writes by irq_lock */
struct hlist_head mask_notifier_list;
};
struct kvm_irq_mask_notifier {
void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
int irq;
struct hlist_node link;
};
void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
struct kvm_irq_mask_notifier *kimn);
void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
struct kvm_irq_mask_notifier *kimn);
void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
bool mask);
#ifdef DEBUG
#define ASSERT(x) \
do { \
@ -103,7 +119,7 @@ do { \
static inline int ioapic_in_kernel(struct kvm *kvm)
{
return irqchip_kernel(kvm);
return irqchip_full(kvm);
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@ -111,9 +127,9 @@ void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
int trigger_mode);
int kvm_ioapic_init(struct kvm *kvm);
void kvm_ioapic_destroy(struct kvm *kvm);
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
int level, bool line_status);
void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
int irq_source_id, int level, bool line_status);
void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,


@ -11,9 +11,12 @@
#include <linux/export.h>
#include <linux/kvm_host.h>
#include <linux/kvm_irqfd.h>
#include "hyperv.h"
#include "ioapic.h"
#include "irq.h"
#include "i8254.h"
#include "trace.h"
#include "x86.h"
#include "xen.h"
@ -41,6 +44,14 @@ static int pending_userspace_extint(struct kvm_vcpu *v)
return v->arch.pending_external_vector != -1;
}
static int get_userspace_extint(struct kvm_vcpu *vcpu)
{
int vector = vcpu->arch.pending_external_vector;
vcpu->arch.pending_external_vector = -1;
return vector;
}
/*
* check if there is pending interrupt from
* non-APIC source without intack.
@ -67,10 +78,13 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v)
if (!kvm_apic_accept_pic_intr(v))
return 0;
if (irqchip_split(v->kvm))
return pending_userspace_extint(v);
else
#ifdef CONFIG_KVM_IOAPIC
if (pic_in_kernel(v->kvm))
return v->kvm->arch.vpic->output;
#endif
WARN_ON_ONCE(!irqchip_split(v->kvm));
return pending_userspace_extint(v);
}
/*
@ -126,13 +140,13 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v)
return v->kvm->arch.xen.upcall_vector;
#endif
if (irqchip_split(v->kvm)) {
int vector = v->arch.pending_external_vector;
v->arch.pending_external_vector = -1;
return vector;
} else
#ifdef CONFIG_KVM_IOAPIC
if (pic_in_kernel(v->kvm))
return kvm_pic_read_irq(v->kvm); /* PIC */
#endif
WARN_ON_ONCE(!irqchip_split(v->kvm));
return get_userspace_extint(v);
}
EXPORT_SYMBOL_GPL(kvm_cpu_get_extint);
@ -163,7 +177,9 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
__kvm_migrate_apic_timer(vcpu);
#ifdef CONFIG_KVM_IOAPIC
__kvm_migrate_pit_timer(vcpu);
#endif
kvm_x86_call(migrate_timers)(vcpu);
}
@ -171,10 +187,539 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
{
bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE;
return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm);
return resample ? irqchip_full(kvm) : irqchip_in_kernel(kvm);
}
bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
{
return irqchip_in_kernel(kvm);
}
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, struct dest_map *dest_map)
{
int r = -1;
struct kvm_vcpu *vcpu, *lowest = NULL;
unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
unsigned int dest_vcpus = 0;
if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
return r;
if (irq->dest_mode == APIC_DEST_PHYSICAL &&
irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
pr_info("apic: phys broadcast and lowest prio\n");
irq->delivery_mode = APIC_DM_FIXED;
}
memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!kvm_apic_present(vcpu))
continue;
if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
irq->dest_id, irq->dest_mode))
continue;
if (!kvm_lowest_prio_delivery(irq)) {
if (r < 0)
r = 0;
r += kvm_apic_set_irq(vcpu, irq, dest_map);
} else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
if (!kvm_vector_hashing_enabled()) {
if (!lowest)
lowest = vcpu;
else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
lowest = vcpu;
} else {
__set_bit(i, dest_vcpu_bitmap);
dest_vcpus++;
}
}
}
if (dest_vcpus != 0) {
int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
dest_vcpu_bitmap, KVM_MAX_VCPUS);
lowest = kvm_get_vcpu(kvm, idx);
}
if (lowest)
r = kvm_apic_set_irq(lowest, irq, dest_map);
return r;
}
static void kvm_msi_to_lapic_irq(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq)
{
struct msi_msg msg = { .address_lo = e->msi.address_lo,
.address_hi = e->msi.address_hi,
.data = e->msi.data };
trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
(u64)msg.address_hi << 32 : 0), msg.data);
irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
irq->vector = msg.arch_data.vector;
irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
irq->trig_mode = msg.arch_data.is_level;
irq->delivery_mode = msg.arch_data.delivery_mode << 8;
irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
irq->level = 1;
irq->shorthand = APIC_DEST_NOSHORT;
}
static inline bool kvm_msi_route_invalid(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e)
{
return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
}
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level, bool line_status)
{
struct kvm_lapic_irq irq;
if (kvm_msi_route_invalid(kvm, e))
return -EINVAL;
if (!level)
return -1;
kvm_msi_to_lapic_irq(kvm, e, &irq);
return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
}
int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
bool line_status)
{
struct kvm_lapic_irq irq;
int r;
switch (e->type) {
#ifdef CONFIG_KVM_HYPERV
case KVM_IRQ_ROUTING_HV_SINT:
return kvm_hv_synic_set_irq(e, kvm, irq_source_id, level,
line_status);
#endif
case KVM_IRQ_ROUTING_MSI:
if (kvm_msi_route_invalid(kvm, e))
return -EINVAL;
kvm_msi_to_lapic_irq(kvm, e, &irq);
if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
return r;
break;
#ifdef CONFIG_KVM_XEN
case KVM_IRQ_ROUTING_XEN_EVTCHN:
if (!level)
return -1;
return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
#endif
default:
break;
}
return -EWOULDBLOCK;
}
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
bool line_status)
{
if (!irqchip_in_kernel(kvm))
return -ENXIO;
irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event->irq, irq_event->level,
line_status);
return 0;
}
bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
{
return irqchip_in_kernel(kvm);
}
int kvm_set_routing_entry(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
/* We can't check irqchip_in_kernel() here as some callers are
* currently initializing the irqchip. Other callers should therefore
* check kvm_arch_can_set_irq_routing() before calling this function.
*/
switch (ue->type) {
#ifdef CONFIG_KVM_IOAPIC
case KVM_IRQ_ROUTING_IRQCHIP:
if (irqchip_split(kvm))
return -EINVAL;
e->irqchip.pin = ue->u.irqchip.pin;
switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_SLAVE:
e->irqchip.pin += PIC_NUM_PINS / 2;
fallthrough;
case KVM_IRQCHIP_PIC_MASTER:
if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
return -EINVAL;
e->set = kvm_pic_set_irq;
break;
case KVM_IRQCHIP_IOAPIC:
if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
return -EINVAL;
e->set = kvm_ioapic_set_irq;
break;
default:
return -EINVAL;
}
e->irqchip.irqchip = ue->u.irqchip.irqchip;
break;
#endif
case KVM_IRQ_ROUTING_MSI:
e->set = kvm_set_msi;
e->msi.address_lo = ue->u.msi.address_lo;
e->msi.address_hi = ue->u.msi.address_hi;
e->msi.data = ue->u.msi.data;
if (kvm_msi_route_invalid(kvm, e))
return -EINVAL;
break;
#ifdef CONFIG_KVM_HYPERV
case KVM_IRQ_ROUTING_HV_SINT:
e->set = kvm_hv_synic_set_irq;
e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
e->hv_sint.sint = ue->u.hv_sint.sint;
break;
#endif
#ifdef CONFIG_KVM_XEN
case KVM_IRQ_ROUTING_XEN_EVTCHN:
return kvm_xen_setup_evtchn(kvm, e, ue);
#endif
default:
return -EINVAL;
}
return 0;
}
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu)
{
int r = 0;
unsigned long i;
struct kvm_vcpu *vcpu;
if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
return true;
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!kvm_apic_present(vcpu))
continue;
if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
irq->dest_id, irq->dest_mode))
continue;
if (++r == 2)
return false;
*dest_vcpu = vcpu;
}
return r == 1;
}
EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
u8 vector, unsigned long *ioapic_handled_vectors)
{
/*
* Intercept EOI if the vCPU is the target of the new IRQ routing, or
* the vCPU has a pending IRQ from the old routing, i.e. if the vCPU
* may receive a level-triggered IRQ in the future, or already received
* level-triggered IRQ. The EOI needs to be intercepted and forwarded
* to I/O APIC emulation so that the IRQ can be de-asserted.
*/
if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) {
__set_bit(vector, ioapic_handled_vectors);
} else if (kvm_apic_pending_eoi(vcpu, vector)) {
__set_bit(vector, ioapic_handled_vectors);
/*
* Track the highest pending EOI for which the vCPU is NOT the
* target in the new routing. Only the EOI for the IRQ that is
* in-flight (for the old routing) needs to be intercepted, any
* future IRQs that arrive on this vCPU will be coincidental to
* the level-triggered routing and don't need to be intercepted.
*/
if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi)
vcpu->arch.highest_stale_pending_ioapic_eoi = vector;
}
}
void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
ulong *ioapic_handled_vectors)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_kernel_irq_routing_entry *entry;
struct kvm_irq_routing_table *table;
u32 i, nr_ioapic_pins;
int idx;
idx = srcu_read_lock(&kvm->irq_srcu);
table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
kvm->arch.nr_reserved_ioapic_pins);
for (i = 0; i < nr_ioapic_pins; ++i) {
hlist_for_each_entry(entry, &table->map[i], link) {
struct kvm_lapic_irq irq;
if (entry->type != KVM_IRQ_ROUTING_MSI)
continue;
kvm_msi_to_lapic_irq(vcpu->kvm, entry, &irq);
if (!irq.trig_mode)
continue;
kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode,
irq.vector, ioapic_handled_vectors);
}
}
srcu_read_unlock(&kvm->irq_srcu, idx);
}
void kvm_arch_irq_routing_update(struct kvm *kvm)
{
#ifdef CONFIG_KVM_HYPERV
kvm_hv_irq_routing_update(kvm);
#endif
if (irqchip_split(kvm))
kvm_make_scan_ioapic_request(kvm);
}
static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd,
struct kvm_kernel_irq_routing_entry *entry)
{
unsigned int host_irq = irqfd->producer->irq;
struct kvm *kvm = irqfd->kvm;
struct kvm_vcpu *vcpu = NULL;
struct kvm_lapic_irq irq;
int r;
if (WARN_ON_ONCE(!irqchip_in_kernel(kvm) || !kvm_arch_has_irq_bypass()))
return -EINVAL;
if (entry && entry->type == KVM_IRQ_ROUTING_MSI) {
kvm_msi_to_lapic_irq(kvm, entry, &irq);
/*
* Force remapped mode if hardware doesn't support posting the
* virtual interrupt to a vCPU. Only IRQs are postable (NMIs,
* SMIs, etc. are not), and neither AMD nor Intel IOMMUs support
* posting multicast/broadcast IRQs. If the interrupt can't be
* posted, the device MSI needs to be routed to the host so that
* the guest's desired interrupt can be synthesized by KVM.
*
* This means that KVM can only post lowest-priority interrupts
* if they have a single CPU as the destination, e.g. only if
* the guest has affined the interrupt to a single vCPU.
*/
if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
!kvm_irq_is_postable(&irq))
vcpu = NULL;
}
if (!irqfd->irq_bypass_vcpu && !vcpu)
return 0;
r = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, host_irq, irqfd->gsi,
vcpu, irq.vector);
if (r) {
WARN_ON_ONCE(irqfd->irq_bypass_vcpu && !vcpu);
irqfd->irq_bypass_vcpu = NULL;
return r;
}
irqfd->irq_bypass_vcpu = vcpu;
trace_kvm_pi_irte_update(host_irq, vcpu, irqfd->gsi, irq.vector, !!vcpu);
return 0;
}
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
struct irq_bypass_producer *prod)
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
struct kvm *kvm = irqfd->kvm;
int ret = 0;
kvm_arch_start_assignment(irqfd->kvm);
spin_lock_irq(&kvm->irqfds.lock);
irqfd->producer = prod;
if (!kvm->arch.nr_possible_bypass_irqs++)
kvm_x86_call(pi_start_bypass)(kvm);
if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
ret = kvm_pi_update_irte(irqfd, &irqfd->irq_entry);
if (ret) {
kvm->arch.nr_possible_bypass_irqs--;
kvm_arch_end_assignment(irqfd->kvm);
}
}
spin_unlock_irq(&kvm->irqfds.lock);
return ret;
}
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
struct irq_bypass_producer *prod)
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
struct kvm *kvm = irqfd->kvm;
int ret;
WARN_ON(irqfd->producer != prod);
/*
* If the producer of an IRQ that is currently being posted to a vCPU
* is unregistered, change the associated IRTE back to remapped mode as
* the IRQ has been released (or repurposed) by the device driver, i.e.
* KVM must relinquish control of the IRTE.
*/
spin_lock_irq(&kvm->irqfds.lock);
if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
ret = kvm_pi_update_irte(irqfd, NULL);
if (ret)
pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n",
irqfd->consumer.eventfd, ret);
}
irqfd->producer = NULL;
kvm->arch.nr_possible_bypass_irqs--;
spin_unlock_irq(&kvm->irqfds.lock);
kvm_arch_end_assignment(irqfd->kvm);
}
void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
struct kvm_kernel_irq_routing_entry *old,
struct kvm_kernel_irq_routing_entry *new)
{
if (new->type != KVM_IRQ_ROUTING_MSI &&
old->type != KVM_IRQ_ROUTING_MSI)
return;
if (old->type == KVM_IRQ_ROUTING_MSI &&
new->type == KVM_IRQ_ROUTING_MSI &&
!memcmp(&old->msi, &new->msi, sizeof(new->msi)))
return;
kvm_pi_update_irte(irqfd, new);
}
#ifdef CONFIG_KVM_IOAPIC
#define IOAPIC_ROUTING_ENTRY(irq) \
{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
.u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
#define PIC_ROUTING_ENTRY(irq) \
{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
.u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
#define ROUTING_ENTRY2(irq) \
IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
static const struct kvm_irq_routing_entry default_routing[] = {
ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
};
int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm)
{
return kvm_set_irq_routing(kvm, default_routing,
ARRAY_SIZE(default_routing), 0);
}
int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
struct kvm_pic *pic = kvm->arch.vpic;
int r;
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
memcpy(&chip->chip.pic, &pic->pics[0],
sizeof(struct kvm_pic_state));
break;
case KVM_IRQCHIP_PIC_SLAVE:
memcpy(&chip->chip.pic, &pic->pics[1],
sizeof(struct kvm_pic_state));
break;
case KVM_IRQCHIP_IOAPIC:
kvm_get_ioapic(kvm, &chip->chip.ioapic);
break;
default:
r = -EINVAL;
break;
}
return r;
}
int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
struct kvm_pic *pic = kvm->arch.vpic;
int r;
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
spin_lock(&pic->lock);
memcpy(&pic->pics[0], &chip->chip.pic,
sizeof(struct kvm_pic_state));
spin_unlock(&pic->lock);
break;
case KVM_IRQCHIP_PIC_SLAVE:
spin_lock(&pic->lock);
memcpy(&pic->pics[1], &chip->chip.pic,
sizeof(struct kvm_pic_state));
spin_unlock(&pic->lock);
break;
case KVM_IRQCHIP_IOAPIC:
kvm_set_ioapic(kvm, &chip->chip.ioapic);
break;
default:
r = -EINVAL;
break;
}
kvm_pic_update_irq(pic);
return r;
}
#endif


@ -18,6 +18,8 @@
#include <kvm/iodev.h>
#include "lapic.h"
#ifdef CONFIG_KVM_IOAPIC
#define PIC_NUM_PINS 16
#define SELECT_PIC(irq) \
((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
@ -63,6 +65,34 @@ int kvm_pic_init(struct kvm *kvm);
void kvm_pic_destroy(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
int irq_source_id, int level, bool line_status);
int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm);
int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
static inline int irqchip_full(struct kvm *kvm)
{
int mode = kvm->arch.irqchip_mode;
/* Matches smp_wmb() when setting irqchip_mode */
smp_rmb();
return mode == KVM_IRQCHIP_KERNEL;
}
#else /* CONFIG_KVM_IOAPIC */
static __always_inline int irqchip_full(struct kvm *kvm)
{
return false;
}
#endif
static inline int pic_in_kernel(struct kvm *kvm)
{
return irqchip_full(kvm);
}
static inline int irqchip_split(struct kvm *kvm)
{
@ -73,20 +103,6 @@ static inline int irqchip_split(struct kvm *kvm)
return mode == KVM_IRQCHIP_SPLIT;
}
static inline int irqchip_kernel(struct kvm *kvm)
{
int mode = kvm->arch.irqchip_mode;
/* Matches smp_wmb() when setting irqchip_mode */
smp_rmb();
return mode == KVM_IRQCHIP_KERNEL;
}
static inline int pic_in_kernel(struct kvm *kvm)
{
return irqchip_kernel(kvm);
}
static inline int irqchip_in_kernel(struct kvm *kvm)
{
int mode = kvm->arch.irqchip_mode;
@ -105,7 +121,6 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
int apic_has_pending_timer(struct kvm_vcpu *vcpu);
int kvm_setup_default_irq_routing(struct kvm *kvm);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
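
For context on the irqchip_full() vs. irqchip_split() helpers above, a hedged userspace sketch of how the two in-kernel irqchip modes are selected (standard KVM ioctls; the pin count is an arbitrary example):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/*
 * Full in-kernel irqchip: PIC, I/O APIC, and PIT all live in KVM.  With
 * CONFIG_KVM_IOAPIC=n this mode is compiled out, leaving only the split
 * and userspace irqchip models.
 */
static int create_full_irqchip(int vm_fd)
{
    return ioctl(vm_fd, KVM_CREATE_IRQCHIP, 0);     /* irqchip_full() */
}

/*
 * Split irqchip: the local APIC stays in KVM while the I/O APIC, PIC, and
 * PIT are emulated in userspace; args[0] reserves routing entries for the
 * userspace I/O APIC.
 */
static int create_split_irqchip(int vm_fd)
{
    struct kvm_enable_cap cap = {
        .cap = KVM_CAP_SPLIT_IRQCHIP,
        .args[0] = 24,
    };

    return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);      /* irqchip_split() */
}
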


@ -1,469 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* irq_comm.c: Common API for in kernel interrupt controller
* Copyright (c) 2007, Intel Corporation.
*
* Authors:
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
*
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/rculist.h>
#include <trace/events/kvm.h>
#include "irq.h"
#include "ioapic.h"
#include "lapic.h"
#include "hyperv.h"
#include "x86.h"
#include "xen.h"
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
bool line_status)
{
struct kvm_pic *pic = kvm->arch.vpic;
return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
}
static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
bool line_status)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
line_status);
}
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, struct dest_map *dest_map)
{
int r = -1;
struct kvm_vcpu *vcpu, *lowest = NULL;
unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
unsigned int dest_vcpus = 0;
if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
return r;
if (irq->dest_mode == APIC_DEST_PHYSICAL &&
irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
pr_info("apic: phys broadcast and lowest prio\n");
irq->delivery_mode = APIC_DM_FIXED;
}
memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!kvm_apic_present(vcpu))
continue;
if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
irq->dest_id, irq->dest_mode))
continue;
if (!kvm_lowest_prio_delivery(irq)) {
if (r < 0)
r = 0;
r += kvm_apic_set_irq(vcpu, irq, dest_map);
} else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
if (!kvm_vector_hashing_enabled()) {
if (!lowest)
lowest = vcpu;
else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
lowest = vcpu;
} else {
__set_bit(i, dest_vcpu_bitmap);
dest_vcpus++;
}
}
}
if (dest_vcpus != 0) {
int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
dest_vcpu_bitmap, KVM_MAX_VCPUS);
lowest = kvm_get_vcpu(kvm, idx);
}
if (lowest)
r = kvm_apic_set_irq(lowest, irq, dest_map);
return r;
}
void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq)
{
struct msi_msg msg = { .address_lo = e->msi.address_lo,
.address_hi = e->msi.address_hi,
.data = e->msi.data };
trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
(u64)msg.address_hi << 32 : 0), msg.data);
irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
irq->vector = msg.arch_data.vector;
irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
irq->trig_mode = msg.arch_data.is_level;
irq->delivery_mode = msg.arch_data.delivery_mode << 8;
irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
irq->level = 1;
irq->shorthand = APIC_DEST_NOSHORT;
}
EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
static inline bool kvm_msi_route_invalid(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e)
{
return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
}
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level, bool line_status)
{
struct kvm_lapic_irq irq;
if (kvm_msi_route_invalid(kvm, e))
return -EINVAL;
if (!level)
return -1;
kvm_set_msi_irq(kvm, e, &irq);
return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
}
#ifdef CONFIG_KVM_HYPERV
static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
bool line_status)
{
if (!level)
return -1;
return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
}
#endif
int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
bool line_status)
{
struct kvm_lapic_irq irq;
int r;
switch (e->type) {
#ifdef CONFIG_KVM_HYPERV
case KVM_IRQ_ROUTING_HV_SINT:
return kvm_hv_set_sint(e, kvm, irq_source_id, level,
line_status);
#endif
case KVM_IRQ_ROUTING_MSI:
if (kvm_msi_route_invalid(kvm, e))
return -EINVAL;
kvm_set_msi_irq(kvm, e, &irq);
if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
return r;
break;
#ifdef CONFIG_KVM_XEN
case KVM_IRQ_ROUTING_XEN_EVTCHN:
if (!level)
return -1;
return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
#endif
default:
break;
}
return -EWOULDBLOCK;
}
int kvm_request_irq_source_id(struct kvm *kvm)
{
unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
int irq_source_id;
mutex_lock(&kvm->irq_lock);
irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
if (irq_source_id >= BITS_PER_LONG) {
pr_warn("exhausted allocatable IRQ sources!\n");
irq_source_id = -EFAULT;
goto unlock;
}
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
set_bit(irq_source_id, bitmap);
unlock:
mutex_unlock(&kvm->irq_lock);
return irq_source_id;
}
void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
{
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
mutex_lock(&kvm->irq_lock);
if (irq_source_id < 0 ||
irq_source_id >= BITS_PER_LONG) {
pr_err("IRQ source ID out of range!\n");
goto unlock;
}
clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
if (!irqchip_kernel(kvm))
goto unlock;
kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
kvm_pic_clear_all(kvm->arch.vpic, irq_source_id);
unlock:
mutex_unlock(&kvm->irq_lock);
}
void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
struct kvm_irq_mask_notifier *kimn)
{
mutex_lock(&kvm->irq_lock);
kimn->irq = irq;
hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
mutex_unlock(&kvm->irq_lock);
}
void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
struct kvm_irq_mask_notifier *kimn)
{
mutex_lock(&kvm->irq_lock);
hlist_del_rcu(&kimn->link);
mutex_unlock(&kvm->irq_lock);
synchronize_srcu(&kvm->irq_srcu);
}
void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
bool mask)
{
struct kvm_irq_mask_notifier *kimn;
int idx, gsi;
idx = srcu_read_lock(&kvm->irq_srcu);
gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
if (gsi != -1)
hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link)
if (kimn->irq == gsi)
kimn->func(kimn, mask);
srcu_read_unlock(&kvm->irq_srcu, idx);
}
bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
{
return irqchip_in_kernel(kvm);
}
int kvm_set_routing_entry(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
/* We can't check irqchip_in_kernel() here as some callers are
* currently initializing the irqchip. Other callers should therefore
* check kvm_arch_can_set_irq_routing() before calling this function.
*/
switch (ue->type) {
case KVM_IRQ_ROUTING_IRQCHIP:
if (irqchip_split(kvm))
return -EINVAL;
e->irqchip.pin = ue->u.irqchip.pin;
switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_SLAVE:
e->irqchip.pin += PIC_NUM_PINS / 2;
fallthrough;
case KVM_IRQCHIP_PIC_MASTER:
if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
return -EINVAL;
e->set = kvm_set_pic_irq;
break;
case KVM_IRQCHIP_IOAPIC:
if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
return -EINVAL;
e->set = kvm_set_ioapic_irq;
break;
default:
return -EINVAL;
}
e->irqchip.irqchip = ue->u.irqchip.irqchip;
break;
case KVM_IRQ_ROUTING_MSI:
e->set = kvm_set_msi;
e->msi.address_lo = ue->u.msi.address_lo;
e->msi.address_hi = ue->u.msi.address_hi;
e->msi.data = ue->u.msi.data;
if (kvm_msi_route_invalid(kvm, e))
return -EINVAL;
break;
#ifdef CONFIG_KVM_HYPERV
case KVM_IRQ_ROUTING_HV_SINT:
e->set = kvm_hv_set_sint;
e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
e->hv_sint.sint = ue->u.hv_sint.sint;
break;
#endif
#ifdef CONFIG_KVM_XEN
case KVM_IRQ_ROUTING_XEN_EVTCHN:
return kvm_xen_setup_evtchn(kvm, e, ue);
#endif
default:
return -EINVAL;
}
return 0;
}
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu)
{
int r = 0;
unsigned long i;
struct kvm_vcpu *vcpu;
if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
return true;
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!kvm_apic_present(vcpu))
continue;
if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
irq->dest_id, irq->dest_mode))
continue;
if (++r == 2)
return false;
*dest_vcpu = vcpu;
}
return r == 1;
}
EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
#define IOAPIC_ROUTING_ENTRY(irq) \
{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
.u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
#define PIC_ROUTING_ENTRY(irq) \
{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
.u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
#define ROUTING_ENTRY2(irq) \
IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
static const struct kvm_irq_routing_entry default_routing[] = {
ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
};
int kvm_setup_default_irq_routing(struct kvm *kvm)
{
return kvm_set_irq_routing(kvm, default_routing,
ARRAY_SIZE(default_routing), 0);
}
void kvm_arch_post_irq_routing_update(struct kvm *kvm)
{
if (!irqchip_split(kvm))
return;
kvm_make_scan_ioapic_request(kvm);
}
void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
u8 vector, unsigned long *ioapic_handled_vectors)
{
/*
* Intercept EOI if the vCPU is the target of the new IRQ routing, or
* the vCPU has a pending IRQ from the old routing, i.e. if the vCPU
* may receive a level-triggered IRQ in the future, or already received
* level-triggered IRQ. The EOI needs to be intercepted and forwarded
* to I/O APIC emulation so that the IRQ can be de-asserted.
*/
if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) {
__set_bit(vector, ioapic_handled_vectors);
} else if (kvm_apic_pending_eoi(vcpu, vector)) {
__set_bit(vector, ioapic_handled_vectors);
/*
* Track the highest pending EOI for which the vCPU is NOT the
* target in the new routing. Only the EOI for the IRQ that is
* in-flight (for the old routing) needs to be intercepted, any
* future IRQs that arrive on this vCPU will be coincidental to
* the level-triggered routing and don't need to be intercepted.
*/
if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi)
vcpu->arch.highest_stale_pending_ioapic_eoi = vector;
}
}
void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
ulong *ioapic_handled_vectors)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_kernel_irq_routing_entry *entry;
struct kvm_irq_routing_table *table;
u32 i, nr_ioapic_pins;
int idx;
idx = srcu_read_lock(&kvm->irq_srcu);
table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
kvm->arch.nr_reserved_ioapic_pins);
for (i = 0; i < nr_ioapic_pins; ++i) {
hlist_for_each_entry(entry, &table->map[i], link) {
struct kvm_lapic_irq irq;
if (entry->type != KVM_IRQ_ROUTING_MSI)
continue;
kvm_set_msi_irq(vcpu->kvm, entry, &irq);
if (!irq.trig_mode)
continue;
kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode,
irq.vector, ioapic_handled_vectors);
}
}
srcu_read_unlock(&kvm->irq_srcu, idx);
}
void kvm_arch_irq_routing_update(struct kvm *kvm)
{
#ifdef CONFIG_KVM_HYPERV
kvm_hv_irq_routing_update(kvm);
#endif
}


@ -1455,7 +1455,7 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{
int trigger_mode;
int __maybe_unused trigger_mode;
/* Eoi the ioapic only if the ioapic doesn't own the vector. */
if (!kvm_ioapic_handles_vector(apic, vector))
@ -1476,12 +1476,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
return;
}
#ifdef CONFIG_KVM_IOAPIC
if (apic_test_vector(vector, apic->regs + APIC_TMR))
trigger_mode = IOAPIC_LEVEL_TRIG;
else
trigger_mode = IOAPIC_EDGE_TRIG;
kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
#endif
}
static int apic_set_eoi(struct kvm_lapic *apic)
@ -3146,8 +3148,11 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
#ifdef CONFIG_KVM_IOAPIC
if (ioapic_in_kernel(vcpu->kvm))
kvm_rtc_eoi_tracking_restore_one(vcpu);
#endif
vcpu->arch.apic_arb_prio = 0;


@ -18,6 +18,7 @@
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>
#include <linux/kvm_irqfd.h>
#include <asm/irq_remapping.h>
#include <asm/msr.h>
@ -29,36 +30,39 @@
#include "svm.h"
/*
* Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e the vCPU ID,
* into the GATag so that KVM can retrieve the correct vCPU from a GALog entry
* if an interrupt can't be delivered, e.g. because the vCPU isn't running.
* Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
* KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
* be delivered, e.g. because the vCPU isn't running. Use the vCPU's index
* instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
* lookup on the index, whereas vCPUs whose index doesn't match their ID need
* to walk the entire xarray of vCPUs in the worst case scenario.
*
* For the vCPU ID, use however many bits are currently allowed for the max
* For the vCPU index, use however many bits are currently allowed for the max
* guest physical APIC ID (limited by the size of the physical ID table), and
* use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the
* size of the GATag is defined by hardware (32 bits), but is an opaque value
* as far as hardware is concerned.
*/
#define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
#define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
#define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
#define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)
#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
#define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK)
#define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
((vcpu_id) & AVIC_VCPU_ID_MASK))
#define AVIC_GATAG(vm_id, vcpu_id) \
#define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
((vcpu_idx) & AVIC_VCPU_IDX_MASK))
#define AVIC_GATAG(vm_id, vcpu_idx) \
({ \
u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \
u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \
\
WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \
WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \
WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \
ga_tag; \
})
static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);
static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);
@ -75,14 +79,6 @@ static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
bool x2avic_enabled;
/*
* This is a wrapper of struct amd_iommu_ir_data.
*/
struct amd_svm_iommu_ir {
struct list_head node; /* Used by SVM for per-vcpu ir_list */
void *data; /* Storing pointer to struct amd_ir_data */
};
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
@ -147,16 +143,16 @@ int avic_ga_log_notifier(u32 ga_tag)
struct kvm_svm *kvm_svm;
struct kvm_vcpu *vcpu = NULL;
u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);
pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
trace_kvm_avic_ga_log(vm_id, vcpu_id);
pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
trace_kvm_avic_ga_log(vm_id, vcpu_idx);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
if (kvm_svm->avic_vm_id != vm_id)
continue;
vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
break;
}
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
@ -180,10 +176,8 @@ void avic_vm_destroy(struct kvm *kvm)
if (!enable_apicv)
return;
if (kvm_svm->avic_logical_id_table_page)
__free_page(kvm_svm->avic_logical_id_table_page);
if (kvm_svm->avic_physical_id_table_page)
__free_page(kvm_svm->avic_physical_id_table_page);
free_page((unsigned long)kvm_svm->avic_logical_id_table);
free_page((unsigned long)kvm_svm->avic_physical_id_table);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_del(&kvm_svm->hnode);
@ -196,27 +190,19 @@ int avic_vm_init(struct kvm *kvm)
int err = -ENOMEM;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
struct kvm_svm *k2;
struct page *p_page;
struct page *l_page;
u32 vm_id;
if (!enable_apicv)
return 0;
/* Allocating physical APIC ID table (4KB) */
p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!p_page)
kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!kvm_svm->avic_physical_id_table)
goto free_avic;
kvm_svm->avic_physical_id_table_page = p_page;
/* Allocating logical APIC ID table (4KB) */
l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!l_page)
kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!kvm_svm->avic_logical_id_table)
goto free_avic;
kvm_svm->avic_logical_id_table_page = l_page;
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
@ -242,17 +228,19 @@ free_avic:
return err;
}
static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
{
return __sme_set(__pa(svm->vcpu.arch.apic->regs));
}
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;
if (kvm_apicv_activated(svm->vcpu.kvm))
avic_activate_vmcb(svm);
@ -260,32 +248,31 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
avic_deactivate_vmcb(svm);
}
static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
unsigned int index)
{
u64 *avic_physical_id_table;
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
(index > X2AVIC_MAX_PHYSICAL_ID))
return NULL;
avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
return &avic_physical_id_table[index];
}
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
u64 *entry, new_entry;
int id = vcpu->vcpu_id;
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
struct vcpu_svm *svm = to_svm(vcpu);
u32 id = vcpu->vcpu_id;
u64 new_entry;
/*
* Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
* hardware. Immediately clear apicv_active, i.e. don't wait until the
* KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
* avic_vcpu_load() expects to be called if and only if the vCPU has
* fully initialized AVIC.
*/
if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
(id > X2AVIC_MAX_PHYSICAL_ID))
return -EINVAL;
(id > X2AVIC_MAX_PHYSICAL_ID)) {
kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
vcpu->arch.apic->apicv_active = false;
return 0;
}
if (!vcpu->arch.apic->regs)
BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
(X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);
if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
return -EINVAL;
if (kvm_apicv_activated(vcpu->kvm)) {
@ -302,19 +289,21 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return ret;
}
svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
/* Note, fls64() returns the bit position, +1. */
BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));
/* Setting AVIC backing page address in the phy APIC ID table */
entry = avic_get_physical_id_entry(vcpu, id);
if (!entry)
return -EINVAL;
new_entry = avic_get_backing_page_address(svm) |
AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
svm->avic_physical_id_entry = new_entry;
new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
WRITE_ONCE(*entry, new_entry);
svm->avic_physical_id_cache = entry;
/*
* Initialize the real table, as vCPUs must have a valid entry in order
* for broadcast IPIs to function correctly (broadcast IPIs ignore
* invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
*/
WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);
return 0;
}
@ -448,7 +437,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
if (apic_x2apic_mode(source))
avic_logical_id_table = NULL;
else
avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
avic_logical_id_table = kvm_svm->avic_logical_id_table;
/*
* AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
@ -550,7 +539,6 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
u32 *logical_apic_id_table;
u32 cluster, index;
ldr = GET_APIC_LOGICAL_ID(ldr);
@ -571,9 +559,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
return NULL;
index += (cluster << 2);
logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
return &logical_apic_id_table[index];
return &kvm_svm->avic_logical_id_table[index];
}
static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
@ -722,6 +708,9 @@ int avic_init_vcpu(struct vcpu_svm *svm)
int ret;
struct kvm_vcpu *vcpu = &svm->vcpu;
INIT_LIST_HEAD(&svm->ir_list);
spin_lock_init(&svm->ir_list_lock);
if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
return 0;
@ -729,8 +718,6 @@ int avic_init_vcpu(struct vcpu_svm *svm)
if (ret)
return ret;
INIT_LIST_HEAD(&svm->ir_list);
spin_lock_init(&svm->ir_list_lock);
svm->dfr_reg = APIC_DFR_FLAT;
return ret;
@ -742,316 +729,161 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
{
int ret = 0;
struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
unsigned long flags;
struct amd_svm_iommu_ir *ir;
struct vcpu_svm *svm = to_svm(vcpu);
if (!kvm_arch_has_assigned_device(vcpu->kvm))
return 0;
if (!vcpu)
return;
spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
list_del(&irqfd->vcpu_list);
spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
}
int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
unsigned int host_irq, uint32_t guest_irq,
struct kvm_vcpu *vcpu, u32 vector)
{
/*
* Here, we go through the per-vcpu ir_list to update all existing
* interrupt remapping table entry targeting this vcpu.
* If the IRQ was affined to a different vCPU, remove the IRTE metadata
* from the *previous* vCPU's list.
*/
spin_lock_irqsave(&svm->ir_list_lock, flags);
svm_ir_list_del(irqfd);
if (list_empty(&svm->ir_list))
goto out;
if (vcpu) {
/*
* Try to enable guest_mode in IRTE, unless AVIC is inhibited,
* in which case configure the IRTE for legacy mode, but track
* the IRTE metadata so that it can be converted to guest mode
* if AVIC is enabled/uninhibited in the future.
*/
struct amd_iommu_pi_data pi_data = {
.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
vcpu->vcpu_idx),
.is_guest_mode = kvm_vcpu_apicv_active(vcpu),
.vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
.vector = vector,
};
struct vcpu_svm *svm = to_svm(vcpu);
u64 entry;
int ret;
list_for_each_entry(ir, &svm->ir_list, node) {
if (activate)
ret = amd_iommu_activate_guest_mode(ir->data);
else
ret = amd_iommu_deactivate_guest_mode(ir->data);
/*
* Prevent the vCPU from being scheduled out or migrated until
* the IRTE is updated and its metadata has been added to the
* list of IRQs being posted to the vCPU, to ensure the IRTE
* isn't programmed with stale pCPU/IsRunning information.
*/
guard(spinlock_irqsave)(&svm->ir_list_lock);
/*
* Update the target pCPU for IOMMU doorbells if the vCPU is
* running. If the vCPU is NOT running, i.e. is blocking or
* scheduled out, KVM will update the pCPU info when the vCPU
* is awakened and/or scheduled in. See also avic_vcpu_load().
*/
entry = svm->avic_physical_id_entry;
if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
} else {
pi_data.cpu = -1;
pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
}
ret = irq_set_vcpu_affinity(host_irq, &pi_data);
if (ret)
break;
}
out:
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
return ret;
}
return ret;
static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
{
unsigned long flags;
struct amd_svm_iommu_ir *cur;
spin_lock_irqsave(&svm->ir_list_lock, flags);
list_for_each_entry(cur, &svm->ir_list, node) {
if (cur->data != pi->ir_data)
continue;
list_del(&cur->node);
kfree(cur);
break;
}
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
{
int ret = 0;
unsigned long flags;
struct amd_svm_iommu_ir *ir;
u64 entry;
if (WARN_ON_ONCE(!pi->ir_data))
return -EINVAL;
/**
* In some cases, the existing irte is updated and re-set,
* so we need to check here if it's already been * added
* to the ir_list.
*/
if (pi->prev_ga_tag) {
struct kvm *kvm = svm->vcpu.kvm;
u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
struct vcpu_svm *prev_svm;
if (!prev_vcpu) {
ret = -EINVAL;
goto out;
/*
* Revert to legacy mode if the IOMMU didn't provide metadata
* for the IRTE, which KVM needs to keep the IRTE up-to-date,
* e.g. if the vCPU is migrated or AVIC is disabled.
*/
if (WARN_ON_ONCE(!pi_data.ir_data)) {
irq_set_vcpu_affinity(host_irq, NULL);
return -EIO;
}
prev_svm = to_svm(prev_vcpu);
svm_ir_list_del(prev_svm, pi);
irqfd->irq_bypass_data = pi_data.ir_data;
list_add(&irqfd->vcpu_list, &svm->ir_list);
return 0;
}
return irq_set_vcpu_affinity(host_irq, NULL);
}
/**
* Allocating new amd_iommu_pi_data, which will get
* add to the per-vcpu ir_list.
enum avic_vcpu_action {
/*
* There is no need to differentiate between activate and deactivate,
* as KVM only refreshes AVIC state when the vCPU is scheduled in and
* isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
* being (de)activated.
*/
ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT);
if (!ir) {
ret = -ENOMEM;
goto out;
}
ir->data = pi->ir_data;
spin_lock_irqsave(&svm->ir_list_lock, flags);
AVIC_TOGGLE_ON_OFF = BIT(0),
AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF,
AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF,
/*
* Update the target pCPU for IOMMU doorbells if the vCPU is running.
* If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM
* will update the pCPU info when the vCPU is awakened and/or scheduled in.
* See also avic_vcpu_load().
* No unique action is required to deal with a vCPU that stops/starts
* running. A vCPU that starts running by definition stops blocking as
* well, and a vCPU that stops running can't have been blocking, i.e.
* doesn't need to toggle GALogIntr.
*/
entry = READ_ONCE(*(svm->avic_physical_id_cache));
if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
true, pi->ir_data);
AVIC_START_RUNNING = 0,
AVIC_STOP_RUNNING = 0,
list_add(&ir->node, &svm->ir_list);
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
out:
return ret;
}
/*
* When a vCPU starts blocking, KVM needs to set the GALogIntr flag
* in all associated IRTEs so that KVM can wake the vCPU if an IRQ is
* sent to the vCPU.
*/
AVIC_START_BLOCKING = BIT(1),
};
/*
* Note:
* The HW cannot support posting multicast/broadcast
* interrupts to a vCPU. So, we still use legacy interrupt
* remapping for these kind of interrupts.
*
* For lowest-priority interrupts, we only support
* those with single CPU as the destination, e.g. user
* configures the interrupts via /proc/irq or uses
* irqbalance to make the interrupts single-CPU.
*/
static int
get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
enum avic_vcpu_action action)
{
struct kvm_lapic_irq irq;
struct kvm_vcpu *vcpu = NULL;
kvm_set_msi_irq(kvm, e, &irq);
if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
!kvm_irq_is_postable(&irq)) {
pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
__func__, irq.vector);
return -1;
}
pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
irq.vector);
*svm = to_svm(vcpu);
vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
vcpu_info->vector = irq.vector;
return 0;
}
/*
* avic_pi_update_irte - set IRTE for Posted-Interrupts
*
* @kvm: kvm
* @host_irq: host irq of the interrupt
* @guest_irq: gsi of the interrupt
* @set: set or unset PI
* returns 0 on success, < 0 on failure
*/
int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
bool enable_remapped_mode = true;
int idx, ret = 0;
if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
return 0;
pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
__func__, host_irq, guest_irq, set);
idx = srcu_read_lock(&kvm->irq_srcu);
irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
if (guest_irq >= irq_rt->nr_rt_entries ||
hlist_empty(&irq_rt->map[guest_irq])) {
pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
guest_irq, irq_rt->nr_rt_entries);
goto out;
}
hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
struct vcpu_data vcpu_info;
struct vcpu_svm *svm = NULL;
if (e->type != KVM_IRQ_ROUTING_MSI)
continue;
/**
* Here, we setup with legacy mode in the following cases:
* 1. When cannot target interrupt to a specific vcpu.
* 2. Unsetting posted interrupt.
* 3. APIC virtualization is disabled for the vcpu.
* 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
*/
if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
kvm_vcpu_apicv_active(&svm->vcpu)) {
struct amd_iommu_pi_data pi;
enable_remapped_mode = false;
/* Try to enable guest_mode in IRTE */
pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
AVIC_HPA_MASK);
pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
svm->vcpu.vcpu_id);
pi.is_guest_mode = true;
pi.vcpu_data = &vcpu_info;
ret = irq_set_vcpu_affinity(host_irq, &pi);
/**
* Here, we successfully setting up vcpu affinity in
* IOMMU guest mode. Now, we need to store the posted
* interrupt information in a per-vcpu ir_list so that
* we can reference to them directly when we update vcpu
* scheduling information in IOMMU irte.
*/
if (!ret && pi.is_guest_mode)
svm_ir_list_add(svm, &pi);
}
if (!ret && svm) {
trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
e->gsi, vcpu_info.vector,
vcpu_info.pi_desc_addr, set);
}
if (ret < 0) {
pr_err("%s: failed to update PI IRTE\n", __func__);
goto out;
}
}
ret = 0;
if (enable_remapped_mode) {
/* Use legacy mode in IRTE */
struct amd_iommu_pi_data pi;
/**
* Here, pi is used to:
* - Tell IOMMU to use legacy mode for this interrupt.
* - Retrieve ga_tag of prior interrupt remapping data.
*/
pi.prev_ga_tag = 0;
pi.is_guest_mode = false;
ret = irq_set_vcpu_affinity(host_irq, &pi);
/**
* Check if the posted interrupt was previously
* setup with the guest_mode by checking if the ga_tag
* was cached. If so, we need to clean up the per-vcpu
* ir_list.
*/
if (!ret && pi.prev_ga_tag) {
int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
struct kvm_vcpu *vcpu;
vcpu = kvm_get_vcpu_by_id(kvm, id);
if (vcpu)
svm_ir_list_del(to_svm(vcpu), &pi);
}
}
out:
srcu_read_unlock(&kvm->irq_srcu, idx);
return ret;
}
static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
int ret = 0;
struct amd_svm_iommu_ir *ir;
bool ga_log_intr = (action & AVIC_START_BLOCKING);
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_kernel_irqfd *irqfd;
lockdep_assert_held(&svm->ir_list_lock);
if (!kvm_arch_has_assigned_device(vcpu->kvm))
return 0;
/*
* Here, we go through the per-vcpu ir_list to update all existing
* interrupt remapping table entry targeting this vcpu.
*/
if (list_empty(&svm->ir_list))
return 0;
return;
list_for_each_entry(ir, &svm->ir_list, node) {
ret = amd_iommu_update_ga(cpu, r, ir->data);
if (ret)
return ret;
list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
void *data = irqfd->irq_bypass_data;
if (!(action & AVIC_TOGGLE_ON_OFF))
WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
else if (cpu >= 0)
WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
else
WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
}
return 0;
}
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
enum avic_vcpu_action action)
{
u64 entry;
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;
u64 entry;
lockdep_assert_preemption_disabled();
if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
/*
* No need to update anything if the vCPU is blocking, i.e. if the vCPU
* is being scheduled in after being preempted. The CPU entries in the
* Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
* If the vCPU was migrated, its new CPU value will be stuffed when the
* vCPU unblocks.
*/
if (kvm_vcpu_is_blocking(vcpu))
if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
return;
/*
@ -1063,38 +895,57 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*/
spin_lock_irqsave(&svm->ir_list_lock, flags);
entry = READ_ONCE(*(svm->avic_physical_id_cache));
entry = svm->avic_physical_id_entry;
WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
svm->avic_physical_id_entry = entry;
/*
* If IPI virtualization is disabled, clear IsRunning when updating the
* actual Physical ID table, so that the CPU never sees IsRunning=1.
* Keep the APIC ID up-to-date in the entry to minimize the chances of
* things going sideways if hardware peeks at the ID.
*/
if (!enable_ipiv)
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
void avic_vcpu_put(struct kvm_vcpu *vcpu)
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
u64 entry;
/*
* No need to update anything if the vCPU is blocking, i.e. if the vCPU
* is being scheduled in after being preempted. The CPU entries in the
* Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
* If the vCPU was migrated, its new CPU value will be stuffed when the
* vCPU unblocks.
*/
if (kvm_vcpu_is_blocking(vcpu))
return;
__avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
}
static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
{
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;
u64 entry = svm->avic_physical_id_entry;
lockdep_assert_preemption_disabled();
/*
* Note, reading the Physical ID entry outside of ir_list_lock is safe
* as only the pCPU that has loaded (or is loading) the vCPU is allowed
* to modify the entry, and preemption is disabled. I.e. the vCPU
* can't be scheduled out and thus avic_vcpu_{put,load}() can't run
* recursively.
*/
entry = READ_ONCE(*(svm->avic_physical_id_cache));
/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
return;
/*
@ -1107,13 +958,62 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
*/
spin_lock_irqsave(&svm->ir_list_lock, flags);
avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
avic_update_iommu_vcpu_affinity(vcpu, -1, action);
WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
/*
* Keep the previous APIC ID in the entry so that a rogue doorbell from
* hardware is at least restricted to a CPU associated with the vCPU.
*/
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
if (enable_ipiv)
WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
/*
* Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
* it's a synthetic flag that usurps an unused should-be-zero bit.
*/
if (action & AVIC_START_BLOCKING)
entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
svm->avic_physical_id_entry = entry;
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
/*
* Note, reading the Physical ID entry outside of ir_list_lock is safe
* as only the pCPU that has loaded (or is loading) the vCPU is allowed
* to modify the entry, and preemption is disabled. I.e. the vCPU
* can't be scheduled out and thus avic_vcpu_{put,load}() can't run
* recursively.
*/
u64 entry = to_svm(vcpu)->avic_physical_id_entry;
/*
* Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
* vCPU is preempted while it's in the process of blocking. WARN if the
* vCPU wasn't running and isn't blocking, as KVM shouldn't attempt to put
* the AVIC if it wasn't previously loaded.
*/
if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
return;
/*
* The vCPU was preempted while blocking, ensure its IRTEs are
* configured to generate GA Log Interrupts.
*/
if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
return;
}
__avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
AVIC_STOP_RUNNING);
}
void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
@ -1142,19 +1042,18 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
bool activated = kvm_vcpu_apicv_active(vcpu);
if (!enable_apicv)
return;
/* APICv should only be toggled on/off while the vCPU is running. */
WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));
avic_refresh_virtual_apic_mode(vcpu);
if (activated)
avic_vcpu_load(vcpu, vcpu->cpu);
if (kvm_vcpu_apicv_active(vcpu))
__avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
else
avic_vcpu_put(vcpu);
avic_set_pi_irte_mode(vcpu, activated);
__avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
}
void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
@ -1162,20 +1061,25 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
if (!kvm_vcpu_apicv_active(vcpu))
return;
/*
* Unload the AVIC when the vCPU is about to block, _before_
* the vCPU actually blocks.
*
* Any IRQs that arrive before IsRunning=0 will not cause an
* incomplete IPI vmexit on the source, therefore vIRR will also
* be checked by kvm_vcpu_check_block() before blocking. The
* memory barrier implicit in set_current_state orders writing
* IsRunning=0 before reading the vIRR. The processor needs a
* matching memory barrier on interrupt delivery between writing
* IRR and reading IsRunning; the lack of this barrier might be
* the cause of errata #1235).
*/
avic_vcpu_put(vcpu);
/*
* Unload the AVIC when the vCPU is about to block, _before_ the vCPU
* actually blocks.
*
* Note, any IRQs that arrive before IsRunning=0 will not cause an
* incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
* this by checking vIRR one last time before blocking. The memory
* barrier implicit in set_current_state orders writing IsRunning=0
* before reading the vIRR. The processor needs a matching memory
* barrier on interrupt delivery between writing IRR and reading
* IsRunning (the lack of this barrier might be the cause of erratum #1235).
*
* Set IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM
* doesn't need to detect events for scheduling purposes. The doorbell
* used to signal running vCPUs cannot be blocked, i.e. will perturb the
* CPU and cause noisy neighbor problems if the VM is sending interrupts
* to the vCPU while it's scheduled out.
*/
__avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
}
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
@ -1228,6 +1132,14 @@ bool avic_hardware_setup(void)
if (x2avic_enabled)
pr_info("x2AVIC enabled\n");
/*
* Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
* due to erratum 1235, which results in missed VM-Exits on the sender
* and thus missed wake events for blocking vCPUs due to the CPU
* failing to see a software update to clear IsRunning.
*/
enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
return true;


@ -232,6 +232,7 @@ module_param(tsc_scaling, int, 0444);
*/
static bool avic;
module_param(avic, bool, 0444);
module_param(enable_ipiv, bool, 0444);
module_param(enable_device_posted_irqs, bool, 0444);
@ -1490,6 +1491,8 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
WARN_ON_ONCE(!list_empty(&svm->ir_list));
svm_leave_nested(vcpu);
svm_free_nested(svm);
@ -5581,6 +5584,7 @@ static __init int svm_hardware_setup(void)
enable_apicv = avic = avic && avic_hardware_setup();
if (!enable_apicv) {
enable_ipiv = false;
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;


@ -123,8 +123,8 @@ struct kvm_svm {
/* Struct members for AVIC */
u32 avic_vm_id;
struct page *avic_logical_id_table_page;
struct page *avic_physical_id_table_page;
u32 *avic_logical_id_table;
u64 *avic_physical_id_table;
struct hlist_node hnode;
struct kvm_sev_info sev_info;
@ -306,14 +306,22 @@ struct vcpu_svm {
u32 ldr_reg;
u32 dfr_reg;
struct page *avic_backing_page;
u64 *avic_physical_id_cache;
/* This is essentially a shadow of the vCPU's actual entry in the
* Physical ID table that is programmed into the VMCB, i.e. that is
* seen by the CPU. If IPI virtualization is disabled, IsRunning is
* only ever set in the shadow, i.e. is never propagated to the "real"
* table, so that hardware never sees IsRunning=1.
*/
u64 avic_physical_id_entry;
/*
* Per-vcpu list of struct amd_svm_iommu_ir:
* This is used mainly to store interrupt remapping information used
* when update the vcpu affinity. This avoids the need to scan for
* IRTE and try to match ga_tag in the IOMMU driver.
* Per-vCPU list of irqfds that are eligible to post IRQs directly to
* the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass). The list
* is used to reconfigure IRTEs when the vCPU is loaded/put (to set the
* target pCPU), when AVIC is toggled on/off (to (de)activate bypass),
* and if the irqfd becomes ineligible for posting (to put the IRTE
* back into remapped mode).
*/
struct list_head ir_list;
spinlock_t ir_list_lock;
@ -721,7 +729,8 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \
BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \
BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) | \
BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG) \
)
bool avic_hardware_setup(void);
@ -736,8 +745,9 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void avic_vcpu_put(struct kvm_vcpu *vcpu);
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
unsigned int host_irq, uint32_t guest_irq,
struct kvm_vcpu *vcpu, u32 vector);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_ring_doorbell(struct kvm_vcpu *vcpu);


@ -260,6 +260,86 @@ TRACE_EVENT(kvm_cpuid,
__entry->used_max_basic ? ", used max basic" : "")
);
#define kvm_deliver_mode \
{0x0, "Fixed"}, \
{0x1, "LowPrio"}, \
{0x2, "SMI"}, \
{0x3, "Res3"}, \
{0x4, "NMI"}, \
{0x5, "INIT"}, \
{0x6, "SIPI"}, \
{0x7, "ExtINT"}
#ifdef CONFIG_KVM_IOAPIC
TRACE_EVENT(kvm_ioapic_set_irq,
TP_PROTO(__u64 e, int pin, bool coalesced),
TP_ARGS(e, pin, coalesced),
TP_STRUCT__entry(
__field( __u64, e )
__field( int, pin )
__field( bool, coalesced )
),
TP_fast_assign(
__entry->e = e;
__entry->pin = pin;
__entry->coalesced = coalesced;
),
TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
__entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
__print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
(__entry->e & (1<<11)) ? "logical" : "physical",
(__entry->e & (1<<15)) ? "level" : "edge",
(__entry->e & (1<<16)) ? "|masked" : "",
__entry->coalesced ? " (coalesced)" : "")
);
TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
TP_PROTO(__u64 e),
TP_ARGS(e),
TP_STRUCT__entry(
__field( __u64, e )
),
TP_fast_assign(
__entry->e = e;
),
TP_printk("dst %x vec %u (%s|%s|%s%s)",
(u8)(__entry->e >> 56), (u8)__entry->e,
__print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
(__entry->e & (1<<11)) ? "logical" : "physical",
(__entry->e & (1<<15)) ? "level" : "edge",
(__entry->e & (1<<16)) ? "|masked" : "")
);
#endif
TRACE_EVENT(kvm_msi_set_irq,
TP_PROTO(__u64 address, __u64 data),
TP_ARGS(address, data),
TP_STRUCT__entry(
__field( __u64, address )
__field( __u64, data )
),
TP_fast_assign(
__entry->address = address;
__entry->data = data;
),
TP_printk("dst %llx vec %u (%s|%s|%s%s)",
(u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
(u8)__entry->data,
__print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
(__entry->address & (1<<2)) ? "logical" : "physical",
(__entry->data & (1<<15)) ? "level" : "edge",
(__entry->address & (1<<3)) ? "|rh" : "")
);
#define AREG(x) { APIC_##x, "APIC_" #x }
#define kvm_trace_symbol_apic \
@ -1096,37 +1176,32 @@ TRACE_EVENT(kvm_smm_transition,
* Tracepoint for VT-d posted-interrupts and AMD-Vi Guest Virtual APIC.
*/
TRACE_EVENT(kvm_pi_irte_update,
TP_PROTO(unsigned int host_irq, unsigned int vcpu_id,
unsigned int gsi, unsigned int gvec,
u64 pi_desc_addr, bool set),
TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set),
TP_PROTO(unsigned int host_irq, struct kvm_vcpu *vcpu,
unsigned int gsi, unsigned int gvec, bool set),
TP_ARGS(host_irq, vcpu, gsi, gvec, set),
TP_STRUCT__entry(
__field( unsigned int, host_irq )
__field( unsigned int, vcpu_id )
__field( int, vcpu_id )
__field( unsigned int, gsi )
__field( unsigned int, gvec )
__field( u64, pi_desc_addr )
__field( bool, set )
),
TP_fast_assign(
__entry->host_irq = host_irq;
__entry->vcpu_id = vcpu_id;
__entry->vcpu_id = vcpu ? vcpu->vcpu_id : -1;
__entry->gsi = gsi;
__entry->gvec = gvec;
__entry->pi_desc_addr = pi_desc_addr;
__entry->set = set;
),
TP_printk("PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
"gvec: 0x%x, pi_desc_addr: 0x%llx",
TP_printk("PI is %s for irq %u, vcpu %d, gsi: 0x%x, gvec: 0x%x",
__entry->set ? "enabled and being updated" : "disabled",
__entry->host_irq,
__entry->vcpu_id,
__entry->gsi,
__entry->gvec,
__entry->pi_desc_addr)
__entry->gvec)
);
/*


@ -15,7 +15,6 @@ extern bool __read_mostly enable_ept;
extern bool __read_mostly enable_unrestricted_guest;
extern bool __read_mostly enable_ept_ad_bits;
extern bool __read_mostly enable_pml;
extern bool __read_mostly enable_ipiv;
extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0


@ -1014,7 +1014,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.nested_ops = &vmx_nested_ops,
.pi_update_irte = vmx_pi_update_irte,
.pi_start_assignment = vmx_pi_start_assignment,
.pi_start_bypass = vmx_pi_start_bypass,
#ifdef CONFIG_X86_64
.set_hv_timer = vt_op(set_hv_timer),


@ -2,6 +2,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/kvm_irqfd.h>
#include <asm/irq_remapping.h>
#include <asm/cpu.h>
@ -72,13 +73,10 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
/*
* If the vCPU wasn't on the wakeup list and wasn't migrated, then the
* full update can be skipped as neither the vector nor the destination
* needs to be changed.
* needs to be changed. Clear SN even if there is no assigned device,
* again for simplicity.
*/
if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
/*
* Clear SN if it was set due to being preempted. Again, do
* this even if there is no assigned device for simplicity.
*/
if (pi_test_and_clear_sn(pi_desc))
goto after_clear_sn;
return;
@ -148,8 +146,13 @@ after_clear_sn:
static bool vmx_can_use_vtd_pi(struct kvm *kvm)
{
/*
* Note, reading the number of possible bypass IRQs can race with a
* bypass IRQ being attached to the VM. vmx_pi_start_bypass() ensures
* blocking vCPUs will see an elevated count or get KVM_REQ_UNBLOCK.
*/
return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() &&
kvm_arch_has_assigned_device(kvm);
READ_ONCE(kvm->arch.nr_possible_bypass_irqs);
}
/*
@ -224,17 +227,23 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
if (!vmx_needs_pi_wakeup(vcpu))
return;
if (kvm_vcpu_is_blocking(vcpu) &&
/*
* If the vCPU is blocking with IRQs enabled and ISN'T being preempted,
* enable the wakeup handler so that the notification IRQ wakes the vCPU as
* expected. There is no need to enable the wakeup handler if the vCPU
* is preempted between setting its wait state and manually scheduling
* out, as the task is still runnable, i.e. doesn't need a wake event
* from KVM to be scheduled in.
*
* If the wakeup handler isn't being enabled, Suppress Notifications as
* the cost of propagating PIR.IRR to PID.ON is negligible compared to
* the cost of a spurious IRQ, and vCPU put/load is a slow path.
*/
if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) &&
((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) ||
(!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu))))
pi_enable_wakeup_handler(vcpu);
/*
* Set SN when the vCPU is preempted. Note, the vCPU can both be seen
* as blocking and preempted, e.g. if it's preempted between setting
* its wait state and manually scheduling out.
*/
if (vcpu->preempted)
else
pi_set_sn(pi_desc);
}
@ -281,99 +290,30 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
/*
* Bail out of the block loop if the VM has an assigned
* device, but the blocking vCPU didn't reconfigure the
* PI.NV to the wakeup vector, i.e. the assigned device
* came along after the initial check in vmx_vcpu_pi_put().
* Kick all vCPUs when the first possible bypass IRQ is attached to a VM, as
* blocking vCPUs may be scheduled out without reconfiguring PID.NV to the wakeup
* vector, i.e. if the bypass IRQ came along after vmx_vcpu_pi_put().
*/
void vmx_pi_start_assignment(struct kvm *kvm)
void vmx_pi_start_bypass(struct kvm *kvm)
{
if (!kvm_arch_has_irq_bypass())
if (WARN_ON_ONCE(!vmx_can_use_vtd_pi(kvm)))
return;
kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
}
/*
* vmx_pi_update_irte - set IRTE for Posted-Interrupts
*
* @kvm: kvm
* @host_irq: host irq of the interrupt
* @guest_irq: gsi of the interrupt
* @set: set or unset PI
* returns 0 on success, < 0 on failure
*/
int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
unsigned int host_irq, uint32_t guest_irq,
struct kvm_vcpu *vcpu, u32 vector)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
bool enable_remapped_mode = true;
struct kvm_lapic_irq irq;
struct kvm_vcpu *vcpu;
struct vcpu_data vcpu_info;
int idx, ret = 0;
if (vcpu) {
struct intel_iommu_pi_data pi_data = {
.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)),
.vector = vector,
};
if (!vmx_can_use_vtd_pi(kvm))
return 0;
idx = srcu_read_lock(&kvm->irq_srcu);
irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
if (guest_irq >= irq_rt->nr_rt_entries ||
hlist_empty(&irq_rt->map[guest_irq])) {
pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
guest_irq, irq_rt->nr_rt_entries);
goto out;
return irq_set_vcpu_affinity(host_irq, &pi_data);
} else {
return irq_set_vcpu_affinity(host_irq, NULL);
}
hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
if (e->type != KVM_IRQ_ROUTING_MSI)
continue;
/*
* VT-d PI cannot support posting multicast/broadcast
* interrupts to a vCPU, we still use interrupt remapping
* for these kind of interrupts.
*
* For lowest-priority interrupts, we only support
* those with single CPU as the destination, e.g. user
* configures the interrupts via /proc/irq or uses
* irqbalance to make the interrupts single-CPU.
*
* We will support full lowest-priority interrupt later.
*
* In addition, we can only inject generic interrupts using
* the PI mechanism, refuse to route others through it.
*/
kvm_set_msi_irq(kvm, e, &irq);
if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
!kvm_irq_is_postable(&irq))
continue;
vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
vcpu_info.vector = irq.vector;
trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
vcpu_info.vector, vcpu_info.pi_desc_addr, set);
if (!set)
continue;
enable_remapped_mode = false;
ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
if (ret < 0) {
printk(KERN_INFO "%s: failed to update PI IRTE\n",
__func__);
goto out;
}
}
if (enable_remapped_mode)
ret = irq_set_vcpu_affinity(host_irq, NULL);
ret = 0;
out:
srcu_read_unlock(&kvm->irq_srcu, idx);
return ret;
}


@ -3,6 +3,9 @@
#define __KVM_X86_VMX_POSTED_INTR_H
#include <linux/bitmap.h>
#include <linux/find.h>
#include <linux/kvm_host.h>
#include <asm/posted_intr.h>
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
@ -11,9 +14,10 @@ void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void vmx_pi_start_assignment(struct kvm *kvm);
int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
unsigned int host_irq, uint32_t guest_irq,
struct kvm_vcpu *vcpu, u32 vector);
void vmx_pi_start_bypass(struct kvm *kvm);
static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
{


@ -113,8 +113,6 @@ static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, 0444);
module_param(enable_apicv, bool, 0444);
bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);
module_param(enable_device_posted_irqs, bool, 0444);


@ -226,6 +226,9 @@ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
bool __read_mostly enable_apicv = true;
EXPORT_SYMBOL_GPL(enable_apicv);
bool __read_mostly enable_ipiv = true;
EXPORT_SYMBOL_GPL(enable_ipiv);
bool __read_mostly enable_device_posted_irqs = true;
EXPORT_SYMBOL_GPL(enable_device_posted_irqs);
@ -4634,17 +4637,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_EXT_CPUID:
case KVM_CAP_EXT_EMUL_CPUID:
case KVM_CAP_CLOCKSOURCE:
#ifdef CONFIG_KVM_IOAPIC
case KVM_CAP_PIT:
case KVM_CAP_PIT2:
case KVM_CAP_PIT_STATE2:
case KVM_CAP_REINJECT_CONTROL:
#endif
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
case KVM_CAP_USER_NMI:
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_IOEVENTFD:
case KVM_CAP_IOEVENTFD_NO_LENGTH:
case KVM_CAP_PIT2:
case KVM_CAP_PIT_STATE2:
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
case KVM_CAP_VCPU_EVENTS:
#ifdef CONFIG_KVM_HYPERV
@ -6401,135 +6407,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return 0;
}
static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
struct kvm_pic *pic = kvm->arch.vpic;
int r;
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
memcpy(&chip->chip.pic, &pic->pics[0],
sizeof(struct kvm_pic_state));
break;
case KVM_IRQCHIP_PIC_SLAVE:
memcpy(&chip->chip.pic, &pic->pics[1],
sizeof(struct kvm_pic_state));
break;
case KVM_IRQCHIP_IOAPIC:
kvm_get_ioapic(kvm, &chip->chip.ioapic);
break;
default:
r = -EINVAL;
break;
}
return r;
}
static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
struct kvm_pic *pic = kvm->arch.vpic;
int r;
r = 0;
switch (chip->chip_id) {
case KVM_IRQCHIP_PIC_MASTER:
spin_lock(&pic->lock);
memcpy(&pic->pics[0], &chip->chip.pic,
sizeof(struct kvm_pic_state));
spin_unlock(&pic->lock);
break;
case KVM_IRQCHIP_PIC_SLAVE:
spin_lock(&pic->lock);
memcpy(&pic->pics[1], &chip->chip.pic,
sizeof(struct kvm_pic_state));
spin_unlock(&pic->lock);
break;
case KVM_IRQCHIP_IOAPIC:
kvm_set_ioapic(kvm, &chip->chip.ioapic);
break;
default:
r = -EINVAL;
break;
}
kvm_pic_update_irq(pic);
return r;
}
static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
mutex_lock(&kps->lock);
memcpy(ps, &kps->channels, sizeof(*ps));
mutex_unlock(&kps->lock);
return 0;
}
static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
int i;
struct kvm_pit *pit = kvm->arch.vpit;
mutex_lock(&pit->pit_state.lock);
memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
for (i = 0; i < 3; i++)
kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
mutex_unlock(&pit->pit_state.lock);
return 0;
}
static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
sizeof(ps->channels));
ps->flags = kvm->arch.vpit->pit_state.flags;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
memset(&ps->reserved, 0, sizeof(ps->reserved));
return 0;
}
static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
int start = 0;
int i;
u32 prev_legacy, cur_legacy;
struct kvm_pit *pit = kvm->arch.vpit;
mutex_lock(&pit->pit_state.lock);
prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
if (!prev_legacy && cur_legacy)
start = 1;
memcpy(&pit->pit_state.channels, &ps->channels,
sizeof(pit->pit_state.channels));
pit->pit_state.flags = ps->flags;
for (i = 0; i < 3; i++)
kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
start && i == 0);
mutex_unlock(&pit->pit_state.lock);
return 0;
}
static int kvm_vm_ioctl_reinject(struct kvm *kvm,
struct kvm_reinject_control *control)
{
struct kvm_pit *pit = kvm->arch.vpit;
/* pit->pit_state.lock was overloaded to prevent userspace from getting
* an inconsistent state after running multiple KVM_REINJECT_CONTROL
* ioctls in parallel. Use a separate lock if that ioctl isn't rare.
*/
mutex_lock(&pit->pit_state.lock);
kvm_pit_set_reinject(pit, control->pit_reinject);
mutex_unlock(&pit->pit_state.lock);
return 0;
}
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
@ -6549,18 +6426,6 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
kvm_vcpu_kick(vcpu);
}
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
bool line_status)
{
if (!irqchip_in_kernel(kvm))
return -ENXIO;
irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event->irq, irq_event->level,
line_status);
return 0;
}
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
@ -7072,9 +6937,11 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r = -ENOTTY;
#ifdef CONFIG_KVM_IOAPIC
/*
* This union makes it completely explicit to gcc-3.x
* that these two variables' stack usage should be
* that these three variables' stack usage should be
* combined, not added together.
*/
union {
@ -7082,6 +6949,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
struct kvm_pit_state2 ps2;
struct kvm_pit_config pit_config;
} u;
#endif
switch (ioctl) {
case KVM_SET_TSS_ADDR:
@ -7105,6 +6973,7 @@ set_identity_unlock:
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
break;
#ifdef CONFIG_KVM_IOAPIC
case KVM_CREATE_IRQCHIP: {
mutex_lock(&kvm->lock);
@ -7126,7 +6995,7 @@ set_identity_unlock:
goto create_irqchip_unlock;
}
r = kvm_setup_default_irq_routing(kvm);
r = kvm_setup_default_ioapic_and_pic_routing(kvm);
if (r) {
kvm_ioapic_destroy(kvm);
kvm_pic_destroy(kvm);
@ -7174,7 +7043,7 @@ set_identity_unlock:
}
r = -ENXIO;
if (!irqchip_kernel(kvm))
if (!irqchip_full(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
@ -7198,7 +7067,7 @@ set_identity_unlock:
}
r = -ENXIO;
if (!irqchip_kernel(kvm))
if (!irqchip_full(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
set_irqchip_out:
@ -7271,6 +7140,7 @@ set_pit2_out:
r = kvm_vm_ioctl_reinject(kvm, &control);
break;
}
#endif
case KVM_SET_BOOT_CPU_ID:
r = 0;
mutex_lock(&kvm->lock);
@ -10730,8 +10600,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
#ifdef CONFIG_KVM_IOAPIC
else if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
#endif
if (is_guest_mode(vcpu))
vcpu->arch.load_eoi_exitmap_pending = true;
@ -12801,15 +12673,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
goto out_uninit_mmu;
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
/* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
&kvm->arch.irq_sources_bitmap);
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
@ -12940,7 +12805,9 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
#ifdef CONFIG_KVM_IOAPIC
kvm_free_pit(kvm);
#endif
kvm_mmu_pre_destroy_vm(kvm);
static_call_cond(kvm_x86_vm_pre_destroy)(kvm);
@ -12964,8 +12831,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
}
kvm_destroy_vcpus(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
#ifdef CONFIG_KVM_IOAPIC
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
#endif
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
@ -13577,8 +13446,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
void kvm_arch_start_assignment(struct kvm *kvm)
{
if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
kvm_x86_call(pi_start_assignment)(kvm);
atomic_inc(&kvm->arch.assigned_device_count);
}
EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
@ -13629,77 +13497,6 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
struct irq_bypass_producer *prod)
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
struct kvm *kvm = irqfd->kvm;
int ret;
kvm_arch_start_assignment(irqfd->kvm);
spin_lock_irq(&kvm->irqfds.lock);
irqfd->producer = prod;
ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
prod->irq, irqfd->gsi, 1);
if (ret)
kvm_arch_end_assignment(irqfd->kvm);
spin_unlock_irq(&kvm->irqfds.lock);
return ret;
}
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
struct irq_bypass_producer *prod)
{
int ret;
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
struct kvm *kvm = irqfd->kvm;
WARN_ON(irqfd->producer != prod);
/*
* When producer of consumer is unregistered, we change back to
* remapped mode, so we can re-use the current implementation
* when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
spin_lock_irq(&kvm->irqfds.lock);
irqfd->producer = NULL;
ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
spin_unlock_irq(&kvm->irqfds.lock);
kvm_arch_end_assignment(irqfd->kvm);
}
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set);
}
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
struct kvm_kernel_irq_routing_entry *new)
{
if (old->type != KVM_IRQ_ROUTING_MSI ||
new->type != KVM_IRQ_ROUTING_MSI)
return true;
return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
}
bool kvm_vector_hashing_enabled(void)
{
return vector_hashing;
@ -14099,7 +13896,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);


@ -368,6 +368,14 @@ static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh,
container_of(polltbl, struct mshv_irqfd, irqfd_polltbl);
irqfd->irqfd_wqh = wqh;
/*
* TODO: Ensure there isn't already an exclusive, priority waiter, e.g.
* that the irqfd isn't already bound to another partition. Only the
* first exclusive waiter encountered will be notified, and
* add_wait_queue_priority() doesn't enforce exclusivity.
*/
irqfd->irqfd_wait.flags |= WQ_FLAG_EXCLUSIVE;
add_wait_queue_priority(wqh, &irqfd->irqfd_wait);
}


@ -1054,7 +1054,6 @@ struct irq_2_irte {
};
struct amd_ir_data {
u32 cached_ga_tag;
struct amd_iommu *iommu;
struct irq_2_irte irq_2_irte;
struct msi_msg msi_entry;


@ -3804,13 +3804,70 @@ static const struct irq_domain_ops amd_ir_domain_ops = {
.deactivate = irq_remapping_deactivate,
};
int amd_iommu_activate_guest_mode(void *data)
static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu,
bool ga_log_intr)
{
if (cpu >= 0) {
entry->lo.fields_vapic.destination =
APICID_TO_IRTE_DEST_LO(cpu);
entry->hi.fields.destination =
APICID_TO_IRTE_DEST_HI(cpu);
entry->lo.fields_vapic.is_run = true;
entry->lo.fields_vapic.ga_log_intr = false;
} else {
entry->lo.fields_vapic.is_run = false;
entry->lo.fields_vapic.ga_log_intr = ga_log_intr;
}
}
/*
* Update the pCPU information for an IRTE that is configured to post IRQs to
* a vCPU, without issuing an IOMMU invalidation for the IRTE.
*
* If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination
* with the pCPU's APIC ID, set IsRun, and clear GALogIntr. If the vCPU isn't
* associated with a pCPU (@cpu < 0), clear IsRun and set/clear GALogIntr based
* on input from the caller (e.g. KVM only requests GALogIntr when the vCPU is
* blocking and requires a notification wake event). I.e. treat vCPUs that are
* associated with a pCPU as running. This API is intended to be used when a
* vCPU is scheduled in/out (or stops running for any reason), to do a fast
* update of IsRun, GALogIntr, and (conditionally) Destination.
*
* Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached
* and thus don't require an invalidation to ensure the IOMMU consumes fresh
* information.
*/
int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr)
{
struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
return -EINVAL;
if (!entry || !entry->lo.fields_vapic.guest_mode)
return 0;
if (!ir_data->iommu)
return -ENODEV;
__amd_iommu_update_ga(entry, cpu, ga_log_intr);
return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
ir_data->irq_2_irte.index, entry);
}
EXPORT_SYMBOL(amd_iommu_update_ga);
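As a rough caller-side illustration of the contract documented above (the helper below is hypothetical and not part of the patch): pass the target pCPU when the vCPU is running, and pass a negative CPU with ga_log_intr set only when the vCPU is blocking and therefore needs a wake event.

#include <linux/amd-iommu.h>
#include <linux/bug.h>

/* Hypothetical sketch of how a hypervisor-side caller would use the new API. */
static void example_refresh_irte(void *ir_data, int pcpu, bool vcpu_is_blocking)
{
	if (pcpu >= 0) {
		/* vCPU is running on @pcpu: Destination=pcpu, IsRun=1, GALogIntr=0. */
		WARN_ON_ONCE(amd_iommu_update_ga(ir_data, pcpu, false));
	} else {
		/*
		 * vCPU isn't running: IsRun=0; request a GA Log interrupt only
		 * if the vCPU is blocking and thus needs a wake event.
		 */
		WARN_ON_ONCE(amd_iommu_update_ga(ir_data, -1, vcpu_is_blocking));
	}
}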
int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr)
{
struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
u64 valid;
if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry)
if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
return -EINVAL;
if (!entry)
return 0;
valid = entry->lo.fields_vapic.valid;
@ -3820,11 +3877,12 @@ int amd_iommu_activate_guest_mode(void *data)
entry->lo.fields_vapic.valid = valid;
entry->lo.fields_vapic.guest_mode = 1;
entry->lo.fields_vapic.ga_log_intr = 1;
entry->hi.fields.ga_root_ptr = ir_data->ga_root_ptr;
entry->hi.fields.vector = ir_data->ga_vector;
entry->lo.fields_vapic.ga_tag = ir_data->ga_tag;
__amd_iommu_update_ga(entry, cpu, ga_log_intr);
return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
ir_data->irq_2_irte.index, entry);
}
@ -3837,8 +3895,10 @@ int amd_iommu_deactivate_guest_mode(void *data)
struct irq_cfg *cfg = ir_data->cfg;
u64 valid;
if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
!entry || !entry->lo.fields_vapic.guest_mode)
if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
return -EINVAL;
if (!entry || !entry->lo.fields_vapic.guest_mode)
return 0;
valid = entry->lo.fields_remap.valid;
@ -3860,11 +3920,10 @@ int amd_iommu_deactivate_guest_mode(void *data)
}
EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info)
{
int ret;
struct amd_iommu_pi_data *pi_data = vcpu_info;
struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
struct amd_iommu_pi_data *pi_data = info;
struct amd_ir_data *ir_data = data->chip_data;
struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
struct iommu_dev_data *dev_data;
@ -3885,25 +3944,20 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
return -EINVAL;
ir_data->cfg = irqd_cfg(data);
pi_data->ir_data = ir_data;
pi_data->prev_ga_tag = ir_data->cached_ga_tag;
if (pi_data->is_guest_mode) {
ir_data->ga_root_ptr = (pi_data->base >> 12);
ir_data->ga_vector = vcpu_pi_info->vector;
if (pi_data) {
pi_data->ir_data = ir_data;
ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12);
ir_data->ga_vector = pi_data->vector;
ir_data->ga_tag = pi_data->ga_tag;
ret = amd_iommu_activate_guest_mode(ir_data);
if (!ret)
ir_data->cached_ga_tag = pi_data->ga_tag;
if (pi_data->is_guest_mode)
ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu,
pi_data->ga_log_intr);
else
ret = amd_iommu_deactivate_guest_mode(ir_data);
} else {
ret = amd_iommu_deactivate_guest_mode(ir_data);
/*
* This communicates the ga_tag back to the caller
* so that it can do all the necessary clean up.
*/
if (!ret)
ir_data->cached_ga_tag = 0;
}
return ret;
@ -3995,29 +4049,4 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
return 0;
}
int amd_iommu_update_ga(int cpu, bool is_run, void *data)
{
struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
!entry || !entry->lo.fields_vapic.guest_mode)
return 0;
if (!ir_data->iommu)
return -ENODEV;
if (cpu >= 0) {
entry->lo.fields_vapic.destination =
APICID_TO_IRTE_DEST_LO(cpu);
entry->hi.fields.destination =
APICID_TO_IRTE_DEST_HI(cpu);
}
entry->lo.fields_vapic.is_run = is_run;
return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
ir_data->irq_2_irte.index, entry);
}
EXPORT_SYMBOL(amd_iommu_update_ga);
#endif


@ -1244,10 +1244,10 @@ static void intel_ir_compose_msi_msg(struct irq_data *irq_data,
static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
{
struct intel_ir_data *ir_data = data->chip_data;
struct vcpu_data *vcpu_pi_info = info;
struct intel_iommu_pi_data *pi_data = info;
/* stop posting interrupts, back to the default mode */
if (!vcpu_pi_info) {
if (!pi_data) {
__intel_ir_reconfigure_irte(data, true);
} else {
struct irte irte_pi;
@ -1265,10 +1265,10 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
/* Update the posted mode fields */
irte_pi.p_pst = 1;
irte_pi.p_urgent = 0;
irte_pi.p_vector = vcpu_pi_info->vector;
irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >>
irte_pi.p_vector = pi_data->vector;
irte_pi.pda_l = (pi_data->pi_desc_addr >>
(32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT);
irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) &
irte_pi.pda_h = (pi_data->pi_desc_addr >> 32) &
~(-1UL << PDA_HIGH_BIT);
ir_data->irq_2_iommu.posted_vcpu = true;


@ -342,10 +342,10 @@ int its_get_vlpi(int irq, struct its_vlpi_map *map)
return irq_set_vcpu_affinity(irq, &info);
}
int its_unmap_vlpi(int irq)
void its_unmap_vlpi(int irq)
{
irq_clear_status_flags(irq, IRQ_DISABLE_UNLAZY);
return irq_set_vcpu_affinity(irq, NULL);
WARN_ON_ONCE(irq_set_vcpu_affinity(irq, NULL));
}
int its_prop_update_vlpi(int irq, u8 config, bool inv)


@ -505,15 +505,11 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev,
if (ret)
goto out_put_eventfd_ctx;
ctx->producer.token = trigger;
ctx->producer.irq = irq;
ret = irq_bypass_register_producer(&ctx->producer);
ret = irq_bypass_register_producer(&ctx->producer, trigger, irq);
if (unlikely(ret)) {
dev_info(&pdev->dev,
"irq bypass producer (token %p) registration fails: %d\n",
ctx->producer.token, ret);
ctx->producer.token = NULL;
"irq bypass producer (eventfd %p) registration fails: %d\n",
trigger, ret);
}
ctx->trigger = trigger;


@ -212,11 +212,11 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid)
if (!vq->call_ctx.ctx)
return;
vq->call_ctx.producer.irq = irq;
ret = irq_bypass_register_producer(&vq->call_ctx.producer);
ret = irq_bypass_register_producer(&vq->call_ctx.producer,
vq->call_ctx.ctx, irq);
if (unlikely(ret))
dev_info(&v->dev, "vq %u, irq bypass producer (token %p) registration fails, ret = %d\n",
qid, vq->call_ctx.producer.token, ret);
dev_info(&v->dev, "vq %u, irq bypass producer (eventfd %p) registration fails, ret = %d\n",
qid, vq->call_ctx.ctx, ret);
}
static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, u16 qid)
@ -712,7 +712,6 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
if (ops->get_status(vdpa) &
VIRTIO_CONFIG_S_DRIVER_OK)
vhost_vdpa_unsetup_vq_irq(v, idx);
vq->call_ctx.producer.token = NULL;
}
break;
}
@ -753,7 +752,6 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
cb.callback = vhost_vdpa_virtqueue_cb;
cb.private = vq;
cb.trigger = vq->call_ctx.ctx;
vq->call_ctx.producer.token = vq->call_ctx.ctx;
if (ops->get_status(vdpa) &
VIRTIO_CONFIG_S_DRIVER_OK)
vhost_vdpa_setup_vq_irq(v, idx);


@ -434,7 +434,7 @@ struct kvm_kernel_irq_routing_entry;
int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int irq,
struct kvm_kernel_irq_routing_entry *irq_entry);
int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq);
void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq);
int vgic_v4_load(struct kvm_vcpu *vcpu);
void vgic_v4_commit(struct kvm_vcpu *vcpu);


@ -12,20 +12,6 @@
struct amd_iommu;
/*
* This is mainly used to communicate information back-and-forth
* between SVM and IOMMU for setting up and tearing down posted
* interrupt
*/
struct amd_iommu_pi_data {
u32 ga_tag;
u32 prev_ga_tag;
u64 base;
bool is_guest_mode;
struct vcpu_data *vcpu_data;
void *ir_data;
};
#ifdef CONFIG_AMD_IOMMU
struct task_struct;
@ -44,10 +30,8 @@ static inline void amd_iommu_detect(void) { }
/* IOMMU AVIC Function */
extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32));
extern int
amd_iommu_update_ga(int cpu, bool is_run, void *data);
extern int amd_iommu_activate_guest_mode(void *data);
extern int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr);
extern int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr);
extern int amd_iommu_deactivate_guest_mode(void *data);
#else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */
@ -58,13 +42,12 @@ amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
return 0;
}
static inline int
amd_iommu_update_ga(int cpu, bool is_run, void *data)
static inline int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr)
{
return 0;
}
static inline int amd_iommu_activate_guest_mode(void *data)
static inline int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr)
{
return 0;
}


@ -10,6 +10,7 @@
#include <linux/list.h>
struct eventfd_ctx;
struct irq_bypass_consumer;
/*
@ -18,20 +19,22 @@ struct irq_bypass_consumer;
* The IRQ bypass manager is a simple set of lists and callbacks that allows
* IRQ producers (ex. physical interrupt sources) to be matched to IRQ
* consumers (ex. virtualization hardware that allows IRQ bypass or offload)
* via a shared token (ex. eventfd_ctx). Producers and consumers register
* independently. When a token match is found, the optional @stop callback
* will be called for each participant. The pair will then be connected via
* the @add_* callbacks, and finally the optional @start callback will allow
* any final coordination. When either participant is unregistered, the
* process is repeated using the @del_* callbacks in place of the @add_*
* callbacks. Match tokens must be unique per producer/consumer, 1:N pairings
* are not supported.
* via a shared eventfd_ctx. Producers and consumers register independently.
* When a producer and consumer are paired, i.e. an eventfd match is found, the
* optional @stop callback will be called for each participant. The pair will
* then be connected via the @add_* callbacks, and finally the optional @start
* callback will allow any final coordination. When either participant is
* unregistered, the process is repeated using the @del_* callbacks in place of
* the @add_* callbacks. eventfds must be unique per producer/consumer, 1:N
* pairings are not supported.
*/
struct irq_bypass_consumer;
/**
* struct irq_bypass_producer - IRQ bypass producer definition
* @node: IRQ bypass manager private list management
* @token: opaque token to match between producer and consumer (non-NULL)
* @eventfd: eventfd context used to match producers and consumers
* @consumer: The connected consumer (NULL if no connection)
* @irq: Linux IRQ number for the producer device
* @add_consumer: Connect the IRQ producer to an IRQ consumer (optional)
* @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional)
@ -43,8 +46,8 @@ struct irq_bypass_consumer;
* for a physical device assigned to a VM.
*/
struct irq_bypass_producer {
struct list_head node;
void *token;
struct eventfd_ctx *eventfd;
struct irq_bypass_consumer *consumer;
int irq;
int (*add_consumer)(struct irq_bypass_producer *,
struct irq_bypass_consumer *);
@ -56,8 +59,8 @@ struct irq_bypass_producer {
/**
* struct irq_bypass_consumer - IRQ bypass consumer definition
* @node: IRQ bypass manager private list management
* @token: opaque token to match between producer and consumer (non-NULL)
* @eventfd: eventfd context used to match producers and consumers
* @producer: The connected producer (NULL if no connection)
* @add_producer: Connect the IRQ consumer to an IRQ producer
* @del_producer: Disconnect the IRQ consumer from an IRQ producer
* @stop: Perform any quiesce operations necessary prior to add/del (optional)
@ -69,8 +72,9 @@ struct irq_bypass_producer {
* portions of the interrupt handling to the VM.
*/
struct irq_bypass_consumer {
struct list_head node;
void *token;
struct eventfd_ctx *eventfd;
struct irq_bypass_producer *producer;
int (*add_producer)(struct irq_bypass_consumer *,
struct irq_bypass_producer *);
void (*del_producer)(struct irq_bypass_consumer *,
@ -79,9 +83,11 @@ struct irq_bypass_consumer {
void (*start)(struct irq_bypass_consumer *);
};
int irq_bypass_register_producer(struct irq_bypass_producer *);
void irq_bypass_unregister_producer(struct irq_bypass_producer *);
int irq_bypass_register_consumer(struct irq_bypass_consumer *);
void irq_bypass_unregister_consumer(struct irq_bypass_consumer *);
int irq_bypass_register_producer(struct irq_bypass_producer *producer,
struct eventfd_ctx *eventfd, int irq);
void irq_bypass_unregister_producer(struct irq_bypass_producer *producer);
int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer,
struct eventfd_ctx *eventfd);
void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer);
#endif /* IRQBYPASS_H */
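As a hedged illustration of the eventfd-keyed API declared above (the demo_* names and callback bodies below are hypothetical, not part of this series): a consumer and a producer registered with the same eventfd_ctx are paired automatically, with the consumer's @add_producer/@del_producer callbacks invoked on connect/disconnect.

static int demo_add_producer(struct irq_bypass_consumer *cons,
			     struct irq_bypass_producer *prod)
{
	/* e.g. program posted-interrupt hardware using prod->irq */
	return 0;
}

static void demo_del_producer(struct irq_bypass_consumer *cons,
			      struct irq_bypass_producer *prod)
{
	/* undo whatever demo_add_producer() configured */
}

static struct irq_bypass_consumer demo_consumer = {
	.add_producer = demo_add_producer,
	.del_producer = demo_del_producer,
};

static struct irq_bypass_producer demo_producer;

static int demo_pair(struct eventfd_ctx *eventfd, int irq)
{
	int ret;

	ret = irq_bypass_register_consumer(&demo_consumer, eventfd);
	if (ret)
		return ret;

	/* Same eventfd: the manager connects the pair via the callbacks. */
	ret = irq_bypass_register_producer(&demo_producer, eventfd, irq);
	if (ret)
		irq_bypass_unregister_consumer(&demo_consumer);
	return ret;
}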


@ -146,7 +146,7 @@ int its_commit_vpe(struct its_vpe *vpe);
int its_invall_vpe(struct its_vpe *vpe);
int its_map_vlpi(int irq, struct its_vlpi_map *map);
int its_get_vlpi(int irq, struct its_vlpi_map *map);
int its_unmap_vlpi(int irq);
void its_unmap_vlpi(int irq);
int its_prop_update_vlpi(int irq, u8 config, bool inv);
int its_prop_update_vsgi(int irq, u8 priority, bool group);


@ -190,6 +190,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
#define KVM_USERSPACE_IRQ_SOURCE_ID 0
#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
#define KVM_PIT_IRQ_SOURCE_ID 2
extern struct mutex kvm_lock;
extern struct list_head vm_list;
@ -1022,16 +1023,12 @@ void kvm_unlock_all_vcpus(struct kvm *kvm);
void vcpu_load(struct kvm_vcpu *vcpu);
void vcpu_put(struct kvm_vcpu *vcpu);
#ifdef __KVM_HAVE_IOAPIC
#ifdef CONFIG_KVM_IOAPIC
void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm);
void kvm_arch_post_irq_routing_update(struct kvm *kvm);
#else
static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
{
}
static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm)
{
}
#endif
#ifdef CONFIG_HAVE_KVM_IRQCHIP
@ -1788,8 +1785,6 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian);
void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian);
int kvm_request_irq_source_id(struct kvm *kvm);
void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
/*
@ -2406,6 +2401,8 @@ struct kvm_vcpu *kvm_get_running_vcpu(void);
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
struct kvm_kernel_irqfd;
bool kvm_arch_has_irq_bypass(void);
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *,
struct irq_bypass_producer *);
@ -2413,10 +2410,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *,
struct irq_bypass_producer *);
void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *);
void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *);
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *,
struct kvm_kernel_irq_routing_entry *);
void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
struct kvm_kernel_irq_routing_entry *old,
struct kvm_kernel_irq_routing_entry *new);
#endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
#ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS


@ -55,10 +55,13 @@ struct kvm_kernel_irqfd {
/* Used for setup/shutdown */
struct eventfd_ctx *eventfd;
struct list_head list;
poll_table pt;
struct work_struct shutdown;
struct irq_bypass_consumer consumer;
struct irq_bypass_producer *producer;
struct kvm_vcpu *irq_bypass_vcpu;
struct list_head vcpu_list;
void *irq_bypass_data;
};
#endif /* __LINUX_KVM_IRQFD_H */


@ -164,6 +164,8 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head,
struct wait_queue_entry *wq_entry);
extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)


@ -82,95 +82,15 @@ TRACE_EVENT(kvm_set_irq,
TP_printk("gsi %u level %d source %d",
__entry->gsi, __entry->level, __entry->irq_source_id)
);
#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */
#if defined(__KVM_HAVE_IOAPIC)
#define kvm_deliver_mode \
{0x0, "Fixed"}, \
{0x1, "LowPrio"}, \
{0x2, "SMI"}, \
{0x3, "Res3"}, \
{0x4, "NMI"}, \
{0x5, "INIT"}, \
{0x6, "SIPI"}, \
{0x7, "ExtINT"}
TRACE_EVENT(kvm_ioapic_set_irq,
TP_PROTO(__u64 e, int pin, bool coalesced),
TP_ARGS(e, pin, coalesced),
TP_STRUCT__entry(
__field( __u64, e )
__field( int, pin )
__field( bool, coalesced )
),
TP_fast_assign(
__entry->e = e;
__entry->pin = pin;
__entry->coalesced = coalesced;
),
TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
__entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
__print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
(__entry->e & (1<<11)) ? "logical" : "physical",
(__entry->e & (1<<15)) ? "level" : "edge",
(__entry->e & (1<<16)) ? "|masked" : "",
__entry->coalesced ? " (coalesced)" : "")
);
TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
TP_PROTO(__u64 e),
TP_ARGS(e),
TP_STRUCT__entry(
__field( __u64, e )
),
TP_fast_assign(
__entry->e = e;
),
TP_printk("dst %x vec %u (%s|%s|%s%s)",
(u8)(__entry->e >> 56), (u8)__entry->e,
__print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
(__entry->e & (1<<11)) ? "logical" : "physical",
(__entry->e & (1<<15)) ? "level" : "edge",
(__entry->e & (1<<16)) ? "|masked" : "")
);
TRACE_EVENT(kvm_msi_set_irq,
TP_PROTO(__u64 address, __u64 data),
TP_ARGS(address, data),
TP_STRUCT__entry(
__field( __u64, address )
__field( __u64, data )
),
TP_fast_assign(
__entry->address = address;
__entry->data = data;
),
TP_printk("dst %llx vec %u (%s|%s|%s%s)",
(u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
(u8)__entry->data,
__print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
(__entry->address & (1<<2)) ? "logical" : "physical",
(__entry->data & (1<<15)) ? "level" : "edge",
(__entry->address & (1<<3)) ? "|rh" : "")
);
#ifdef CONFIG_KVM_IOAPIC
#define kvm_irqchips \
{KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \
{KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \
{KVM_IRQCHIP_IOAPIC, "IOAPIC"}
#endif /* defined(__KVM_HAVE_IOAPIC) */
#if defined(CONFIG_HAVE_KVM_IRQCHIP)
#endif /* CONFIG_KVM_IOAPIC */
#ifdef kvm_irqchips
#define kvm_ack_irq_string "irqchip %s pin %u"


@ -40,13 +40,31 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_
{
unsigned long flags;
wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
wq_entry->flags |= WQ_FLAG_PRIORITY;
spin_lock_irqsave(&wq_head->lock, flags);
__add_wait_queue(wq_head, wq_entry);
spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL_GPL(add_wait_queue_priority);
int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head,
struct wait_queue_entry *wq_entry)
{
struct list_head *head = &wq_head->head;
wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
guard(spinlock_irqsave)(&wq_head->lock);
if (!list_empty(head) &&
(list_first_entry(head, typeof(*wq_entry), entry)->flags & WQ_FLAG_PRIORITY))
return -EBUSY;
list_add(&wq_entry->entry, head);
return 0;
}
EXPORT_SYMBOL_GPL(add_wait_queue_priority_exclusive);
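A minimal sketch of the new helper's contract (the demo_* names are hypothetical): the first priority waiter on a given wait queue head is accepted, and any subsequent priority waiter is rejected with -EBUSY.

static DECLARE_WAIT_QUEUE_HEAD(demo_wqh);

static void demo_priority_exclusive(void)
{
	struct wait_queue_entry a, b;

	init_waitqueue_entry(&a, current);
	init_waitqueue_entry(&b, current);

	/* First priority waiter is accepted... */
	WARN_ON(add_wait_queue_priority_exclusive(&demo_wqh, &a));

	/* ...a second one finds a priority waiter at the head and fails. */
	WARN_ON(add_wait_queue_priority_exclusive(&demo_wqh, &b) != -EBUSY);

	remove_wait_queue(&demo_wqh, &a);
}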
void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
@ -64,7 +82,7 @@ EXPORT_SYMBOL(remove_wait_queue);
* the non-exclusive tasks. Normally, exclusive tasks will be at the end of
* the list and any non-exclusive tasks will be woken first. A priority task
* may be at the head of the list, and can consume the event without any other
* tasks being woken.
* tasks being woken if it's also an exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns


@ -59,6 +59,7 @@ TEST_PROGS_x86 += x86/nx_huge_pages_test.sh
TEST_GEN_PROGS_COMMON = demand_paging_test
TEST_GEN_PROGS_COMMON += dirty_log_test
TEST_GEN_PROGS_COMMON += guest_print_test
TEST_GEN_PROGS_COMMON += irqfd_test
TEST_GEN_PROGS_COMMON += kvm_binary_stats_test
TEST_GEN_PROGS_COMMON += kvm_create_max_vcpus
TEST_GEN_PROGS_COMMON += kvm_page_table_test


@ -620,18 +620,12 @@ static void kvm_routing_and_irqfd_check(struct kvm_vm *vm,
* that no actual interrupt was injected for those cases.
*/
for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) {
fd[f] = eventfd(0, 0);
TEST_ASSERT(fd[f] != -1, __KVM_SYSCALL_ERROR("eventfd()", fd[f]));
}
for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++)
fd[f] = kvm_new_eventfd();
for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) {
struct kvm_irqfd irqfd = {
.fd = fd[f],
.gsi = i - MIN_SPI,
};
assert(i <= (uint64_t)UINT_MAX);
vm_ioctl(vm, KVM_IRQFD, &irqfd);
kvm_assign_irqfd(vm, i - MIN_SPI, fd[f]);
}
for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) {


@ -1,5 +1,6 @@
CONFIG_KVM=y
CONFIG_KVM_INTEL=y
CONFIG_KVM_AMD=y
CONFIG_EVENTFD=y
CONFIG_USERFAULTFD=y
CONFIG_IDLE_PAGE_TRACKING=y


@ -18,6 +18,7 @@
#include <asm/atomic.h>
#include <asm/kvm.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include "kvm_util_arch.h"
@ -502,6 +503,45 @@ static inline int vm_get_stats_fd(struct kvm_vm *vm)
return fd;
}
static inline int __kvm_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd,
uint32_t flags)
{
struct kvm_irqfd irqfd = {
.fd = eventfd,
.gsi = gsi,
.flags = flags,
.resamplefd = -1,
};
return __vm_ioctl(vm, KVM_IRQFD, &irqfd);
}
static inline void kvm_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd,
uint32_t flags)
{
int ret = __kvm_irqfd(vm, gsi, eventfd, flags);
TEST_ASSERT_VM_VCPU_IOCTL(!ret, KVM_IRQFD, ret, vm);
}
static inline void kvm_assign_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd)
{
kvm_irqfd(vm, gsi, eventfd, 0);
}
static inline void kvm_deassign_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd)
{
kvm_irqfd(vm, gsi, eventfd, KVM_IRQFD_FLAG_DEASSIGN);
}
static inline int kvm_new_eventfd(void)
{
int fd = eventfd(0, 0);
TEST_ASSERT(fd >= 0, __KVM_SYSCALL_ERROR("eventfd()", fd));
return fd;
}
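A small usage sketch for these helpers (the demo function name is hypothetical; the VM is assumed to already have an in-kernel IRQ chip, and close() assumes <unistd.h> as elsewhere in the selftests): allocate an eventfd, bind it to a GSI, then unbind it and release the fd.

static inline void demo_irqfd_roundtrip(struct kvm_vm *vm)
{
	int fd = kvm_new_eventfd();

	/* Bind the eventfd to GSI 32, then unbind it and close the fd. */
	kvm_assign_irqfd(vm, 32, fd);
	kvm_deassign_irqfd(vm, 32, fd);
	close(fd);
}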
static inline void read_stats_header(int stats_fd, struct kvm_stats_header *header)
{
ssize_t ret;


@ -0,0 +1,135 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <stdint.h>
#include <sys/sysinfo.h>
#include "kvm_util.h"
static struct kvm_vm *vm1;
static struct kvm_vm *vm2;
static int __eventfd;
static bool done;
/*
* KVM de-assigns based on eventfd *and* GSI, but requires unique eventfds when
* assigning (the API isn't symmetrical). Abuse the oddity and use a per-task
* GSI base to avoid false failures due to cross-task de-assign, i.e. so that
* the secondary doesn't de-assign the primary's eventfd and cause assign to
* unexpectedly succeed on the primary.
*/
#define GSI_BASE_PRIMARY 0x20
#define GSI_BASE_SECONDARY 0x30
static void juggle_eventfd_secondary(struct kvm_vm *vm, int eventfd)
{
int r, i;
/*
* The secondary task can encounter EBADF since the primary can close
* the eventfd at any time. And because the primary can recreate the
* eventfd, at the safe fd in the file table, the secondary can also
* encounter "unexpected" success, e.g. if the close+recreate happens
* between the first and second assignments. The secondary's role is
* mostly to antagonize KVM, not to detect bugs.
*/
for (i = 0; i < 2; i++) {
r = __kvm_irqfd(vm, GSI_BASE_SECONDARY, eventfd, 0);
TEST_ASSERT(!r || errno == EBUSY || errno == EBADF,
"Wanted success, EBUSY, or EBADF, r = %d, errno = %d",
r, errno);
/* De-assign should succeed unless the eventfd was closed. */
r = __kvm_irqfd(vm, GSI_BASE_SECONDARY + i, eventfd, KVM_IRQFD_FLAG_DEASSIGN);
TEST_ASSERT(!r || errno == EBADF,
"De-assign should succeed unless the fd was closed");
}
}
static void *secondary_irqfd_juggler(void *ign)
{
while (!READ_ONCE(done)) {
juggle_eventfd_secondary(vm1, READ_ONCE(__eventfd));
juggle_eventfd_secondary(vm2, READ_ONCE(__eventfd));
}
return NULL;
}
static void juggle_eventfd_primary(struct kvm_vm *vm, int eventfd)
{
int r1, r2;
/*
* At least one of the assigns should fail. KVM disallows assigning a
* single eventfd to multiple GSIs (or VMs), so it's possible that both
* assignments can fail, too.
*/
r1 = __kvm_irqfd(vm, GSI_BASE_PRIMARY, eventfd, 0);
TEST_ASSERT(!r1 || errno == EBUSY,
"Wanted success or EBUSY, r = %d, errno = %d", r1, errno);
r2 = __kvm_irqfd(vm, GSI_BASE_PRIMARY + 1, eventfd, 0);
TEST_ASSERT(r1 || (r2 && errno == EBUSY),
"Wanted failure (EBUSY), r1 = %d, r2 = %d, errno = %d",
r1, r2, errno);
/*
* De-assign should always succeed, even if the corresponding assign
* failed.
*/
kvm_irqfd(vm, GSI_BASE_PRIMARY, eventfd, KVM_IRQFD_FLAG_DEASSIGN);
kvm_irqfd(vm, GSI_BASE_PRIMARY + 1, eventfd, KVM_IRQFD_FLAG_DEASSIGN);
}
int main(int argc, char *argv[])
{
pthread_t racing_thread;
int r, i;
/* Create "full" VMs, as KVM_IRQFD requires an in-kernel IRQ chip. */
vm1 = vm_create(1);
vm2 = vm_create(1);
WRITE_ONCE(__eventfd, kvm_new_eventfd());
kvm_irqfd(vm1, 10, __eventfd, 0);
r = __kvm_irqfd(vm1, 11, __eventfd, 0);
TEST_ASSERT(r && errno == EBUSY,
"Wanted EBUSY, r = %d, errno = %d", r, errno);
r = __kvm_irqfd(vm2, 12, __eventfd, 0);
TEST_ASSERT(r && errno == EBUSY,
"Wanted EBUSY, r = %d, errno = %d", r, errno);
/*
* De-assign all eventfds, along with multiple eventfds that were never
* assigned. KVM's ABI is that de-assign is allowed so long as the
* eventfd itself is valid.
*/
kvm_irqfd(vm1, 11, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
kvm_irqfd(vm1, 12, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
kvm_irqfd(vm1, 13, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
kvm_irqfd(vm1, 14, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
kvm_irqfd(vm1, 10, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
close(__eventfd);
pthread_create(&racing_thread, NULL, secondary_irqfd_juggler, vm2);
for (i = 0; i < 10000; i++) {
WRITE_ONCE(__eventfd, kvm_new_eventfd());
juggle_eventfd_primary(vm1, __eventfd);
juggle_eventfd_primary(vm2, __eventfd);
close(__eventfd);
}
WRITE_ONCE(done, true);
pthread_join(racing_thread, NULL);
}


@ -1716,7 +1716,18 @@ void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa)
/* Create an interrupt controller chip for the specified VM. */
void vm_create_irqchip(struct kvm_vm *vm)
{
vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL);
int r;
/*
* Allocate a fully in-kernel IRQ chip by default, but fall back to a
* split model (x86 only) if that fails (KVM x86 allows compiling out
* support for KVM_CREATE_IRQCHIP).
*/
r = __vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL);
if (r && errno == ENOTTY && kvm_has_cap(KVM_CAP_SPLIT_IRQCHIP))
vm_enable_cap(vm, KVM_CAP_SPLIT_IRQCHIP, 24);
else
TEST_ASSERT_VM_VCPU_IOCTL(!r, KVM_CREATE_IRQCHIP, r, vm);
vm->has_irqchip = true;
}


@ -547,15 +547,9 @@ int main(int argc, char *argv[])
int irq_fd[2] = { -1, -1 };
if (do_eventfd_tests) {
irq_fd[0] = eventfd(0, 0);
irq_fd[1] = eventfd(0, 0);
irq_fd[0] = kvm_new_eventfd();
irq_fd[1] = kvm_new_eventfd();
/* Unexpected, but not a KVM failure */
if (irq_fd[0] == -1 || irq_fd[1] == -1)
do_evtchn_tests = do_eventfd_tests = false;
}
if (do_eventfd_tests) {
irq_routes.info.nr = 2;
irq_routes.entries[0].gsi = 32;
@ -572,15 +566,8 @@ int main(int argc, char *argv[])
vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes.info);
struct kvm_irqfd ifd = { };
ifd.fd = irq_fd[0];
ifd.gsi = 32;
vm_ioctl(vm, KVM_IRQFD, &ifd);
ifd.fd = irq_fd[1];
ifd.gsi = 33;
vm_ioctl(vm, KVM_IRQFD, &ifd);
kvm_assign_irqfd(vm, 32, irq_fd[0]);
kvm_assign_irqfd(vm, 33, irq_fd[1]);
struct sigaction sa = { };
sa.sa_handler = handle_alrm;


@ -204,6 +204,11 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
int ret = 0;
if (flags & EPOLLIN) {
/*
* WARNING: Do NOT take irqfds.lock in any path except EPOLLHUP,
* as KVM holds irqfds.lock when registering the irqfd with the
* eventfd.
*/
u64 cnt;
eventfd_ctx_do_read(irqfd->eventfd, &cnt);
@ -225,6 +230,11 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
/* The eventfd is closing, detach from KVM */
unsigned long iflags;
/*
* Taking irqfds.lock is safe here, as KVM holds a reference to
* the eventfd when registering the irqfd, i.e. this path can't
* be reached while kvm_irqfd_add() is running.
*/
spin_lock_irqsave(&kvm->irqfds.lock, iflags);
/*
@ -245,22 +255,14 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
return ret;
}
static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
struct kvm_kernel_irqfd *irqfd =
container_of(pt, struct kvm_kernel_irqfd, pt);
add_wait_queue_priority(wqh, &irqfd->wait);
}
/* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
int n_entries;
lockdep_assert_held(&kvm->irqfds.lock);
n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);
write_seqcount_begin(&irqfd->irq_entry_sc);
@ -274,6 +276,63 @@ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
write_seqcount_end(&irqfd->irq_entry_sc);
}
struct kvm_irqfd_pt {
struct kvm_kernel_irqfd *irqfd;
struct kvm *kvm;
poll_table pt;
int ret;
};
static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt);
struct kvm_kernel_irqfd *irqfd = p->irqfd;
struct kvm *kvm = p->kvm;
/*
* Note, irqfds.lock protects the irqfd's irq_entry, i.e. its routing,
* and irqfds.items. It does NOT protect registering with the eventfd.
*/
spin_lock_irq(&kvm->irqfds.lock);
/*
* Initialize the routing information prior to adding the irqfd to the
* eventfd's waitqueue, as irqfd_wakeup() can be invoked as soon as the
* irqfd is registered.
*/
irqfd_update(kvm, irqfd);
/*
* Add the irqfd as a priority waiter on the eventfd, with a custom
* wake-up handler, so that KVM *and only KVM* is notified whenever the
* underlying eventfd is signaled.
*/
init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
/*
* Temporarily lie to lockdep about holding irqfds.lock to avoid a
* false positive regarding potential deadlock with irqfd_wakeup()
* (see irqfd_wakeup() for details).
*
* Adding to the wait queue will fail if there is already a priority
* waiter, i.e. if the eventfd is associated with another irqfd (in any
* VM). Note, kvm_irqfd_deassign() waits for all in-flight shutdown
* jobs to complete, i.e. ensures the irqfd has been removed from the
* eventfd's waitqueue before returning to userspace.
*/
spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_);
p->ret = add_wait_queue_priority_exclusive(wqh, &irqfd->wait);
spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_);
if (p->ret)
goto out;
list_add_tail(&irqfd->list, &kvm->irqfds.items);
out:
spin_unlock_irq(&kvm->irqfds.lock);
}
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
void __attribute__((weak)) kvm_arch_irq_bypass_stop(
struct irq_bypass_consumer *cons)
@ -285,26 +344,20 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start(
{
}
int __attribute__((weak)) kvm_arch_update_irqfd_routing(
struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
struct kvm_kernel_irq_routing_entry *old,
struct kvm_kernel_irq_routing_entry *new)
{
return 0;
}
bool __attribute__((weak)) kvm_arch_irqfd_route_changed(
struct kvm_kernel_irq_routing_entry *old,
struct kvm_kernel_irq_routing_entry *new)
{
return true;
}
#endif
static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
struct kvm_kernel_irqfd *irqfd, *tmp;
struct kvm_kernel_irqfd *irqfd;
struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
struct kvm_irqfd_pt irqfd_pt;
int ret;
__poll_t events;
int idx;
@ -390,57 +443,54 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
}
/*
* Install our own custom wake-up handling so we are notified via
* a callback whenever someone signals the underlying eventfd
* Set the irqfd routing and add it to KVM's list before registering
* the irqfd with the eventfd, so that the routing information is valid
* and stays valid, e.g. if there are GSI routing changes, prior to
* making the irqfd visible, i.e. before it might be signaled.
*
* Note, holding SRCU ensures a stable read of routing information, and
* also prevents irqfd_shutdown() from freeing the irqfd before it's
* fully initialized.
*/
init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
spin_lock_irq(&kvm->irqfds.lock);
ret = 0;
list_for_each_entry(tmp, &kvm->irqfds.items, list) {
if (irqfd->eventfd != tmp->eventfd)
continue;
/* This fd is used for another irq already. */
ret = -EBUSY;
spin_unlock_irq(&kvm->irqfds.lock);
goto fail;
}
idx = srcu_read_lock(&kvm->irq_srcu);
irqfd_update(kvm, irqfd);
list_add_tail(&irqfd->list, &kvm->irqfds.items);
spin_unlock_irq(&kvm->irqfds.lock);
/*
* Check if there was an event already pending on the eventfd
* before we registered, and trigger it as if we didn't miss it.
* Register the irqfd with the eventfd by polling on the eventfd, and
* simultaneously add the irqfd to KVM's list. If there was an event
* pending on the eventfd prior to registering, manually trigger IRQ
* injection.
*/
events = vfs_poll(fd_file(f), &irqfd->pt);
irqfd_pt.irqfd = irqfd;
irqfd_pt.kvm = kvm;
init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register);
events = vfs_poll(fd_file(f), &irqfd_pt.pt);
ret = irqfd_pt.ret;
if (ret)
goto fail_poll;
if (events & EPOLLIN)
schedule_work(&irqfd->inject);
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
if (kvm_arch_has_irq_bypass()) {
irqfd->consumer.token = (void *)irqfd->eventfd;
irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
irqfd->consumer.start = kvm_arch_irq_bypass_start;
ret = irq_bypass_register_consumer(&irqfd->consumer);
ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd);
if (ret)
pr_info("irq bypass consumer (token %p) registration fails: %d\n",
irqfd->consumer.token, ret);
pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n",
irqfd->eventfd, ret);
}
#endif
srcu_read_unlock(&kvm->irq_srcu, idx);
return 0;
fail_poll:
srcu_read_unlock(&kvm->irq_srcu, idx);
fail:
if (irqfd->resampler)
irqfd_resampler_shutdown(irqfd);
@ -617,13 +667,8 @@ void kvm_irq_routing_update(struct kvm *kvm)
irqfd_update(kvm, irqfd);
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
if (irqfd->producer &&
kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
int ret = kvm_arch_update_irqfd_routing(
irqfd->kvm, irqfd->producer->irq,
irqfd->gsi, 1);
WARN_ON(ret);
}
if (irqfd->producer)
kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry);
#endif
}


@ -222,8 +222,6 @@ int kvm_set_irq_routing(struct kvm *kvm,
kvm_arch_irq_routing_update(kvm);
mutex_unlock(&kvm->irq_lock);
kvm_arch_post_irq_routing_update(kvm);
synchronize_srcu_expedited(&kvm->irq_srcu);
new = old;


@ -22,8 +22,8 @@
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("IRQ bypass manager utility module");
static LIST_HEAD(producers);
static LIST_HEAD(consumers);
static DEFINE_XARRAY(producers);
static DEFINE_XARRAY(consumers);
static DEFINE_MUTEX(lock);
/* @lock must be held when calling connect */
@ -51,6 +51,10 @@ static int __connect(struct irq_bypass_producer *prod,
if (prod->start)
prod->start(prod);
if (!ret) {
prod->consumer = cons;
cons->producer = prod;
}
return ret;
}
@ -72,56 +76,49 @@ static void __disconnect(struct irq_bypass_producer *prod,
cons->start(cons);
if (prod->start)
prod->start(prod);
prod->consumer = NULL;
cons->producer = NULL;
}
/**
* irq_bypass_register_producer - register IRQ bypass producer
* @producer: pointer to producer structure
* @eventfd: pointer to the eventfd context associated with the producer
* @irq: Linux IRQ number of the underlying producer device
*
* Add the provided IRQ producer to the list of producers and connect
* with any matching token found on the IRQ consumers list.
* Add the provided IRQ producer to the set of producers and connect with the
* consumer with a matching eventfd, if one exists.
*/
int irq_bypass_register_producer(struct irq_bypass_producer *producer)
int irq_bypass_register_producer(struct irq_bypass_producer *producer,
struct eventfd_ctx *eventfd, int irq)
{
struct irq_bypass_producer *tmp;
unsigned long index = (unsigned long)eventfd;
struct irq_bypass_consumer *consumer;
int ret;
if (!producer->token)
if (WARN_ON_ONCE(producer->eventfd))
return -EINVAL;
might_sleep();
producer->irq = irq;
if (!try_module_get(THIS_MODULE))
return -ENODEV;
guard(mutex)(&lock);
mutex_lock(&lock);
ret = xa_insert(&producers, index, producer, GFP_KERNEL);
if (ret)
return ret;
list_for_each_entry(tmp, &producers, node) {
if (tmp->token == producer->token) {
ret = -EBUSY;
goto out_err;
consumer = xa_load(&consumers, index);
if (consumer) {
ret = __connect(producer, consumer);
if (ret) {
WARN_ON_ONCE(xa_erase(&producers, index) != producer);
return ret;
}
}
list_for_each_entry(consumer, &consumers, node) {
if (consumer->token == producer->token) {
ret = __connect(producer, consumer);
if (ret)
goto out_err;
break;
}
}
list_add(&producer->node, &producers);
mutex_unlock(&lock);
producer->eventfd = eventfd;
return 0;
out_err:
mutex_unlock(&lock);
module_put(THIS_MODULE);
return ret;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
@ -129,95 +126,65 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
* irq_bypass_unregister_producer - unregister IRQ bypass producer
* @producer: pointer to producer structure
*
* Remove a previously registered IRQ producer from the list of producers
* and disconnect it from any connected IRQ consumer.
* Remove a previously registered IRQ producer (note, it's safe to call this
* even if registration was unsuccessful). Disconnect from the associated
* consumer, if one exists.
*/
void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
{
struct irq_bypass_producer *tmp;
struct irq_bypass_consumer *consumer;
unsigned long index = (unsigned long)producer->eventfd;
if (!producer->token)
if (!producer->eventfd)
return;
might_sleep();
guard(mutex)(&lock);
if (!try_module_get(THIS_MODULE))
return; /* nothing in the list anyway */
if (producer->consumer)
__disconnect(producer, producer->consumer);
mutex_lock(&lock);
list_for_each_entry(tmp, &producers, node) {
if (tmp->token != producer->token)
continue;
list_for_each_entry(consumer, &consumers, node) {
if (consumer->token == producer->token) {
__disconnect(producer, consumer);
break;
}
}
list_del(&producer->node);
module_put(THIS_MODULE);
break;
}
mutex_unlock(&lock);
module_put(THIS_MODULE);
WARN_ON_ONCE(xa_erase(&producers, index) != producer);
producer->eventfd = NULL;
}
EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);
/**
* irq_bypass_register_consumer - register IRQ bypass consumer
* @consumer: pointer to consumer structure
* @eventfd: pointer to the eventfd context associated with the consumer
*
* Add the provided IRQ consumer to the list of consumers and connect
* with any matching token found on the IRQ producer list.
* Add the provided IRQ consumer to the set of consumers and connect with the
* producer with a matching eventfd, if one exists.
*/
int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer,
struct eventfd_ctx *eventfd)
{
struct irq_bypass_consumer *tmp;
unsigned long index = (unsigned long)eventfd;
struct irq_bypass_producer *producer;
int ret;
if (!consumer->token ||
!consumer->add_producer || !consumer->del_producer)
if (WARN_ON_ONCE(consumer->eventfd))
return -EINVAL;
might_sleep();
if (!consumer->add_producer || !consumer->del_producer)
return -EINVAL;
if (!try_module_get(THIS_MODULE))
return -ENODEV;
guard(mutex)(&lock);
mutex_lock(&lock);
ret = xa_insert(&consumers, index, consumer, GFP_KERNEL);
if (ret)
return ret;
list_for_each_entry(tmp, &consumers, node) {
if (tmp->token == consumer->token || tmp == consumer) {
ret = -EBUSY;
goto out_err;
producer = xa_load(&producers, index);
if (producer) {
ret = __connect(producer, consumer);
if (ret) {
WARN_ON_ONCE(xa_erase(&consumers, index) != consumer);
return ret;
}
}
list_for_each_entry(producer, &producers, node) {
if (producer->token == consumer->token) {
ret = __connect(producer, consumer);
if (ret)
goto out_err;
break;
}
}
list_add(&consumer->node, &consumers);
mutex_unlock(&lock);
consumer->eventfd = eventfd;
return 0;
out_err:
mutex_unlock(&lock);
module_put(THIS_MODULE);
return ret;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
@ -225,42 +192,23 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
* irq_bypass_unregister_consumer - unregister IRQ bypass consumer
* @consumer: pointer to consumer structure
*
* Remove a previously registered IRQ consumer from the list of consumers
* and disconnect it from any connected IRQ producer.
* Remove a previously registered IRQ consumer (note, it's safe to call this
* even if registration was unsuccessful). Disconnect from the associated
* producer, if one exists.
*/
void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
{
struct irq_bypass_consumer *tmp;
struct irq_bypass_producer *producer;
unsigned long index = (unsigned long)consumer->eventfd;
if (!consumer->token)
if (!consumer->eventfd)
return;
might_sleep();
guard(mutex)(&lock);
if (!try_module_get(THIS_MODULE))
return; /* nothing in the list anyway */
if (consumer->producer)
__disconnect(consumer->producer, consumer);
mutex_lock(&lock);
list_for_each_entry(tmp, &consumers, node) {
if (tmp != consumer)
continue;
list_for_each_entry(producer, &producers, node) {
if (producer->token == consumer->token) {
__disconnect(producer, consumer);
break;
}
}
list_del(&consumer->node);
module_put(THIS_MODULE);
break;
}
mutex_unlock(&lock);
module_put(THIS_MODULE);
WARN_ON_ONCE(xa_erase(&consumers, index) != consumer);
consumer->eventfd = NULL;
}
EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);
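A hedged sketch of the uniqueness guarantee the xarray-backed registration provides (demo_* names hypothetical): a second consumer that registers with an eventfd already claimed by another consumer is rejected, because xa_insert() returns -EBUSY for an occupied index.

static int demo_nop_add(struct irq_bypass_consumer *cons,
			struct irq_bypass_producer *prod)
{
	return 0;
}

static void demo_nop_del(struct irq_bypass_consumer *cons,
			 struct irq_bypass_producer *prod)
{
}

static struct irq_bypass_consumer demo_c1 = {
	.add_producer = demo_nop_add,
	.del_producer = demo_nop_del,
};

static struct irq_bypass_consumer demo_c2 = {
	.add_producer = demo_nop_add,
	.del_producer = demo_nop_del,
};

static void demo_unique_eventfd(struct eventfd_ctx *eventfd)
{
	WARN_ON(irq_bypass_register_consumer(&demo_c1, eventfd));

	/* Same eventfd, different consumer: xa_insert() fails with -EBUSY. */
	WARN_ON(irq_bypass_register_consumer(&demo_c2, eventfd) != -EBUSY);

	irq_bypass_unregister_consumer(&demo_c1);
}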