Merge tag 'kvm-x86-irqs-6.17' of https://github.com/kvm-x86/linux into HEAD
KVM IRQ changes for 6.17

 - Rework irqbypass to track/match producers and consumers via an xarray
   instead of a linked list. Using a linked list leads to O(n^2) insertion
   times, which is hugely problematic for use cases that create large numbers
   of VMs. Such use cases typically don't actually use irqbypass, but
   eliminating the pointless registration is a future problem to solve as it
   likely requires new uAPI.

 - Track irqbypass's "token" as "struct eventfd_ctx *" instead of a "void *",
   to avoid making a simple concept unnecessarily difficult to understand.

 - Add CONFIG_KVM_IOAPIC for x86 to allow disabling support for I/O APIC,
   PIC, and PIT emulation at compile time.

 - Drop x86's irq_comm.c, and move a pile of IRQ related code into irq.c.

 - Fix a variety of flaws and bugs in the AVIC device posted IRQ code.

 - Inhibit AVIC if a vCPU's ID is too big (relative to what hardware
   supports) instead of rejecting vCPU creation.

 - Extend enable_ipiv module param support to SVM, by simply leaving
   IsRunning clear in the vCPU's physical ID table entry.

 - Disable IPI virtualization, via enable_ipiv, if the CPU is affected by
   erratum #1235, to allow (safely) enabling AVIC on such CPUs.

 - Dedup x86's device posted IRQ code, as the vast majority of functionality
   can be shared verbatim between SVM and VMX.

 - Harden the device posted IRQ code against bugs and runtime errors.

 - Use vcpu_idx, not vcpu_id, for GA log tag/metadata, to make lookups O(1)
   instead of O(n).

 - Generate GA Log interrupts if and only if the target vCPU is blocking,
   i.e. only if KVM needs a notification in order to wake the vCPU.

 - Decouple device posted IRQs from VFIO device assignment, as binding a VM
   to a VFIO group is not a requirement for enabling device posted IRQs.

 - Clean up and document/comment the irqfd assignment code.

 - Disallow binding multiple irqfds to an eventfd with a priority waiter,
   i.e. ensure an eventfd is bound to at most one irqfd through the entire
   host, and add a selftest to verify eventfd:irqfd bindings are globally
   unique.
commit f02b1bcc73
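The first item above replaces irqbypass's global linked list with an xarray so that producer/consumer registration no longer scans every existing binding. As a rough illustration of that idea only (not the actual virt/lib/irqbypass.c rework), here is a minimal sketch keyed by the shared eventfd_ctx token; the struct layout, helper name, and omission of locking are simplifying assumptions for this example.

```c
/*
 * Illustrative sketch only: pair irqbypass producers and consumers by
 * their eventfd_ctx token via an xarray instead of walking a global
 * linked list on every registration.  Names, layout, and the missing
 * locking are assumptions, not the real irqbypass implementation.
 */
#include <linux/xarray.h>
#include <linux/irqbypass.h>
#include <linux/eventfd.h>
#include <linux/slab.h>
#include <linux/errno.h>

struct demo_bypass_entry {
	struct irq_bypass_producer *producer;	/* NULL until registered */
	struct irq_bypass_consumer *consumer;	/* NULL until registered */
};

/* Keyed by the eventfd_ctx pointer shared by producer and consumer. */
static DEFINE_XARRAY(demo_bypass_entries);

static int demo_register_consumer(struct eventfd_ctx *token,
				  struct irq_bypass_consumer *consumer)
{
	unsigned long key = (unsigned long)token;
	struct demo_bypass_entry *entry;

	/* O(1) lookup, versus scanning every existing binding. */
	entry = xa_load(&demo_bypass_entries, key);
	if (!entry) {
		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
		if (!entry)
			return -ENOMEM;
		if (xa_insert(&demo_bypass_entries, key, entry, GFP_KERNEL)) {
			kfree(entry);
			return -EBUSY;
		}
	}

	if (entry->consumer)
		return -EBUSY;		/* one consumer per token */

	entry->consumer = consumer;

	/* If the producer registered first, connect the two now. */
	if (entry->producer)
		return consumer->add_producer(consumer, entry->producer);
	return 0;
}
```

Keying the lookup on the eventfd pointer turns each registration into a single xarray lookup/insert, which is the gist of why insertion cost no longer grows quadratically with the number of VMs.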
@@ -2765,19 +2765,15 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 	kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq);
 }
 
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
-				  struct kvm_kernel_irq_routing_entry *new)
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+				   struct kvm_kernel_irq_routing_entry *old,
+				   struct kvm_kernel_irq_routing_entry *new)
 {
-	if (old->type != KVM_IRQ_ROUTING_MSI ||
-	    new->type != KVM_IRQ_ROUTING_MSI)
-		return true;
+	if (old->type == KVM_IRQ_ROUTING_MSI &&
+	    new->type == KVM_IRQ_ROUTING_MSI &&
+	    !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
+		return;
 
-	return memcmp(&old->msi, &new->msi, sizeof(new->msi));
-}
-
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
-				  uint32_t guest_irq, bool set)
-{
 	/*
 	 * Remapping the vLPI requires taking the its_lock mutex to resolve
 	 * the new translation. We're in spinlock land at this point, so no
@@ -2785,7 +2781,7 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
 	 *
 	 * Unmap the vLPI and fall back to software LPI injection.
 	 */
-	return kvm_vgic_v4_unset_forwarding(kvm, host_irq);
+	return kvm_vgic_v4_unset_forwarding(irqfd->kvm, irqfd->producer->irq);
 }
 
 void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
@@ -758,7 +758,7 @@ static void its_free_ite(struct kvm *kvm, struct its_ite *ite)
 	if (irq) {
 		scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
 			if (irq->hw)
-				WARN_ON(its_unmap_vlpi(ite->irq->host_irq));
+				its_unmap_vlpi(ite->irq->host_irq);
 
 			irq->hw = false;
 		}
@@ -527,28 +527,26 @@ static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq)
 	return NULL;
 }
 
-int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
+void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
 {
 	struct vgic_irq *irq;
 	unsigned long flags;
-	int ret = 0;
 
 	if (!vgic_supports_direct_msis(kvm))
-		return 0;
+		return;
 
 	irq = __vgic_host_irq_get_vlpi(kvm, host_irq);
 	if (!irq)
-		return 0;
+		return;
 
 	raw_spin_lock_irqsave(&irq->irq_lock, flags);
 	WARN_ON(irq->hw && irq->host_irq != host_irq);
 	if (irq->hw) {
 		atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
 		irq->hw = false;
-		ret = its_unmap_vlpi(host_irq);
+		its_unmap_vlpi(host_irq);
 	}
 
 	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
 	vgic_put_irq(kvm, irq);
-	return ret;
 }
@@ -26,7 +26,22 @@ enum {
 	IRQ_REMAP_X2APIC_MODE,
 };
 
-struct vcpu_data {
+/*
+ * This is mainly used to communicate information back-and-forth
+ * between SVM and IOMMU for setting up and tearing down posted
+ * interrupt
+ */
+struct amd_iommu_pi_data {
+	u64 vapic_addr;		/* Physical address of the vCPU's vAPIC. */
+	u32 ga_tag;
+	u32 vector;		/* Guest vector of the interrupt */
+	int cpu;
+	bool ga_log_intr;
+	bool is_guest_mode;
+	void *ir_data;
+};
+
+struct intel_iommu_pi_data {
 	u64 pi_desc_addr;	/* Physical address of PI Descriptor */
 	u32 vector;		/* Guest vector of the interrupt */
 };
@@ -112,7 +112,7 @@ KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
 KVM_X86_OP_OPTIONAL(vcpu_blocking)
 KVM_X86_OP_OPTIONAL(vcpu_unblocking)
 KVM_X86_OP_OPTIONAL(pi_update_irte)
-KVM_X86_OP_OPTIONAL(pi_start_assignment)
+KVM_X86_OP_OPTIONAL(pi_start_bypass)
 KVM_X86_OP_OPTIONAL(apicv_pre_state_restore)
 KVM_X86_OP_OPTIONAL(apicv_post_state_restore)
 KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)
@@ -297,6 +297,7 @@ enum x86_intercept_stage;
  */
 #define KVM_APIC_PV_EOI_PENDING 1
 
+struct kvm_kernel_irqfd;
 struct kvm_kernel_irq_routing_entry;
 
 /*
@@ -1320,6 +1321,12 @@ enum kvm_apicv_inhibit {
 	 */
 	APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
 
+	/*
+	 * AVIC is disabled because the vCPU's APIC ID is beyond the max
+	 * supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable.
+	 */
+	APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG,
+
 	NR_APICV_INHIBIT_REASONS,
 };
 
@@ -1338,7 +1345,8 @@ enum kvm_apicv_inhibit {
 	__APICV_INHIBIT_REASON(IRQWIN), \
 	__APICV_INHIBIT_REASON(PIT_REINJ), \
 	__APICV_INHIBIT_REASON(SEV), \
-	__APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED)
+	__APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \
+	__APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG)
 
 struct kvm_arch {
 	unsigned long n_used_mmu_pages;
@ -1381,9 +1389,13 @@ struct kvm_arch {
|
||||||
atomic_t noncoherent_dma_count;
|
atomic_t noncoherent_dma_count;
|
||||||
#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
|
#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
|
||||||
atomic_t assigned_device_count;
|
atomic_t assigned_device_count;
|
||||||
|
unsigned long nr_possible_bypass_irqs;
|
||||||
|
|
||||||
|
#ifdef CONFIG_KVM_IOAPIC
|
||||||
struct kvm_pic *vpic;
|
struct kvm_pic *vpic;
|
||||||
struct kvm_ioapic *vioapic;
|
struct kvm_ioapic *vioapic;
|
||||||
struct kvm_pit *vpit;
|
struct kvm_pit *vpit;
|
||||||
|
#endif
|
||||||
atomic_t vapics_in_nmi_mode;
|
atomic_t vapics_in_nmi_mode;
|
||||||
struct mutex apic_map_lock;
|
struct mutex apic_map_lock;
|
||||||
struct kvm_apic_map __rcu *apic_map;
|
struct kvm_apic_map __rcu *apic_map;
|
||||||
|
|
@ -1403,7 +1415,6 @@ struct kvm_arch {
|
||||||
bool pause_in_guest;
|
bool pause_in_guest;
|
||||||
bool cstate_in_guest;
|
bool cstate_in_guest;
|
||||||
|
|
||||||
unsigned long irq_sources_bitmap;
|
|
||||||
s64 kvmclock_offset;
|
s64 kvmclock_offset;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
@ -1432,9 +1443,6 @@ struct kvm_arch {
|
||||||
struct delayed_work kvmclock_update_work;
|
struct delayed_work kvmclock_update_work;
|
||||||
struct delayed_work kvmclock_sync_work;
|
struct delayed_work kvmclock_sync_work;
|
||||||
|
|
||||||
/* reads protected by irq_srcu, writes by irq_lock */
|
|
||||||
struct hlist_head mask_notifier_list;
|
|
||||||
|
|
||||||
#ifdef CONFIG_KVM_HYPERV
|
#ifdef CONFIG_KVM_HYPERV
|
||||||
struct kvm_hv hyperv;
|
struct kvm_hv hyperv;
|
||||||
#endif
|
#endif
|
||||||
|
|
@@ -1853,9 +1861,10 @@ struct kvm_x86_ops {
 	void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
 	void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
 
-	int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq,
-			      uint32_t guest_irq, bool set);
-	void (*pi_start_assignment)(struct kvm *kvm);
+	int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+			      unsigned int host_irq, uint32_t guest_irq,
+			      struct kvm_vcpu *vcpu, u32 vector);
+	void (*pi_start_bypass)(struct kvm *kvm);
 	void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu);
 	void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
 	bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
@@ -1950,6 +1959,7 @@ struct kvm_arch_async_pf {
 extern u32 __read_mostly kvm_nr_uret_msrs;
 extern bool __read_mostly allow_smaller_maxphyaddr;
 extern bool __read_mostly enable_apicv;
+extern bool __read_mostly enable_ipiv;
 extern bool __read_mostly enable_device_posted_irqs;
 extern struct kvm_x86_ops kvm_x86_ops;
 
@ -2044,19 +2054,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
|
||||||
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
|
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
|
||||||
const void *val, int bytes);
|
const void *val, int bytes);
|
||||||
|
|
||||||
struct kvm_irq_mask_notifier {
|
|
||||||
void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
|
|
||||||
int irq;
|
|
||||||
struct hlist_node link;
|
|
||||||
};
|
|
||||||
|
|
||||||
void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
|
|
||||||
struct kvm_irq_mask_notifier *kimn);
|
|
||||||
void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
|
|
||||||
struct kvm_irq_mask_notifier *kimn);
|
|
||||||
void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
|
|
||||||
bool mask);
|
|
||||||
|
|
||||||
extern bool tdp_enabled;
|
extern bool tdp_enabled;
|
||||||
|
|
||||||
u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
|
u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
|
||||||
|
|
@ -2215,9 +2212,6 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
|
||||||
return !!(*irq_state);
|
return !!(*irq_state);
|
||||||
}
|
}
|
||||||
|
|
||||||
int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
|
|
||||||
void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
|
|
||||||
|
|
||||||
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
|
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
|
||||||
int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
|
int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
|
||||||
|
|
||||||
|
|
@ -2394,9 +2388,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
|
||||||
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
|
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
|
||||||
struct kvm_vcpu **dest_vcpu);
|
struct kvm_vcpu **dest_vcpu);
|
||||||
|
|
||||||
void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
|
|
||||||
struct kvm_lapic_irq *irq);
|
|
||||||
|
|
||||||
static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
|
static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
|
||||||
{
|
{
|
||||||
/* We can only post Fixed and LowPrio IRQs */
|
/* We can only post Fixed and LowPrio IRQs */
|
||||||
|
|
|
||||||
|
|
@@ -252,16 +252,21 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define AVIC_LOGICAL_ID_ENTRY_VALID_BIT			31
 #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK		(1 << 31)
 
+/*
+ * GA_LOG_INTR is a synthetic flag that's never propagated to hardware-visible
+ * tables. GA_LOG_INTR is set if the vCPU needs device posted IRQs to generate
+ * GA log interrupts to wake the vCPU (because it's blocking or about to block).
+ */
+#define AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR		BIT_ULL(61)
+
 #define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK	GENMASK_ULL(11, 0)
-#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK	(0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK	GENMASK_ULL(51, 12)
 #define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK		(1ULL << 62)
 #define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK		(1ULL << 63)
 #define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK		(0xFFULL)
 
 #define AVIC_DOORBELL_PHYSICAL_ID_MASK			GENMASK_ULL(11, 0)
 
-#define VMCB_AVIC_APIC_BAR_MASK				0xFFFFFFFFFF000ULL
-
 #define AVIC_UNACCEL_ACCESS_WRITE_MASK		1
 #define AVIC_UNACCEL_ACCESS_OFFSET_MASK		0xFF0
 #define AVIC_UNACCEL_ACCESS_VECTOR_MASK		0xFFFFFFFF
@ -290,8 +295,6 @@ enum avic_ipi_failure_cause {
|
||||||
static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID);
|
static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID);
|
||||||
static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID);
|
static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID);
|
||||||
|
|
||||||
#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
|
|
||||||
|
|
||||||
#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0)
|
#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0)
|
||||||
#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3)
|
#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3)
|
||||||
#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
|
#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
|
||||||
|
|
|
||||||
|
|
@@ -166,6 +166,16 @@ config KVM_AMD_SEV
 	  Encrypted State (SEV-ES), and Secure Encrypted Virtualization with
 	  Secure Nested Paging (SEV-SNP) technologies on AMD processors.
 
+config KVM_IOAPIC
+	bool "I/O APIC, PIC, and PIT emulation"
+	default y
+	depends on KVM
+	help
+	  Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e.
+	  for full in-kernel APIC emulation.
+
+	  If unsure, say Y.
+
 config KVM_SMM
 	bool "System Management Mode emulation"
 	default y
@@ -5,12 +5,11 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror
 
 include $(srctree)/virt/kvm/Makefile.kvm
 
-kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
-			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
-			   debugfs.o mmu/mmu.o mmu/page_track.o \
-			   mmu/spte.o
+kvm-y			+= x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \
+			   debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o
 
 kvm-$(CONFIG_X86_64)	+= mmu/tdp_iter.o mmu/tdp_mmu.o
+kvm-$(CONFIG_KVM_IOAPIC)	+= i8259.o i8254.o ioapic.o
 kvm-$(CONFIG_KVM_HYPERV)	+= hyperv.o
 kvm-$(CONFIG_KVM_XEN)	+= xen.o
 kvm-$(CONFIG_KVM_SMM)	+= smm.o
@@ -497,15 +497,19 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
 	return ret;
 }
 
-int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+			 int irq_source_id, int level, bool line_status)
 {
 	struct kvm_vcpu_hv_synic *synic;
 
-	synic = synic_get(kvm, vpidx);
+	if (!level)
+		return -1;
+
+	synic = synic_get(kvm, e->hv_sint.vcpu);
 	if (!synic)
 		return -EINVAL;
 
-	return synic_set_irq(synic, sint);
+	return synic_set_irq(synic, e->hv_sint.sint);
 }
 
 void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
@@ -103,7 +103,8 @@ static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
 
 void kvm_hv_irq_routing_update(struct kvm *kvm);
-int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+			 int irq_source_id, int level, bool line_status);
 void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
 int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
 
@@ -248,8 +248,8 @@ static void pit_do_work(struct kthread_work *work)
 	if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
 		return;
 
-	kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false);
-	kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false);
+	kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 1, false);
+	kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 0, false);
 
 	/*
 	 * Provides NMI watchdog support via Virtual Wire mode.
@ -288,7 +288,7 @@ static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
|
||||||
atomic_set(&pit->pit_state.irq_ack, 1);
|
atomic_set(&pit->pit_state.irq_ack, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
|
static void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
|
||||||
{
|
{
|
||||||
struct kvm_kpit_state *ps = &pit->pit_state;
|
struct kvm_kpit_state *ps = &pit->pit_state;
|
||||||
struct kvm *kvm = pit->kvm;
|
struct kvm *kvm = pit->kvm;
|
||||||
|
|
@ -400,8 +400,8 @@ static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
|
static void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
|
||||||
int hpet_legacy_start)
|
int hpet_legacy_start)
|
||||||
{
|
{
|
||||||
u8 saved_mode;
|
u8 saved_mode;
|
||||||
|
|
||||||
|
|
@ -649,6 +649,79 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
|
||||||
kvm_pit_reset_reinject(pit);
|
kvm_pit_reset_reinject(pit);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
|
||||||
|
{
|
||||||
|
struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
|
||||||
|
|
||||||
|
BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
|
||||||
|
|
||||||
|
mutex_lock(&kps->lock);
|
||||||
|
memcpy(ps, &kps->channels, sizeof(*ps));
|
||||||
|
mutex_unlock(&kps->lock);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
struct kvm_pit *pit = kvm->arch.vpit;
|
||||||
|
|
||||||
|
mutex_lock(&pit->pit_state.lock);
|
||||||
|
memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
|
||||||
|
for (i = 0; i < 3; i++)
|
||||||
|
kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
|
||||||
|
mutex_unlock(&pit->pit_state.lock);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
|
||||||
|
{
|
||||||
|
mutex_lock(&kvm->arch.vpit->pit_state.lock);
|
||||||
|
memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
|
||||||
|
sizeof(ps->channels));
|
||||||
|
ps->flags = kvm->arch.vpit->pit_state.flags;
|
||||||
|
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
|
||||||
|
memset(&ps->reserved, 0, sizeof(ps->reserved));
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
|
||||||
|
{
|
||||||
|
int start = 0;
|
||||||
|
int i;
|
||||||
|
u32 prev_legacy, cur_legacy;
|
||||||
|
struct kvm_pit *pit = kvm->arch.vpit;
|
||||||
|
|
||||||
|
mutex_lock(&pit->pit_state.lock);
|
||||||
|
prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
|
||||||
|
cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
|
||||||
|
if (!prev_legacy && cur_legacy)
|
||||||
|
start = 1;
|
||||||
|
memcpy(&pit->pit_state.channels, &ps->channels,
|
||||||
|
sizeof(pit->pit_state.channels));
|
||||||
|
pit->pit_state.flags = ps->flags;
|
||||||
|
for (i = 0; i < 3; i++)
|
||||||
|
kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
|
||||||
|
start && i == 0);
|
||||||
|
mutex_unlock(&pit->pit_state.lock);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control)
|
||||||
|
{
|
||||||
|
struct kvm_pit *pit = kvm->arch.vpit;
|
||||||
|
|
||||||
|
/* pit->pit_state.lock was overloaded to prevent userspace from getting
|
||||||
|
* an inconsistent state after running multiple KVM_REINJECT_CONTROL
|
||||||
|
* ioctls in parallel. Use a separate lock if that ioctl isn't rare.
|
||||||
|
*/
|
||||||
|
mutex_lock(&pit->pit_state.lock);
|
||||||
|
kvm_pit_set_reinject(pit, control->pit_reinject);
|
||||||
|
mutex_unlock(&pit->pit_state.lock);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
static const struct kvm_io_device_ops pit_dev_ops = {
|
static const struct kvm_io_device_ops pit_dev_ops = {
|
||||||
.read = pit_ioport_read,
|
.read = pit_ioport_read,
|
||||||
.write = pit_ioport_write,
|
.write = pit_ioport_write,
|
||||||
|
|
@ -671,10 +744,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
|
||||||
if (!pit)
|
if (!pit)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
pit->irq_source_id = kvm_request_irq_source_id(kvm);
|
|
||||||
if (pit->irq_source_id < 0)
|
|
||||||
goto fail_request;
|
|
||||||
|
|
||||||
mutex_init(&pit->pit_state.lock);
|
mutex_init(&pit->pit_state.lock);
|
||||||
|
|
||||||
pid = get_pid(task_tgid(current));
|
pid = get_pid(task_tgid(current));
|
||||||
|
|
@ -726,8 +795,6 @@ fail_register_pit:
|
||||||
kvm_pit_set_reinject(pit, false);
|
kvm_pit_set_reinject(pit, false);
|
||||||
kthread_destroy_worker(pit->worker);
|
kthread_destroy_worker(pit->worker);
|
||||||
fail_kthread:
|
fail_kthread:
|
||||||
kvm_free_irq_source_id(kvm, pit->irq_source_id);
|
|
||||||
fail_request:
|
|
||||||
kfree(pit);
|
kfree(pit);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
@ -744,7 +811,6 @@ void kvm_free_pit(struct kvm *kvm)
|
||||||
kvm_pit_set_reinject(pit, false);
|
kvm_pit_set_reinject(pit, false);
|
||||||
hrtimer_cancel(&pit->pit_state.timer);
|
hrtimer_cancel(&pit->pit_state.timer);
|
||||||
kthread_destroy_worker(pit->worker);
|
kthread_destroy_worker(pit->worker);
|
||||||
kvm_free_irq_source_id(kvm, pit->irq_source_id);
|
|
||||||
kfree(pit);
|
kfree(pit);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,11 @@
|
||||||
|
|
||||||
#include <kvm/iodev.h>
|
#include <kvm/iodev.h>
|
||||||
|
|
||||||
|
#include <uapi/asm/kvm.h>
|
||||||
|
|
||||||
|
#include "ioapic.h"
|
||||||
|
|
||||||
|
#ifdef CONFIG_KVM_IOAPIC
|
||||||
struct kvm_kpit_channel_state {
|
struct kvm_kpit_channel_state {
|
||||||
u32 count; /* can be 65536 */
|
u32 count; /* can be 65536 */
|
||||||
u16 latched_count;
|
u16 latched_count;
|
||||||
|
|
@ -42,7 +47,6 @@ struct kvm_pit {
|
||||||
struct kvm_io_device speaker_dev;
|
struct kvm_io_device speaker_dev;
|
||||||
struct kvm *kvm;
|
struct kvm *kvm;
|
||||||
struct kvm_kpit_state pit_state;
|
struct kvm_kpit_state pit_state;
|
||||||
int irq_source_id;
|
|
||||||
struct kvm_irq_mask_notifier mask_notifier;
|
struct kvm_irq_mask_notifier mask_notifier;
|
||||||
struct kthread_worker *worker;
|
struct kthread_worker *worker;
|
||||||
struct kthread_work expired;
|
struct kthread_work expired;
|
||||||
|
|
@ -55,11 +59,14 @@ struct kvm_pit {
|
||||||
#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
|
#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
|
||||||
#define KVM_PIT_CHANNEL_MASK 0x3
|
#define KVM_PIT_CHANNEL_MASK 0x3
|
||||||
|
|
||||||
|
int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps);
|
||||||
|
int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps);
|
||||||
|
int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
|
||||||
|
int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
|
||||||
|
int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control);
|
||||||
|
|
||||||
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
|
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
|
||||||
void kvm_free_pit(struct kvm *kvm);
|
void kvm_free_pit(struct kvm *kvm);
|
||||||
|
#endif /* CONFIG_KVM_IOAPIC */
|
||||||
void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
|
|
||||||
int hpet_legacy_start);
|
|
||||||
void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject);
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -31,6 +31,8 @@
|
||||||
#include <linux/mm.h>
|
#include <linux/mm.h>
|
||||||
#include <linux/slab.h>
|
#include <linux/slab.h>
|
||||||
#include <linux/bitops.h>
|
#include <linux/bitops.h>
|
||||||
|
|
||||||
|
#include "ioapic.h"
|
||||||
#include "irq.h"
|
#include "irq.h"
|
||||||
|
|
||||||
#include <linux/kvm_host.h>
|
#include <linux/kvm_host.h>
|
||||||
|
|
@ -185,8 +187,11 @@ void kvm_pic_update_irq(struct kvm_pic *s)
|
||||||
pic_unlock(s);
|
pic_unlock(s);
|
||||||
}
|
}
|
||||||
|
|
||||||
int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
|
int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
|
||||||
|
int irq_source_id, int level, bool line_status)
|
||||||
{
|
{
|
||||||
|
struct kvm_pic *s = kvm->arch.vpic;
|
||||||
|
int irq = e->irqchip.pin;
|
||||||
int ret, irq_level;
|
int ret, irq_level;
|
||||||
|
|
||||||
BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
|
BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
|
||||||
|
|
@ -203,16 +208,6 @@ int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
|
|
||||||
pic_lock(s);
|
|
||||||
for (i = 0; i < PIC_NUM_PINS; i++)
|
|
||||||
__clear_bit(irq_source_id, &s->irq_states[i]);
|
|
||||||
pic_unlock(s);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* acknowledge interrupt 'irq'
|
* acknowledge interrupt 'irq'
|
||||||
*/
|
*/
|
||||||
|
|
|
||||||
|
|
@ -41,11 +41,11 @@
|
||||||
#include <asm/processor.h>
|
#include <asm/processor.h>
|
||||||
#include <asm/page.h>
|
#include <asm/page.h>
|
||||||
#include <asm/current.h>
|
#include <asm/current.h>
|
||||||
#include <trace/events/kvm.h>
|
|
||||||
|
|
||||||
#include "ioapic.h"
|
#include "ioapic.h"
|
||||||
#include "lapic.h"
|
#include "lapic.h"
|
||||||
#include "irq.h"
|
#include "irq.h"
|
||||||
|
#include "trace.h"
|
||||||
|
|
||||||
static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
|
static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
|
||||||
bool line_status);
|
bool line_status);
|
||||||
|
|
@ -310,6 +310,42 @@ void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
|
||||||
kvm_make_scan_ioapic_request(kvm);
|
kvm_make_scan_ioapic_request(kvm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
|
||||||
|
struct kvm_irq_mask_notifier *kimn)
|
||||||
|
{
|
||||||
|
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
|
||||||
|
|
||||||
|
mutex_lock(&kvm->irq_lock);
|
||||||
|
kimn->irq = irq;
|
||||||
|
hlist_add_head_rcu(&kimn->link, &ioapic->mask_notifier_list);
|
||||||
|
mutex_unlock(&kvm->irq_lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
|
||||||
|
struct kvm_irq_mask_notifier *kimn)
|
||||||
|
{
|
||||||
|
mutex_lock(&kvm->irq_lock);
|
||||||
|
hlist_del_rcu(&kimn->link);
|
||||||
|
mutex_unlock(&kvm->irq_lock);
|
||||||
|
synchronize_srcu(&kvm->irq_srcu);
|
||||||
|
}
|
||||||
|
|
||||||
|
void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
|
||||||
|
bool mask)
|
||||||
|
{
|
||||||
|
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
|
||||||
|
struct kvm_irq_mask_notifier *kimn;
|
||||||
|
int idx, gsi;
|
||||||
|
|
||||||
|
idx = srcu_read_lock(&kvm->irq_srcu);
|
||||||
|
gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
|
||||||
|
if (gsi != -1)
|
||||||
|
hlist_for_each_entry_rcu(kimn, &ioapic->mask_notifier_list, link)
|
||||||
|
if (kimn->irq == gsi)
|
||||||
|
kimn->func(kimn, mask);
|
||||||
|
srcu_read_unlock(&kvm->irq_srcu, idx);
|
||||||
|
}
|
||||||
|
|
||||||
static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
|
static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
|
||||||
{
|
{
|
||||||
unsigned index;
|
unsigned index;
|
||||||
|
|
@ -479,9 +515,11 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
|
int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
|
||||||
int level, bool line_status)
|
int irq_source_id, int level, bool line_status)
|
||||||
{
|
{
|
||||||
|
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
|
||||||
|
int irq = e->irqchip.pin;
|
||||||
int ret, irq_level;
|
int ret, irq_level;
|
||||||
|
|
||||||
BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
|
BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
|
||||||
|
|
@ -496,16 +534,6 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
|
|
||||||
spin_lock(&ioapic->lock);
|
|
||||||
for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
|
|
||||||
__clear_bit(irq_source_id, &ioapic->irq_states[i]);
|
|
||||||
spin_unlock(&ioapic->lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
|
static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
|
@ -718,6 +746,7 @@ int kvm_ioapic_init(struct kvm *kvm)
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
spin_lock_init(&ioapic->lock);
|
spin_lock_init(&ioapic->lock);
|
||||||
INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
|
INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
|
||||||
|
INIT_HLIST_HEAD(&ioapic->mask_notifier_list);
|
||||||
kvm->arch.vioapic = ioapic;
|
kvm->arch.vioapic = ioapic;
|
||||||
kvm_ioapic_reset(ioapic);
|
kvm_ioapic_reset(ioapic);
|
||||||
kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
|
kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
|
||||||
|
|
|
||||||
|
|
@ -86,8 +86,24 @@ struct kvm_ioapic {
|
||||||
struct delayed_work eoi_inject;
|
struct delayed_work eoi_inject;
|
||||||
u32 irq_eoi[IOAPIC_NUM_PINS];
|
u32 irq_eoi[IOAPIC_NUM_PINS];
|
||||||
u32 irr_delivered;
|
u32 irr_delivered;
|
||||||
|
|
||||||
|
/* reads protected by irq_srcu, writes by irq_lock */
|
||||||
|
struct hlist_head mask_notifier_list;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct kvm_irq_mask_notifier {
|
||||||
|
void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
|
||||||
|
int irq;
|
||||||
|
struct hlist_node link;
|
||||||
|
};
|
||||||
|
|
||||||
|
void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
|
||||||
|
struct kvm_irq_mask_notifier *kimn);
|
||||||
|
void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
|
||||||
|
struct kvm_irq_mask_notifier *kimn);
|
||||||
|
void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
|
||||||
|
bool mask);
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
#define ASSERT(x) \
|
#define ASSERT(x) \
|
||||||
do { \
|
do { \
|
||||||
|
|
@ -103,7 +119,7 @@ do { \
|
||||||
|
|
||||||
static inline int ioapic_in_kernel(struct kvm *kvm)
|
static inline int ioapic_in_kernel(struct kvm *kvm)
|
||||||
{
|
{
|
||||||
return irqchip_kernel(kvm);
|
return irqchip_full(kvm);
|
||||||
}
|
}
|
||||||
|
|
||||||
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
|
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
|
||||||
|
|
@ -111,9 +127,9 @@ void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
|
||||||
int trigger_mode);
|
int trigger_mode);
|
||||||
int kvm_ioapic_init(struct kvm *kvm);
|
int kvm_ioapic_init(struct kvm *kvm);
|
||||||
void kvm_ioapic_destroy(struct kvm *kvm);
|
void kvm_ioapic_destroy(struct kvm *kvm);
|
||||||
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
|
int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
|
||||||
int level, bool line_status);
|
int irq_source_id, int level, bool line_status);
|
||||||
void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
|
|
||||||
void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
|
void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
|
||||||
void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
|
void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
|
||||||
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
|
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
|
||||||
|
|
|
||||||
|
|
@ -11,9 +11,12 @@
|
||||||
|
|
||||||
#include <linux/export.h>
|
#include <linux/export.h>
|
||||||
#include <linux/kvm_host.h>
|
#include <linux/kvm_host.h>
|
||||||
|
#include <linux/kvm_irqfd.h>
|
||||||
|
|
||||||
|
#include "hyperv.h"
|
||||||
|
#include "ioapic.h"
|
||||||
#include "irq.h"
|
#include "irq.h"
|
||||||
#include "i8254.h"
|
#include "trace.h"
|
||||||
#include "x86.h"
|
#include "x86.h"
|
||||||
#include "xen.h"
|
#include "xen.h"
|
||||||
|
|
||||||
|
|
@ -41,6 +44,14 @@ static int pending_userspace_extint(struct kvm_vcpu *v)
|
||||||
return v->arch.pending_external_vector != -1;
|
return v->arch.pending_external_vector != -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int get_userspace_extint(struct kvm_vcpu *vcpu)
|
||||||
|
{
|
||||||
|
int vector = vcpu->arch.pending_external_vector;
|
||||||
|
|
||||||
|
vcpu->arch.pending_external_vector = -1;
|
||||||
|
return vector;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* check if there is pending interrupt from
|
* check if there is pending interrupt from
|
||||||
* non-APIC source without intack.
|
* non-APIC source without intack.
|
||||||
|
|
@ -67,10 +78,13 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v)
|
||||||
if (!kvm_apic_accept_pic_intr(v))
|
if (!kvm_apic_accept_pic_intr(v))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
if (irqchip_split(v->kvm))
|
#ifdef CONFIG_KVM_IOAPIC
|
||||||
return pending_userspace_extint(v);
|
if (pic_in_kernel(v->kvm))
|
||||||
else
|
|
||||||
return v->kvm->arch.vpic->output;
|
return v->kvm->arch.vpic->output;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
WARN_ON_ONCE(!irqchip_split(v->kvm));
|
||||||
|
return pending_userspace_extint(v);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
@ -126,13 +140,13 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v)
|
||||||
return v->kvm->arch.xen.upcall_vector;
|
return v->kvm->arch.xen.upcall_vector;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (irqchip_split(v->kvm)) {
|
#ifdef CONFIG_KVM_IOAPIC
|
||||||
int vector = v->arch.pending_external_vector;
|
if (pic_in_kernel(v->kvm))
|
||||||
|
|
||||||
v->arch.pending_external_vector = -1;
|
|
||||||
return vector;
|
|
||||||
} else
|
|
||||||
return kvm_pic_read_irq(v->kvm); /* PIC */
|
return kvm_pic_read_irq(v->kvm); /* PIC */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
WARN_ON_ONCE(!irqchip_split(v->kvm));
|
||||||
|
return get_userspace_extint(v);
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(kvm_cpu_get_extint);
|
EXPORT_SYMBOL_GPL(kvm_cpu_get_extint);
|
||||||
|
|
||||||
|
|
@ -163,7 +177,9 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
|
||||||
void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
|
void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
|
||||||
{
|
{
|
||||||
__kvm_migrate_apic_timer(vcpu);
|
__kvm_migrate_apic_timer(vcpu);
|
||||||
|
#ifdef CONFIG_KVM_IOAPIC
|
||||||
__kvm_migrate_pit_timer(vcpu);
|
__kvm_migrate_pit_timer(vcpu);
|
||||||
|
#endif
|
||||||
kvm_x86_call(migrate_timers)(vcpu);
|
kvm_x86_call(migrate_timers)(vcpu);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -171,10 +187,539 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
|
||||||
{
|
{
|
||||||
bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE;
|
bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE;
|
||||||
|
|
||||||
return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm);
|
return resample ? irqchip_full(kvm) : irqchip_in_kernel(kvm);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
|
bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
|
||||||
{
|
{
|
||||||
return irqchip_in_kernel(kvm);
|
return irqchip_in_kernel(kvm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
|
||||||
|
struct kvm_lapic_irq *irq, struct dest_map *dest_map)
|
||||||
|
{
|
||||||
|
int r = -1;
|
||||||
|
struct kvm_vcpu *vcpu, *lowest = NULL;
|
||||||
|
unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
|
||||||
|
unsigned int dest_vcpus = 0;
|
||||||
|
|
||||||
|
if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
|
||||||
|
return r;
|
||||||
|
|
||||||
|
if (irq->dest_mode == APIC_DEST_PHYSICAL &&
|
||||||
|
irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
|
||||||
|
pr_info("apic: phys broadcast and lowest prio\n");
|
||||||
|
irq->delivery_mode = APIC_DM_FIXED;
|
||||||
|
}
|
||||||
|
|
||||||
|
memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
|
||||||
|
|
||||||
|
kvm_for_each_vcpu(i, vcpu, kvm) {
|
||||||
|
if (!kvm_apic_present(vcpu))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
|
||||||
|
irq->dest_id, irq->dest_mode))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (!kvm_lowest_prio_delivery(irq)) {
|
||||||
|
if (r < 0)
|
||||||
|
r = 0;
|
||||||
|
r += kvm_apic_set_irq(vcpu, irq, dest_map);
|
||||||
|
} else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
|
||||||
|
if (!kvm_vector_hashing_enabled()) {
|
||||||
|
if (!lowest)
|
||||||
|
lowest = vcpu;
|
||||||
|
else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
|
||||||
|
lowest = vcpu;
|
||||||
|
} else {
|
||||||
|
__set_bit(i, dest_vcpu_bitmap);
|
||||||
|
dest_vcpus++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dest_vcpus != 0) {
|
||||||
|
int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
|
||||||
|
dest_vcpu_bitmap, KVM_MAX_VCPUS);
|
||||||
|
|
||||||
|
lowest = kvm_get_vcpu(kvm, idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lowest)
|
||||||
|
r = kvm_apic_set_irq(lowest, irq, dest_map);
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void kvm_msi_to_lapic_irq(struct kvm *kvm,
|
||||||
|
struct kvm_kernel_irq_routing_entry *e,
|
||||||
|
struct kvm_lapic_irq *irq)
|
||||||
|
{
|
||||||
|
struct msi_msg msg = { .address_lo = e->msi.address_lo,
|
||||||
|
.address_hi = e->msi.address_hi,
|
||||||
|
.data = e->msi.data };
|
||||||
|
|
||||||
|
trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
|
||||||
|
(u64)msg.address_hi << 32 : 0), msg.data);
|
||||||
|
|
||||||
|
irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
|
||||||
|
irq->vector = msg.arch_data.vector;
|
||||||
|
irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
|
||||||
|
irq->trig_mode = msg.arch_data.is_level;
|
||||||
|
irq->delivery_mode = msg.arch_data.delivery_mode << 8;
|
||||||
|
irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
|
||||||
|
irq->level = 1;
|
||||||
|
irq->shorthand = APIC_DEST_NOSHORT;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool kvm_msi_route_invalid(struct kvm *kvm,
|
||||||
|
struct kvm_kernel_irq_routing_entry *e)
|
||||||
|
{
|
||||||
|
return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
|
||||||
|
}
|
||||||
|
|
||||||
|
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
|
||||||
|
struct kvm *kvm, int irq_source_id, int level, bool line_status)
|
||||||
|
{
|
||||||
|
struct kvm_lapic_irq irq;
|
||||||
|
|
||||||
|
if (kvm_msi_route_invalid(kvm, e))
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
if (!level)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
kvm_msi_to_lapic_irq(kvm, e, &irq);
|
||||||
|
|
||||||
|
return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
|
||||||
|
struct kvm *kvm, int irq_source_id, int level,
|
||||||
|
bool line_status)
|
||||||
|
{
|
||||||
|
struct kvm_lapic_irq irq;
|
||||||
|
int r;
|
||||||
|
|
||||||
|
switch (e->type) {
|
||||||
|
#ifdef CONFIG_KVM_HYPERV
|
||||||
|
case KVM_IRQ_ROUTING_HV_SINT:
|
||||||
|
return kvm_hv_synic_set_irq(e, kvm, irq_source_id, level,
|
||||||
|
line_status);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
case KVM_IRQ_ROUTING_MSI:
|
||||||
|
if (kvm_msi_route_invalid(kvm, e))
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
|
kvm_msi_to_lapic_irq(kvm, e, &irq);
|
||||||
|
|
||||||
|
if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
|
||||||
|
return r;
|
||||||
|
break;
|
||||||
|
|
||||||
|
#ifdef CONFIG_KVM_XEN
|
||||||
|
case KVM_IRQ_ROUTING_XEN_EVTCHN:
|
||||||
|
if (!level)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
|
||||||
|
#endif
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return -EWOULDBLOCK;
|
||||||
|
}
|
||||||
|
|
||||||
|
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
|
||||||
|
bool line_status)
|
||||||
|
{
|
||||||
|
if (!irqchip_in_kernel(kvm))
|
||||||
|
return -ENXIO;
|
||||||
|
|
||||||
|
irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
|
||||||
|
irq_event->irq, irq_event->level,
|
||||||
|
line_status);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
|
||||||
|
{
|
||||||
|
return irqchip_in_kernel(kvm);
|
||||||
|
}
|
||||||
|
|
||||||
|
int kvm_set_routing_entry(struct kvm *kvm,
|
||||||
|
struct kvm_kernel_irq_routing_entry *e,
|
||||||
|
const struct kvm_irq_routing_entry *ue)
|
||||||
|
{
|
||||||
|
/* We can't check irqchip_in_kernel() here as some callers are
|
||||||
|
* currently initializing the irqchip. Other callers should therefore
|
||||||
|
* check kvm_arch_can_set_irq_routing() before calling this function.
|
||||||
|
*/
|
||||||
|
switch (ue->type) {
|
||||||
|
#ifdef CONFIG_KVM_IOAPIC
|
||||||
|
case KVM_IRQ_ROUTING_IRQCHIP:
|
||||||
|
if (irqchip_split(kvm))
|
||||||
|
return -EINVAL;
|
||||||
|
e->irqchip.pin = ue->u.irqchip.pin;
|
||||||
|
switch (ue->u.irqchip.irqchip) {
|
||||||
|
case KVM_IRQCHIP_PIC_SLAVE:
|
||||||
|
e->irqchip.pin += PIC_NUM_PINS / 2;
|
||||||
|
fallthrough;
|
||||||
|
case KVM_IRQCHIP_PIC_MASTER:
|
||||||
|
if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
|
||||||
|
return -EINVAL;
|
||||||
|
e->set = kvm_pic_set_irq;
|
||||||
|
break;
|
||||||
|
case KVM_IRQCHIP_IOAPIC:
|
||||||
|
if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
|
||||||
|
return -EINVAL;
|
||||||
|
e->set = kvm_ioapic_set_irq;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
e->irqchip.irqchip = ue->u.irqchip.irqchip;
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
case KVM_IRQ_ROUTING_MSI:
|
||||||
|
e->set = kvm_set_msi;
|
||||||
|
e->msi.address_lo = ue->u.msi.address_lo;
|
||||||
|
e->msi.address_hi = ue->u.msi.address_hi;
|
||||||
|
e->msi.data = ue->u.msi.data;
|
||||||
|
|
||||||
|
if (kvm_msi_route_invalid(kvm, e))
|
||||||
|
return -EINVAL;
|
||||||
|
break;
|
||||||
|
#ifdef CONFIG_KVM_HYPERV
|
||||||
|
case KVM_IRQ_ROUTING_HV_SINT:
|
||||||
|
e->set = kvm_hv_synic_set_irq;
|
||||||
|
e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
|
||||||
|
e->hv_sint.sint = ue->u.hv_sint.sint;
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
#ifdef CONFIG_KVM_XEN
|
||||||
|
case KVM_IRQ_ROUTING_XEN_EVTCHN:
|
||||||
|
return kvm_xen_setup_evtchn(kvm, e, ue);
|
||||||
|
#endif
|
||||||
|
default:
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
|
||||||
|
struct kvm_vcpu **dest_vcpu)
|
||||||
|
{
|
||||||
|
int r = 0;
|
||||||
|
unsigned long i;
|
||||||
|
struct kvm_vcpu *vcpu;
|
||||||
|
|
||||||
|
if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
kvm_for_each_vcpu(i, vcpu, kvm) {
|
||||||
|
if (!kvm_apic_present(vcpu))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
|
||||||
|
irq->dest_id, irq->dest_mode))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (++r == 2)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
*dest_vcpu = vcpu;
|
||||||
|
}
|
||||||
|
|
||||||
|
return r == 1;
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
|
||||||
|
|
||||||
|
void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
|
||||||
|
u8 vector, unsigned long *ioapic_handled_vectors)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Intercept EOI if the vCPU is the target of the new IRQ routing, or
|
||||||
|
* the vCPU has a pending IRQ from the old routing, i.e. if the vCPU
|
||||||
|
* may receive a level-triggered IRQ in the future, or already received
|
||||||
|
* level-triggered IRQ. The EOI needs to be intercepted and forwarded
|
||||||
|
* to I/O APIC emulation so that the IRQ can be de-asserted.
|
||||||
|
*/
|
||||||
|
if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) {
|
||||||
|
__set_bit(vector, ioapic_handled_vectors);
|
||||||
|
} else if (kvm_apic_pending_eoi(vcpu, vector)) {
|
||||||
|
__set_bit(vector, ioapic_handled_vectors);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Track the highest pending EOI for which the vCPU is NOT the
|
||||||
|
* target in the new routing. Only the EOI for the IRQ that is
|
||||||
|
* in-flight (for the old routing) needs to be intercepted, any
|
||||||
|
* future IRQs that arrive on this vCPU will be coincidental to
|
||||||
|
* the level-triggered routing and don't need to be intercepted.
|
||||||
|
*/
|
||||||
|
if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi)
|
||||||
|
vcpu->arch.highest_stale_pending_ioapic_eoi = vector;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
|
||||||
|
ulong *ioapic_handled_vectors)
|
||||||
|
{
|
||||||
|
struct kvm *kvm = vcpu->kvm;
|
||||||
|
struct kvm_kernel_irq_routing_entry *entry;
|
||||||
|
struct kvm_irq_routing_table *table;
|
||||||
|
u32 i, nr_ioapic_pins;
|
||||||
|
int idx;
|
||||||
|
|
||||||
|
idx = srcu_read_lock(&kvm->irq_srcu);
|
||||||
|
table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
|
||||||
|
nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
|
||||||
|
kvm->arch.nr_reserved_ioapic_pins);
|
||||||
|
for (i = 0; i < nr_ioapic_pins; ++i) {
|
||||||
|
hlist_for_each_entry(entry, &table->map[i], link) {
|
||||||
|
struct kvm_lapic_irq irq;
|
||||||
|
|
||||||
|
if (entry->type != KVM_IRQ_ROUTING_MSI)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
kvm_msi_to_lapic_irq(vcpu->kvm, entry, &irq);
|
||||||
|
|
||||||
|
if (!irq.trig_mode)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode,
|
||||||
|
irq.vector, ioapic_handled_vectors);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
srcu_read_unlock(&kvm->irq_srcu, idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
void kvm_arch_irq_routing_update(struct kvm *kvm)
|
||||||
|
{
|
||||||
|
#ifdef CONFIG_KVM_HYPERV
|
||||||
|
kvm_hv_irq_routing_update(kvm);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (irqchip_split(kvm))
|
||||||
|
kvm_make_scan_ioapic_request(kvm);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd,
|
||||||
|
struct kvm_kernel_irq_routing_entry *entry)
|
||||||
|
{
|
||||||
|
unsigned int host_irq = irqfd->producer->irq;
|
||||||
|
struct kvm *kvm = irqfd->kvm;
|
||||||
|
struct kvm_vcpu *vcpu = NULL;
|
||||||
|
struct kvm_lapic_irq irq;
|
||||||
|
int r;
|
||||||
|
|
||||||
|
	if (WARN_ON_ONCE(!irqchip_in_kernel(kvm) || !kvm_arch_has_irq_bypass()))
		return -EINVAL;

	if (entry && entry->type == KVM_IRQ_ROUTING_MSI) {
		kvm_msi_to_lapic_irq(kvm, entry, &irq);

		/*
		 * Force remapped mode if hardware doesn't support posting the
		 * virtual interrupt to a vCPU. Only IRQs are postable (NMIs,
		 * SMIs, etc. are not), and neither AMD nor Intel IOMMUs support
		 * posting multicast/broadcast IRQs. If the interrupt can't be
		 * posted, the device MSI needs to be routed to the host so that
		 * the guest's desired interrupt can be synthesized by KVM.
		 *
		 * This means that KVM can only post lowest-priority interrupts
		 * if they have a single CPU as the destination, e.g. only if
		 * the guest has affined the interrupt to a single vCPU.
		 */
		if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
		    !kvm_irq_is_postable(&irq))
			vcpu = NULL;
	}

	if (!irqfd->irq_bypass_vcpu && !vcpu)
		return 0;

	r = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, host_irq, irqfd->gsi,
					 vcpu, irq.vector);
	if (r) {
		WARN_ON_ONCE(irqfd->irq_bypass_vcpu && !vcpu);
		irqfd->irq_bypass_vcpu = NULL;
		return r;
	}

	irqfd->irq_bypass_vcpu = vcpu;

	trace_kvm_pi_irte_update(host_irq, vcpu, irqfd->gsi, irq.vector, !!vcpu);
	return 0;
}

int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
				     struct irq_bypass_producer *prod)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);
	struct kvm *kvm = irqfd->kvm;
	int ret = 0;

	kvm_arch_start_assignment(irqfd->kvm);

	spin_lock_irq(&kvm->irqfds.lock);
	irqfd->producer = prod;

	if (!kvm->arch.nr_possible_bypass_irqs++)
		kvm_x86_call(pi_start_bypass)(kvm);

	if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
		ret = kvm_pi_update_irte(irqfd, &irqfd->irq_entry);
		if (ret) {
			kvm->arch.nr_possible_bypass_irqs--;
			kvm_arch_end_assignment(irqfd->kvm);
		}
	}
	spin_unlock_irq(&kvm->irqfds.lock);

	return ret;
}

void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
				      struct irq_bypass_producer *prod)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);
	struct kvm *kvm = irqfd->kvm;
	int ret;

	WARN_ON(irqfd->producer != prod);

	/*
	 * If the producer of an IRQ that is currently being posted to a vCPU
	 * is unregistered, change the associated IRTE back to remapped mode as
	 * the IRQ has been released (or repurposed) by the device driver, i.e.
	 * KVM must relinquish control of the IRTE.
	 */
	spin_lock_irq(&kvm->irqfds.lock);

	if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
		ret = kvm_pi_update_irte(irqfd, NULL);
		if (ret)
			pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n",
				irqfd->consumer.eventfd, ret);
	}
	irqfd->producer = NULL;

	kvm->arch.nr_possible_bypass_irqs--;

	spin_unlock_irq(&kvm->irqfds.lock);

	kvm_arch_end_assignment(irqfd->kvm);
}

void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
				   struct kvm_kernel_irq_routing_entry *old,
				   struct kvm_kernel_irq_routing_entry *new)
{
	if (new->type != KVM_IRQ_ROUTING_MSI &&
	    old->type != KVM_IRQ_ROUTING_MSI)
		return;

	if (old->type == KVM_IRQ_ROUTING_MSI &&
	    new->type == KVM_IRQ_ROUTING_MSI &&
	    !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
		return;

	kvm_pi_update_irte(irqfd, new);
}

#ifdef CONFIG_KVM_IOAPIC
#define IOAPIC_ROUTING_ENTRY(irq) \
	{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
	  .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)

#define PIC_ROUTING_ENTRY(irq) \
	{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
	  .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
#define ROUTING_ENTRY2(irq) \
	IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)

static const struct kvm_irq_routing_entry default_routing[] = {
	ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
	ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
	ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
	ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
	ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
	ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
	ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
	ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
	ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
	ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
	ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
	ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
};

int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm)
{
	return kvm_set_irq_routing(kvm, default_routing,
				   ARRAY_SIZE(default_routing), 0);
}

int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
	struct kvm_pic *pic = kvm->arch.vpic;
	int r;

	r = 0;
	switch (chip->chip_id) {
	case KVM_IRQCHIP_PIC_MASTER:
		memcpy(&chip->chip.pic, &pic->pics[0],
			sizeof(struct kvm_pic_state));
		break;
	case KVM_IRQCHIP_PIC_SLAVE:
		memcpy(&chip->chip.pic, &pic->pics[1],
			sizeof(struct kvm_pic_state));
		break;
	case KVM_IRQCHIP_IOAPIC:
		kvm_get_ioapic(kvm, &chip->chip.ioapic);
		break;
	default:
		r = -EINVAL;
		break;
	}
	return r;
}

int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
	struct kvm_pic *pic = kvm->arch.vpic;
	int r;

	r = 0;
	switch (chip->chip_id) {
	case KVM_IRQCHIP_PIC_MASTER:
		spin_lock(&pic->lock);
		memcpy(&pic->pics[0], &chip->chip.pic,
			sizeof(struct kvm_pic_state));
		spin_unlock(&pic->lock);
		break;
	case KVM_IRQCHIP_PIC_SLAVE:
		spin_lock(&pic->lock);
		memcpy(&pic->pics[1], &chip->chip.pic,
			sizeof(struct kvm_pic_state));
		spin_unlock(&pic->lock);
		break;
	case KVM_IRQCHIP_IOAPIC:
		kvm_set_ioapic(kvm, &chip->chip.ioapic);
		break;
	default:
		r = -EINVAL;
		break;
	}
	kvm_pic_update_irq(pic);
	return r;
}
#endif
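For readers unfamiliar with the designated-initializer macros above, here is an illustrative expansion of a single ROUTING_ENTRY2() element; it is not part of the patch, and the concrete values follow directly from the SELECT_PIC() definition (GSI 3 maps to the master PIC because 3 < 8):

	/* Illustrative expansion only, not part of the patch: ROUTING_ENTRY2(3) */
	{ .gsi = 3, .type = KVM_IRQ_ROUTING_IRQCHIP,
	  .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = 3 } },
	{ .gsi = 3, .type = KVM_IRQ_ROUTING_IRQCHIP,
	  .u.irqchip = { .irqchip = KVM_IRQCHIP_PIC_MASTER, .pin = 3 } },

GSIs 0-15 therefore get two routing entries each (I/O APIC plus PIC), while GSIs 16-23 get an I/O APIC entry only, matching the default_routing table above.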
@ -18,6 +18,8 @@
#include <kvm/iodev.h>
#include "lapic.h"

#ifdef CONFIG_KVM_IOAPIC

#define PIC_NUM_PINS 16
#define SELECT_PIC(irq) \
	((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)

@ -63,6 +65,34 @@ int kvm_pic_init(struct kvm *kvm);
void kvm_pic_destroy(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);

int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
		    int irq_source_id, int level, bool line_status);

int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm);

int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);

static inline int irqchip_full(struct kvm *kvm)
{
	int mode = kvm->arch.irqchip_mode;

	/* Matches smp_wmb() when setting irqchip_mode */
	smp_rmb();
	return mode == KVM_IRQCHIP_KERNEL;
}
#else /* CONFIG_KVM_IOAPIC */
static __always_inline int irqchip_full(struct kvm *kvm)
{
	return false;
}
#endif

static inline int pic_in_kernel(struct kvm *kvm)
{
	return irqchip_full(kvm);
}

static inline int irqchip_split(struct kvm *kvm)
{

@ -73,20 +103,6 @@ static inline int irqchip_split(struct kvm *kvm)
	return mode == KVM_IRQCHIP_SPLIT;
}

static inline int irqchip_kernel(struct kvm *kvm)
{
	int mode = kvm->arch.irqchip_mode;

	/* Matches smp_wmb() when setting irqchip_mode */
	smp_rmb();
	return mode == KVM_IRQCHIP_KERNEL;
}

static inline int pic_in_kernel(struct kvm *kvm)
{
	return irqchip_kernel(kvm);
}

static inline int irqchip_in_kernel(struct kvm *kvm)
{
	int mode = kvm->arch.irqchip_mode;

@ -105,7 +121,6 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu);

int apic_has_pending_timer(struct kvm_vcpu *vcpu);

int kvm_setup_default_irq_routing(struct kvm *kvm);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
			     struct kvm_lapic_irq *irq,
			     struct dest_map *dest_map);
@ -1,469 +0,0 @@
|
||||||
// SPDX-License-Identifier: GPL-2.0-only
|
|
||||||
/*
|
|
||||||
* irq_comm.c: Common API for in kernel interrupt controller
|
|
||||||
* Copyright (c) 2007, Intel Corporation.
|
|
||||||
*
|
|
||||||
* Authors:
|
|
||||||
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
|
|
||||||
*
|
|
||||||
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
|
|
||||||
*/
|
|
||||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
||||||
|
|
||||||
#include <linux/kvm_host.h>
|
|
||||||
#include <linux/slab.h>
|
|
||||||
#include <linux/export.h>
|
|
||||||
#include <linux/rculist.h>
|
|
||||||
|
|
||||||
#include <trace/events/kvm.h>
|
|
||||||
|
|
||||||
#include "irq.h"
|
|
||||||
|
|
||||||
#include "ioapic.h"
|
|
||||||
|
|
||||||
#include "lapic.h"
|
|
||||||
|
|
||||||
#include "hyperv.h"
|
|
||||||
#include "x86.h"
|
|
||||||
#include "xen.h"
|
|
||||||
|
|
||||||
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
|
|
||||||
struct kvm *kvm, int irq_source_id, int level,
|
|
||||||
bool line_status)
|
|
||||||
{
|
|
||||||
struct kvm_pic *pic = kvm->arch.vpic;
|
|
||||||
return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
|
|
||||||
struct kvm *kvm, int irq_source_id, int level,
|
|
||||||
bool line_status)
|
|
||||||
{
|
|
||||||
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
|
|
||||||
return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
|
|
||||||
line_status);
|
|
||||||
}
|
|
||||||
|
|
||||||
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
|
|
||||||
struct kvm_lapic_irq *irq, struct dest_map *dest_map)
|
|
||||||
{
|
|
||||||
int r = -1;
|
|
||||||
struct kvm_vcpu *vcpu, *lowest = NULL;
|
|
||||||
unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
|
|
||||||
unsigned int dest_vcpus = 0;
|
|
||||||
|
|
||||||
if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
|
|
||||||
return r;
|
|
||||||
|
|
||||||
if (irq->dest_mode == APIC_DEST_PHYSICAL &&
|
|
||||||
irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
|
|
||||||
pr_info("apic: phys broadcast and lowest prio\n");
|
|
||||||
irq->delivery_mode = APIC_DM_FIXED;
|
|
||||||
}
|
|
||||||
|
|
||||||
memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
|
|
||||||
|
|
||||||
kvm_for_each_vcpu(i, vcpu, kvm) {
|
|
||||||
if (!kvm_apic_present(vcpu))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
|
|
||||||
irq->dest_id, irq->dest_mode))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (!kvm_lowest_prio_delivery(irq)) {
|
|
||||||
if (r < 0)
|
|
||||||
r = 0;
|
|
||||||
r += kvm_apic_set_irq(vcpu, irq, dest_map);
|
|
||||||
} else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
|
|
||||||
if (!kvm_vector_hashing_enabled()) {
|
|
||||||
if (!lowest)
|
|
||||||
lowest = vcpu;
|
|
||||||
else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
|
|
||||||
lowest = vcpu;
|
|
||||||
} else {
|
|
||||||
__set_bit(i, dest_vcpu_bitmap);
|
|
||||||
dest_vcpus++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (dest_vcpus != 0) {
|
|
||||||
int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
|
|
||||||
dest_vcpu_bitmap, KVM_MAX_VCPUS);
|
|
||||||
|
|
||||||
lowest = kvm_get_vcpu(kvm, idx);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (lowest)
|
|
||||||
r = kvm_apic_set_irq(lowest, irq, dest_map);
|
|
||||||
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
|
|
||||||
void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
|
|
||||||
struct kvm_lapic_irq *irq)
|
|
||||||
{
|
|
||||||
struct msi_msg msg = { .address_lo = e->msi.address_lo,
|
|
||||||
.address_hi = e->msi.address_hi,
|
|
||||||
.data = e->msi.data };
|
|
||||||
|
|
||||||
trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
|
|
||||||
(u64)msg.address_hi << 32 : 0), msg.data);
|
|
||||||
|
|
||||||
irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
|
|
||||||
irq->vector = msg.arch_data.vector;
|
|
||||||
irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
|
|
||||||
irq->trig_mode = msg.arch_data.is_level;
|
|
||||||
irq->delivery_mode = msg.arch_data.delivery_mode << 8;
|
|
||||||
irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
|
|
||||||
irq->level = 1;
|
|
||||||
irq->shorthand = APIC_DEST_NOSHORT;
|
|
||||||
}
|
|
||||||
EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
|
|
||||||
|
|
||||||
static inline bool kvm_msi_route_invalid(struct kvm *kvm,
|
|
||||||
struct kvm_kernel_irq_routing_entry *e)
|
|
||||||
{
|
|
||||||
return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
|
|
||||||
}
|
|
||||||
|
|
||||||
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
|
|
||||||
struct kvm *kvm, int irq_source_id, int level, bool line_status)
|
|
||||||
{
|
|
||||||
struct kvm_lapic_irq irq;
|
|
||||||
|
|
||||||
if (kvm_msi_route_invalid(kvm, e))
|
|
||||||
return -EINVAL;
|
|
||||||
|
|
||||||
if (!level)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
kvm_set_msi_irq(kvm, e, &irq);
|
|
||||||
|
|
||||||
return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef CONFIG_KVM_HYPERV
|
|
||||||
static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
|
|
||||||
struct kvm *kvm, int irq_source_id, int level,
|
|
||||||
bool line_status)
|
|
||||||
{
|
|
||||||
if (!level)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
|
|
||||||
struct kvm *kvm, int irq_source_id, int level,
|
|
||||||
bool line_status)
|
|
||||||
{
|
|
||||||
struct kvm_lapic_irq irq;
|
|
||||||
int r;
|
|
||||||
|
|
||||||
switch (e->type) {
|
|
||||||
#ifdef CONFIG_KVM_HYPERV
|
|
||||||
case KVM_IRQ_ROUTING_HV_SINT:
|
|
||||||
return kvm_hv_set_sint(e, kvm, irq_source_id, level,
|
|
||||||
line_status);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
case KVM_IRQ_ROUTING_MSI:
|
|
||||||
if (kvm_msi_route_invalid(kvm, e))
|
|
||||||
return -EINVAL;
|
|
||||||
|
|
||||||
kvm_set_msi_irq(kvm, e, &irq);
|
|
||||||
|
|
||||||
if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
|
|
||||||
return r;
|
|
||||||
break;
|
|
||||||
|
|
||||||
#ifdef CONFIG_KVM_XEN
|
|
||||||
case KVM_IRQ_ROUTING_XEN_EVTCHN:
|
|
||||||
if (!level)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
|
|
||||||
#endif
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
return -EWOULDBLOCK;
|
|
||||||
}
|
|
||||||
|
|
||||||
int kvm_request_irq_source_id(struct kvm *kvm)
|
|
||||||
{
|
|
||||||
unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
|
|
||||||
int irq_source_id;
|
|
||||||
|
|
||||||
mutex_lock(&kvm->irq_lock);
|
|
||||||
irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
|
|
||||||
|
|
||||||
if (irq_source_id >= BITS_PER_LONG) {
|
|
||||||
pr_warn("exhausted allocatable IRQ sources!\n");
|
|
||||||
irq_source_id = -EFAULT;
|
|
||||||
goto unlock;
|
|
||||||
}
|
|
||||||
|
|
||||||
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
|
|
||||||
ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
|
|
||||||
set_bit(irq_source_id, bitmap);
|
|
||||||
unlock:
|
|
||||||
mutex_unlock(&kvm->irq_lock);
|
|
||||||
|
|
||||||
return irq_source_id;
|
|
||||||
}
|
|
||||||
|
|
||||||
void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
|
|
||||||
{
|
|
||||||
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
|
|
||||||
ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
|
|
||||||
|
|
||||||
mutex_lock(&kvm->irq_lock);
|
|
||||||
if (irq_source_id < 0 ||
|
|
||||||
irq_source_id >= BITS_PER_LONG) {
|
|
||||||
pr_err("IRQ source ID out of range!\n");
|
|
||||||
goto unlock;
|
|
||||||
}
|
|
||||||
clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
|
|
||||||
if (!irqchip_kernel(kvm))
|
|
||||||
goto unlock;
|
|
||||||
|
|
||||||
kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
|
|
||||||
kvm_pic_clear_all(kvm->arch.vpic, irq_source_id);
|
|
||||||
unlock:
|
|
||||||
mutex_unlock(&kvm->irq_lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
|
|
||||||
struct kvm_irq_mask_notifier *kimn)
|
|
||||||
{
|
|
||||||
mutex_lock(&kvm->irq_lock);
|
|
||||||
kimn->irq = irq;
|
|
||||||
hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
|
|
||||||
mutex_unlock(&kvm->irq_lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
|
|
||||||
struct kvm_irq_mask_notifier *kimn)
|
|
||||||
{
|
|
||||||
mutex_lock(&kvm->irq_lock);
|
|
||||||
hlist_del_rcu(&kimn->link);
|
|
||||||
mutex_unlock(&kvm->irq_lock);
|
|
||||||
synchronize_srcu(&kvm->irq_srcu);
|
|
||||||
}
|
|
||||||
|
|
||||||
void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
|
|
||||||
bool mask)
|
|
||||||
{
|
|
||||||
struct kvm_irq_mask_notifier *kimn;
|
|
||||||
int idx, gsi;
|
|
||||||
|
|
||||||
idx = srcu_read_lock(&kvm->irq_srcu);
|
|
||||||
gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
|
|
||||||
if (gsi != -1)
|
|
||||||
hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link)
|
|
||||||
if (kimn->irq == gsi)
|
|
||||||
kimn->func(kimn, mask);
|
|
||||||
srcu_read_unlock(&kvm->irq_srcu, idx);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
|
|
||||||
{
|
|
||||||
return irqchip_in_kernel(kvm);
|
|
||||||
}
|
|
||||||
|
|
||||||
int kvm_set_routing_entry(struct kvm *kvm,
|
|
||||||
struct kvm_kernel_irq_routing_entry *e,
|
|
||||||
const struct kvm_irq_routing_entry *ue)
|
|
||||||
{
|
|
||||||
/* We can't check irqchip_in_kernel() here as some callers are
|
|
||||||
* currently initializing the irqchip. Other callers should therefore
|
|
||||||
* check kvm_arch_can_set_irq_routing() before calling this function.
|
|
||||||
*/
|
|
||||||
switch (ue->type) {
|
|
||||||
case KVM_IRQ_ROUTING_IRQCHIP:
|
|
||||||
if (irqchip_split(kvm))
|
|
||||||
return -EINVAL;
|
|
||||||
e->irqchip.pin = ue->u.irqchip.pin;
|
|
||||||
switch (ue->u.irqchip.irqchip) {
|
|
||||||
case KVM_IRQCHIP_PIC_SLAVE:
|
|
||||||
e->irqchip.pin += PIC_NUM_PINS / 2;
|
|
||||||
fallthrough;
|
|
||||||
case KVM_IRQCHIP_PIC_MASTER:
|
|
||||||
if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
|
|
||||||
return -EINVAL;
|
|
||||||
e->set = kvm_set_pic_irq;
|
|
||||||
break;
|
|
||||||
case KVM_IRQCHIP_IOAPIC:
|
|
||||||
if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
|
|
||||||
return -EINVAL;
|
|
||||||
e->set = kvm_set_ioapic_irq;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
return -EINVAL;
|
|
||||||
}
|
|
||||||
e->irqchip.irqchip = ue->u.irqchip.irqchip;
|
|
||||||
break;
|
|
||||||
case KVM_IRQ_ROUTING_MSI:
|
|
||||||
e->set = kvm_set_msi;
|
|
||||||
e->msi.address_lo = ue->u.msi.address_lo;
|
|
||||||
e->msi.address_hi = ue->u.msi.address_hi;
|
|
||||||
e->msi.data = ue->u.msi.data;
|
|
||||||
|
|
||||||
if (kvm_msi_route_invalid(kvm, e))
|
|
||||||
return -EINVAL;
|
|
||||||
break;
|
|
||||||
#ifdef CONFIG_KVM_HYPERV
|
|
||||||
case KVM_IRQ_ROUTING_HV_SINT:
|
|
||||||
e->set = kvm_hv_set_sint;
|
|
||||||
e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
|
|
||||||
e->hv_sint.sint = ue->u.hv_sint.sint;
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
#ifdef CONFIG_KVM_XEN
|
|
||||||
case KVM_IRQ_ROUTING_XEN_EVTCHN:
|
|
||||||
return kvm_xen_setup_evtchn(kvm, e, ue);
|
|
||||||
#endif
|
|
||||||
default:
|
|
||||||
return -EINVAL;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
|
|
||||||
struct kvm_vcpu **dest_vcpu)
|
|
||||||
{
|
|
||||||
int r = 0;
|
|
||||||
unsigned long i;
|
|
||||||
struct kvm_vcpu *vcpu;
|
|
||||||
|
|
||||||
if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
|
|
||||||
return true;
|
|
||||||
|
|
||||||
kvm_for_each_vcpu(i, vcpu, kvm) {
|
|
||||||
if (!kvm_apic_present(vcpu))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
|
|
||||||
irq->dest_id, irq->dest_mode))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (++r == 2)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
*dest_vcpu = vcpu;
|
|
||||||
}
|
|
||||||
|
|
||||||
return r == 1;
|
|
||||||
}
|
|
||||||
EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
|
|
||||||
|
|
||||||
#define IOAPIC_ROUTING_ENTRY(irq) \
|
|
||||||
{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
|
|
||||||
.u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
|
|
||||||
#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
|
|
||||||
|
|
||||||
#define PIC_ROUTING_ENTRY(irq) \
|
|
||||||
{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
|
|
||||||
.u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
|
|
||||||
#define ROUTING_ENTRY2(irq) \
|
|
||||||
IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
|
|
||||||
|
|
||||||
static const struct kvm_irq_routing_entry default_routing[] = {
|
|
||||||
ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
|
|
||||||
ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
|
|
||||||
ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
|
|
||||||
ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
|
|
||||||
ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
|
|
||||||
ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
|
|
||||||
ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
|
|
||||||
ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
|
|
||||||
ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
|
|
||||||
ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
|
|
||||||
ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
|
|
||||||
ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
|
|
||||||
};
|
|
||||||
|
|
||||||
int kvm_setup_default_irq_routing(struct kvm *kvm)
|
|
||||||
{
|
|
||||||
return kvm_set_irq_routing(kvm, default_routing,
|
|
||||||
ARRAY_SIZE(default_routing), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
void kvm_arch_post_irq_routing_update(struct kvm *kvm)
|
|
||||||
{
|
|
||||||
if (!irqchip_split(kvm))
|
|
||||||
return;
|
|
||||||
kvm_make_scan_ioapic_request(kvm);
|
|
||||||
}
|
|
||||||
|
|
||||||
void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
|
|
||||||
u8 vector, unsigned long *ioapic_handled_vectors)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* Intercept EOI if the vCPU is the target of the new IRQ routing, or
|
|
||||||
* the vCPU has a pending IRQ from the old routing, i.e. if the vCPU
|
|
||||||
* may receive a level-triggered IRQ in the future, or already received
|
|
||||||
* level-triggered IRQ. The EOI needs to be intercepted and forwarded
|
|
||||||
* to I/O APIC emulation so that the IRQ can be de-asserted.
|
|
||||||
*/
|
|
||||||
if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) {
|
|
||||||
__set_bit(vector, ioapic_handled_vectors);
|
|
||||||
} else if (kvm_apic_pending_eoi(vcpu, vector)) {
|
|
||||||
__set_bit(vector, ioapic_handled_vectors);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Track the highest pending EOI for which the vCPU is NOT the
|
|
||||||
* target in the new routing. Only the EOI for the IRQ that is
|
|
||||||
* in-flight (for the old routing) needs to be intercepted, any
|
|
||||||
* future IRQs that arrive on this vCPU will be coincidental to
|
|
||||||
* the level-triggered routing and don't need to be intercepted.
|
|
||||||
*/
|
|
||||||
if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi)
|
|
||||||
vcpu->arch.highest_stale_pending_ioapic_eoi = vector;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
|
|
||||||
ulong *ioapic_handled_vectors)
|
|
||||||
{
|
|
||||||
struct kvm *kvm = vcpu->kvm;
|
|
||||||
struct kvm_kernel_irq_routing_entry *entry;
|
|
||||||
struct kvm_irq_routing_table *table;
|
|
||||||
u32 i, nr_ioapic_pins;
|
|
||||||
int idx;
|
|
||||||
|
|
||||||
idx = srcu_read_lock(&kvm->irq_srcu);
|
|
||||||
table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
|
|
||||||
nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
|
|
||||||
kvm->arch.nr_reserved_ioapic_pins);
|
|
||||||
for (i = 0; i < nr_ioapic_pins; ++i) {
|
|
||||||
hlist_for_each_entry(entry, &table->map[i], link) {
|
|
||||||
struct kvm_lapic_irq irq;
|
|
||||||
|
|
||||||
if (entry->type != KVM_IRQ_ROUTING_MSI)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
kvm_set_msi_irq(vcpu->kvm, entry, &irq);
|
|
||||||
|
|
||||||
if (!irq.trig_mode)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode,
|
|
||||||
irq.vector, ioapic_handled_vectors);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
srcu_read_unlock(&kvm->irq_srcu, idx);
|
|
||||||
}
|
|
||||||
|
|
||||||
void kvm_arch_irq_routing_update(struct kvm *kvm)
|
|
||||||
{
|
|
||||||
#ifdef CONFIG_KVM_HYPERV
|
|
||||||
kvm_hv_irq_routing_update(kvm);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
@ -1455,7 +1455,7 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)

static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{
	int trigger_mode;
	int __maybe_unused trigger_mode;

	/* Eoi the ioapic only if the ioapic doesn't own the vector. */
	if (!kvm_ioapic_handles_vector(apic, vector))

@ -1476,12 +1476,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
		return;
	}

#ifdef CONFIG_KVM_IOAPIC
	if (apic_test_vector(vector, apic->regs + APIC_TMR))
		trigger_mode = IOAPIC_LEVEL_TRIG;
	else
		trigger_mode = IOAPIC_EDGE_TRIG;

	kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
#endif
}

static int apic_set_eoi(struct kvm_lapic *apic)

@ -3146,8 +3148,11 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
		kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
	}
	kvm_make_request(KVM_REQ_EVENT, vcpu);

#ifdef CONFIG_KVM_IOAPIC
	if (ioapic_in_kernel(vcpu->kvm))
		kvm_rtc_eoi_tracking_restore_one(vcpu);
#endif

	vcpu->arch.apic_arb_prio = 0;
@ -18,6 +18,7 @@
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>
#include <linux/kvm_irqfd.h>

#include <asm/irq_remapping.h>
#include <asm/msr.h>

@ -29,36 +30,39 @@
#include "svm.h"

/*
 * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e the vCPU ID,
 * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
 * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry
 * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
 * if an interrupt can't be delivered, e.g. because the vCPU isn't running.
 * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index
 * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
 * lookup on the index, where as vCPUs whose index doesn't match their ID need
 * to walk the entire xarray of vCPUs in the worst case scenario.
 *
 * For the vCPU ID, use however many bits are currently allowed for the max
 * For the vCPU index, use however many bits are currently allowed for the max
 * guest physical APIC ID (limited by the size of the physical ID table), and
 * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the
 * size of the GATag is defined by hardware (32 bits), but is an opaque value
 * as far as hardware is concerned.
 */
#define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
#define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK

#define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
#define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)

#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
#define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK)

#define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
#define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
	((vcpu_id) & AVIC_VCPU_ID_MASK))
	((vcpu_idx) & AVIC_VCPU_IDX_MASK))
#define AVIC_GATAG(vm_id, vcpu_id) \
#define AVIC_GATAG(vm_id, vcpu_idx) \
({ \
	u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \
	u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \
	\
	WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \
	WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \
	WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \
	ga_tag; \
})

static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);

static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);

@ -75,14 +79,6 @@ static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
bool x2avic_enabled;

/*
 * This is a wrapper of struct amd_iommu_ir_data.
 */
struct amd_svm_iommu_ir {
	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
	void *data;		/* Storing pointer to struct amd_ir_data */
};

static void avic_activate_vmcb(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

@ -147,16 +143,16 @@ int avic_ga_log_notifier(u32 ga_tag)
	struct kvm_svm *kvm_svm;
	struct kvm_vcpu *vcpu = NULL;
	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
	u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
	u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);

	pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
	pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
	trace_kvm_avic_ga_log(vm_id, vcpu_id);
	trace_kvm_avic_ga_log(vm_id, vcpu_idx);

	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
	hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
		if (kvm_svm->avic_vm_id != vm_id)
			continue;
		vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
		vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
		break;
	}
	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
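A rough sketch of the encode/decode pairing the new macros and avic_ga_log_notifier() rely on; it is not part of the patch, and the local variables kvm and vcpu are hypothetical. The GATag programmed into an IRTE carries the VM ID plus the vCPU index, and the GA log handler reverses it, so the final vCPU lookup is O(1) by index rather than an O(n) search by APIC ID:

	/* Sketch only, not part of the patch; 'kvm' and 'vcpu' are hypothetical. */
	u32 tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, vcpu->vcpu_idx);

	/* ...later, when hardware logs an undeliverable interrupt for 'tag'... */
	u32 vm_id = AVIC_GATAG_TO_VMID(tag);	/* used to find the owning VM, as in avic_ga_log_notifier() */
	u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(tag);
	struct kvm_vcpu *target = kvm_get_vcpu(kvm, vcpu_idx);	/* O(1) lookup by index */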
@ -180,10 +176,8 @@ void avic_vm_destroy(struct kvm *kvm)
|
||||||
if (!enable_apicv)
|
if (!enable_apicv)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (kvm_svm->avic_logical_id_table_page)
|
free_page((unsigned long)kvm_svm->avic_logical_id_table);
|
||||||
__free_page(kvm_svm->avic_logical_id_table_page);
|
free_page((unsigned long)kvm_svm->avic_physical_id_table);
|
||||||
if (kvm_svm->avic_physical_id_table_page)
|
|
||||||
__free_page(kvm_svm->avic_physical_id_table_page);
|
|
||||||
|
|
||||||
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
|
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
|
||||||
hash_del(&kvm_svm->hnode);
|
hash_del(&kvm_svm->hnode);
|
||||||
|
|
@ -196,27 +190,19 @@ int avic_vm_init(struct kvm *kvm)
|
||||||
int err = -ENOMEM;
|
int err = -ENOMEM;
|
||||||
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
|
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
|
||||||
struct kvm_svm *k2;
|
struct kvm_svm *k2;
|
||||||
struct page *p_page;
|
|
||||||
struct page *l_page;
|
|
||||||
u32 vm_id;
|
u32 vm_id;
|
||||||
|
|
||||||
if (!enable_apicv)
|
if (!enable_apicv)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
/* Allocating physical APIC ID table (4KB) */
|
kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
|
||||||
p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
|
if (!kvm_svm->avic_physical_id_table)
|
||||||
if (!p_page)
|
|
||||||
goto free_avic;
|
goto free_avic;
|
||||||
|
|
||||||
kvm_svm->avic_physical_id_table_page = p_page;
|
kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
|
||||||
|
if (!kvm_svm->avic_logical_id_table)
|
||||||
/* Allocating logical APIC ID table (4KB) */
|
|
||||||
l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
|
|
||||||
if (!l_page)
|
|
||||||
goto free_avic;
|
goto free_avic;
|
||||||
|
|
||||||
kvm_svm->avic_logical_id_table_page = l_page;
|
|
||||||
|
|
||||||
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
|
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
|
||||||
again:
|
again:
|
||||||
vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
|
vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
|
||||||
|
|
@ -242,17 +228,19 @@ free_avic:
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
|
||||||
|
{
|
||||||
|
return __sme_set(__pa(svm->vcpu.arch.apic->regs));
|
||||||
|
}
|
||||||
|
|
||||||
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
|
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
|
||||||
{
|
{
|
||||||
struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
|
struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
|
||||||
phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
|
|
||||||
phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
|
|
||||||
phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
|
|
||||||
|
|
||||||
vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
|
vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
|
||||||
vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
|
vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
|
||||||
vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
|
vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
|
||||||
vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
|
vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;
|
||||||
|
|
||||||
if (kvm_apicv_activated(svm->vcpu.kvm))
|
if (kvm_apicv_activated(svm->vcpu.kvm))
|
||||||
avic_activate_vmcb(svm);
|
avic_activate_vmcb(svm);
|
||||||
|
|
@ -260,32 +248,31 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
|
||||||
avic_deactivate_vmcb(svm);
|
avic_deactivate_vmcb(svm);
|
||||||
}
|
}
|
||||||
|
|
||||||
static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
|
|
||||||
unsigned int index)
|
|
||||||
{
|
|
||||||
u64 *avic_physical_id_table;
|
|
||||||
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
|
|
||||||
|
|
||||||
if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
|
|
||||||
(index > X2AVIC_MAX_PHYSICAL_ID))
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
|
|
||||||
|
|
||||||
return &avic_physical_id_table[index];
|
|
||||||
}
|
|
||||||
|
|
||||||
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
|
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
|
||||||
{
|
{
|
||||||
u64 *entry, new_entry;
|
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
|
||||||
int id = vcpu->vcpu_id;
|
|
||||||
struct vcpu_svm *svm = to_svm(vcpu);
|
struct vcpu_svm *svm = to_svm(vcpu);
|
||||||
|
u32 id = vcpu->vcpu_id;
|
||||||
|
u64 new_entry;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
|
||||||
|
* hardware. Immediately clear apicv_active, i.e. don't wait until the
|
||||||
|
* KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
|
||||||
|
* avic_vcpu_load() expects to be called if and only if the vCPU has
|
||||||
|
* fully initialized AVIC.
|
||||||
|
*/
|
||||||
if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
|
if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
|
||||||
(id > X2AVIC_MAX_PHYSICAL_ID))
|
(id > X2AVIC_MAX_PHYSICAL_ID)) {
|
||||||
return -EINVAL;
|
kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
|
||||||
|
vcpu->arch.apic->apicv_active = false;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
if (!vcpu->arch.apic->regs)
|
BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
|
||||||
|
(X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);
|
||||||
|
|
||||||
|
if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
|
|
||||||
if (kvm_apicv_activated(vcpu->kvm)) {
|
if (kvm_apicv_activated(vcpu->kvm)) {
|
||||||
|
|
@ -302,19 +289,21 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
|
/* Note, fls64() returns the bit position, +1. */
|
||||||
|
BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
|
||||||
|
fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));
|
||||||
|
|
||||||
/* Setting AVIC backing page address in the phy APIC ID table */
|
/* Setting AVIC backing page address in the phy APIC ID table */
|
||||||
entry = avic_get_physical_id_entry(vcpu, id);
|
new_entry = avic_get_backing_page_address(svm) |
|
||||||
if (!entry)
|
AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
|
||||||
return -EINVAL;
|
svm->avic_physical_id_entry = new_entry;
|
||||||
|
|
||||||
new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
|
/*
|
||||||
AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
|
* Initialize the real table, as vCPUs must have a valid entry in order
|
||||||
AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
|
* for broadcast IPIs to function correctly (broadcast IPIs ignore
|
||||||
WRITE_ONCE(*entry, new_entry);
|
* invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
|
||||||
|
*/
|
||||||
svm->avic_physical_id_cache = entry;
|
WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
@ -448,7 +437,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
|
||||||
if (apic_x2apic_mode(source))
|
if (apic_x2apic_mode(source))
|
||||||
avic_logical_id_table = NULL;
|
avic_logical_id_table = NULL;
|
||||||
else
|
else
|
||||||
avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
|
avic_logical_id_table = kvm_svm->avic_logical_id_table;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
|
* AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
|
||||||
|
|
@ -550,7 +539,6 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
|
||||||
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
|
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
|
||||||
{
|
{
|
||||||
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
|
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
|
||||||
u32 *logical_apic_id_table;
|
|
||||||
u32 cluster, index;
|
u32 cluster, index;
|
||||||
|
|
||||||
ldr = GET_APIC_LOGICAL_ID(ldr);
|
ldr = GET_APIC_LOGICAL_ID(ldr);
|
||||||
|
|
@ -571,9 +559,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
|
||||||
return NULL;
|
return NULL;
|
||||||
index += (cluster << 2);
|
index += (cluster << 2);
|
||||||
|
|
||||||
logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
|
return &kvm_svm->avic_logical_id_table[index];
|
||||||
|
|
||||||
return &logical_apic_id_table[index];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
|
static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
|
||||||
|
|
@ -722,6 +708,9 @@ int avic_init_vcpu(struct vcpu_svm *svm)
|
||||||
int ret;
|
int ret;
|
||||||
struct kvm_vcpu *vcpu = &svm->vcpu;
|
struct kvm_vcpu *vcpu = &svm->vcpu;
|
||||||
|
|
||||||
|
INIT_LIST_HEAD(&svm->ir_list);
|
||||||
|
spin_lock_init(&svm->ir_list_lock);
|
||||||
|
|
||||||
if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
|
if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
|
@ -729,8 +718,6 @@ int avic_init_vcpu(struct vcpu_svm *svm)
|
||||||
if (ret)
|
if (ret)
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
INIT_LIST_HEAD(&svm->ir_list);
|
|
||||||
spin_lock_init(&svm->ir_list_lock);
|
|
||||||
svm->dfr_reg = APIC_DFR_FLAT;
|
svm->dfr_reg = APIC_DFR_FLAT;
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
|
|
@ -742,316 +729,161 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
|
||||||
avic_handle_ldr_update(vcpu);
|
avic_handle_ldr_update(vcpu);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
|
static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
|
||||||
{
|
{
|
||||||
int ret = 0;
|
struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
struct amd_svm_iommu_ir *ir;
|
|
||||||
struct vcpu_svm *svm = to_svm(vcpu);
|
|
||||||
|
|
||||||
if (!kvm_arch_has_assigned_device(vcpu->kvm))
|
if (!vcpu)
|
||||||
return 0;
|
return;
|
||||||
|
|
||||||
|
spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
|
||||||
|
list_del(&irqfd->vcpu_list);
|
||||||
|
spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
|
||||||
|
unsigned int host_irq, uint32_t guest_irq,
|
||||||
|
struct kvm_vcpu *vcpu, u32 vector)
|
||||||
|
{
|
||||||
/*
|
/*
|
||||||
* Here, we go through the per-vcpu ir_list to update all existing
|
* If the IRQ was affined to a different vCPU, remove the IRTE metadata
|
||||||
* interrupt remapping table entry targeting this vcpu.
|
* from the *previous* vCPU's list.
|
||||||
*/
|
*/
|
||||||
spin_lock_irqsave(&svm->ir_list_lock, flags);
|
svm_ir_list_del(irqfd);
|
||||||
|
|
||||||
if (list_empty(&svm->ir_list))
|
if (vcpu) {
|
||||||
goto out;
|
/*
|
||||||
|
* Try to enable guest_mode in IRTE, unless AVIC is inhibited,
|
||||||
|
* in which case configure the IRTE for legacy mode, but track
|
||||||
|
* the IRTE metadata so that it can be converted to guest mode
|
||||||
|
* if AVIC is enabled/uninhibited in the future.
|
||||||
|
*/
|
||||||
|
struct amd_iommu_pi_data pi_data = {
|
||||||
|
.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
|
||||||
|
vcpu->vcpu_idx),
|
||||||
|
.is_guest_mode = kvm_vcpu_apicv_active(vcpu),
|
||||||
|
.vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
|
||||||
|
.vector = vector,
|
||||||
|
};
|
||||||
|
struct vcpu_svm *svm = to_svm(vcpu);
|
||||||
|
u64 entry;
|
||||||
|
int ret;
|
||||||
|
|
||||||
list_for_each_entry(ir, &svm->ir_list, node) {
|
/*
|
||||||
if (activate)
|
* Prevent the vCPU from being scheduled out or migrated until
|
||||||
ret = amd_iommu_activate_guest_mode(ir->data);
|
* the IRTE is updated and its metadata has been added to the
|
||||||
else
|
* list of IRQs being posted to the vCPU, to ensure the IRTE
|
||||||
ret = amd_iommu_deactivate_guest_mode(ir->data);
|
* isn't programmed with stale pCPU/IsRunning information.
|
||||||
|
*/
|
||||||
|
guard(spinlock_irqsave)(&svm->ir_list_lock);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Update the target pCPU for IOMMU doorbells if the vCPU is
|
||||||
|
* running. If the vCPU is NOT running, i.e. is blocking or
|
||||||
|
* scheduled out, KVM will update the pCPU info when the vCPU
|
||||||
|
* is awakened and/or scheduled in. See also avic_vcpu_load().
|
||||||
|
*/
|
||||||
|
entry = svm->avic_physical_id_entry;
|
||||||
|
if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
|
||||||
|
pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
|
||||||
|
} else {
|
||||||
|
pi_data.cpu = -1;
|
||||||
|
pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = irq_set_vcpu_affinity(host_irq, &pi_data);
|
||||||
if (ret)
|
if (ret)
|
||||||
break;
|
return ret;
|
||||||
}
|
|
||||||
out:
|
|
||||||
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
|
/*
|
||||||
{
|
* Revert to legacy mode if the IOMMU didn't provide metadata
|
||||||
unsigned long flags;
|
* for the IRTE, which KVM needs to keep the IRTE up-to-date,
|
||||||
struct amd_svm_iommu_ir *cur;
|
* e.g. if the vCPU is migrated or AVIC is disabled.
|
||||||
|
*/
|
||||||
spin_lock_irqsave(&svm->ir_list_lock, flags);
|
if (WARN_ON_ONCE(!pi_data.ir_data)) {
|
||||||
list_for_each_entry(cur, &svm->ir_list, node) {
|
irq_set_vcpu_affinity(host_irq, NULL);
|
||||||
if (cur->data != pi->ir_data)
|
return -EIO;
|
||||||
continue;
|
|
||||||
list_del(&cur->node);
|
|
||||||
kfree(cur);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
|
|
||||||
{
|
|
||||||
int ret = 0;
|
|
||||||
unsigned long flags;
|
|
||||||
struct amd_svm_iommu_ir *ir;
|
|
||||||
u64 entry;
|
|
||||||
|
|
||||||
if (WARN_ON_ONCE(!pi->ir_data))
|
|
||||||
return -EINVAL;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* In some cases, the existing irte is updated and re-set,
|
|
||||||
* so we need to check here if it's already been * added
|
|
||||||
* to the ir_list.
|
|
||||||
*/
|
|
||||||
if (pi->prev_ga_tag) {
|
|
||||||
struct kvm *kvm = svm->vcpu.kvm;
|
|
||||||
u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
|
|
||||||
struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
|
|
||||||
struct vcpu_svm *prev_svm;
|
|
||||||
|
|
||||||
if (!prev_vcpu) {
|
|
||||||
ret = -EINVAL;
|
|
||||||
goto out;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
prev_svm = to_svm(prev_vcpu);
|
irqfd->irq_bypass_data = pi_data.ir_data;
|
||||||
svm_ir_list_del(prev_svm, pi);
|
list_add(&irqfd->vcpu_list, &svm->ir_list);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
return irq_set_vcpu_affinity(host_irq, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
enum avic_vcpu_action {
|
||||||
* Allocating new amd_iommu_pi_data, which will get
|
/*
|
||||||
* add to the per-vcpu ir_list.
|
* There is no need to differentiate between activate and deactivate,
|
||||||
|
* as KVM only refreshes AVIC state when the vCPU is scheduled in and
|
||||||
|
* isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
|
||||||
|
* being (de)activated.
|
||||||
*/
|
*/
|
||||||
ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT);
|
AVIC_TOGGLE_ON_OFF = BIT(0),
|
||||||
if (!ir) {
|
AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF,
|
||||||
ret = -ENOMEM;
|
AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF,
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
ir->data = pi->ir_data;
|
|
||||||
|
|
||||||
spin_lock_irqsave(&svm->ir_list_lock, flags);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Update the target pCPU for IOMMU doorbells if the vCPU is running.
|
* No unique action is required to deal with a vCPU that stops/starts
|
||||||
* If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM
|
* running. A vCPU that starts running by definition stops blocking as
|
||||||
* will update the pCPU info when the vCPU awkened and/or scheduled in.
|
* well, and a vCPU that stops running can't have been blocking, i.e.
|
||||||
* See also avic_vcpu_load().
|
* doesn't need to toggle GALogIntr.
|
||||||
*/
|
*/
|
||||||
entry = READ_ONCE(*(svm->avic_physical_id_cache));
|
AVIC_START_RUNNING = 0,
|
||||||
if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
|
AVIC_STOP_RUNNING = 0,
|
||||||
amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
|
|
||||||
true, pi->ir_data);
|
|
||||||
|
|
||||||
list_add(&ir->node, &svm->ir_list);
|
/*
|
||||||
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
|
* When a vCPU starts blocking, KVM needs to set the GALogIntr flag
|
||||||
out:
|
* int all associated IRTEs so that KVM can wake the vCPU if an IRQ is
|
||||||
return ret;
|
* sent to the vCPU.
|
||||||
}
|
*/
|
||||||
|
AVIC_START_BLOCKING = BIT(1),
|
||||||
|
};
|
||||||
|
|
||||||
/*
|
static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
|
||||||
* Note:
|
enum avic_vcpu_action action)
|
||||||
* The HW cannot support posting multicast/broadcast
|
|
||||||
* interrupts to a vCPU. So, we still use legacy interrupt
|
|
||||||
* remapping for these kind of interrupts.
|
|
||||||
*
|
|
||||||
* For lowest-priority interrupts, we only support
|
|
||||||
* those with single CPU as the destination, e.g. user
|
|
||||||
* configures the interrupts via /proc/irq or uses
|
|
||||||
* irqbalance to make the interrupts single-CPU.
|
|
||||||
*/
|
|
||||||
static int
|
|
||||||
get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
|
|
||||||
struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
|
|
||||||
{
|
{
|
||||||
struct kvm_lapic_irq irq;
|
bool ga_log_intr = (action & AVIC_START_BLOCKING);
|
||||||
struct kvm_vcpu *vcpu = NULL;
|
|
||||||
|
|
||||||
kvm_set_msi_irq(kvm, e, &irq);
|
|
||||||
|
|
||||||
if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
|
|
||||||
!kvm_irq_is_postable(&irq)) {
|
|
||||||
pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
|
|
||||||
__func__, irq.vector);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
|
|
||||||
irq.vector);
|
|
||||||
*svm = to_svm(vcpu);
|
|
||||||
vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
|
|
||||||
vcpu_info->vector = irq.vector;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* avic_pi_update_irte - set IRTE for Posted-Interrupts
|
|
||||||
*
|
|
||||||
* @kvm: kvm
|
|
||||||
* @host_irq: host irq of the interrupt
|
|
||||||
* @guest_irq: gsi of the interrupt
|
|
||||||
* @set: set or unset PI
|
|
||||||
* returns 0 on success, < 0 on failure
|
|
||||||
*/
|
|
||||||
int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
|
|
||||||
uint32_t guest_irq, bool set)
|
|
||||||
{
|
|
||||||
struct kvm_kernel_irq_routing_entry *e;
|
|
||||||
struct kvm_irq_routing_table *irq_rt;
|
|
||||||
bool enable_remapped_mode = true;
|
|
||||||
int idx, ret = 0;
|
|
||||||
|
|
||||||
if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
|
|
||||||
__func__, host_irq, guest_irq, set);
|
|
||||||
|
|
||||||
idx = srcu_read_lock(&kvm->irq_srcu);
|
|
||||||
irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
|
|
||||||
|
|
||||||
if (guest_irq >= irq_rt->nr_rt_entries ||
|
|
||||||
hlist_empty(&irq_rt->map[guest_irq])) {
|
|
||||||
pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
|
|
||||||
guest_irq, irq_rt->nr_rt_entries);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
|
|
||||||
struct vcpu_data vcpu_info;
|
|
||||||
struct vcpu_svm *svm = NULL;
|
|
||||||
|
|
||||||
if (e->type != KVM_IRQ_ROUTING_MSI)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Here, we setup with legacy mode in the following cases:
|
|
||||||
* 1. When cannot target interrupt to a specific vcpu.
|
|
||||||
* 2. Unsetting posted interrupt.
|
|
-	 *	3. APIC virtualization is disabled for the vcpu.
-	 *	4. IRQ has incompatible delivery mode (SMI, INIT, etc)
-	 */
-	if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
-	    kvm_vcpu_apicv_active(&svm->vcpu)) {
-		struct amd_iommu_pi_data pi;
-
-		enable_remapped_mode = false;
-
-		/* Try to enable guest_mode in IRTE */
-		pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
-				    AVIC_HPA_MASK);
-		pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
-				       svm->vcpu.vcpu_id);
-		pi.is_guest_mode = true;
-		pi.vcpu_data = &vcpu_info;
-		ret = irq_set_vcpu_affinity(host_irq, &pi);
-
-		/**
-		 * Here, we successfully setting up vcpu affinity in
-		 * IOMMU guest mode. Now, we need to store the posted
-		 * interrupt information in a per-vcpu ir_list so that
-		 * we can reference to them directly when we update vcpu
-		 * scheduling information in IOMMU irte.
-		 */
-		if (!ret && pi.is_guest_mode)
-			svm_ir_list_add(svm, &pi);
-	}
-
-	if (!ret && svm) {
-		trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
-					 e->gsi, vcpu_info.vector,
-					 vcpu_info.pi_desc_addr, set);
-	}
-
-	if (ret < 0) {
-		pr_err("%s: failed to update PI IRTE\n", __func__);
-		goto out;
-	}
-	}
-
-	ret = 0;
-	if (enable_remapped_mode) {
-		/* Use legacy mode in IRTE */
-		struct amd_iommu_pi_data pi;
-
-		/**
-		 * Here, pi is used to:
-		 * - Tell IOMMU to use legacy mode for this interrupt.
-		 * - Retrieve ga_tag of prior interrupt remapping data.
-		 */
-		pi.prev_ga_tag = 0;
-		pi.is_guest_mode = false;
-		ret = irq_set_vcpu_affinity(host_irq, &pi);
-
-		/**
-		 * Check if the posted interrupt was previously
-		 * setup with the guest_mode by checking if the ga_tag
-		 * was cached. If so, we need to clean up the per-vcpu
-		 * ir_list.
-		 */
-		if (!ret && pi.prev_ga_tag) {
-			int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
-			struct kvm_vcpu *vcpu;
-
-			vcpu = kvm_get_vcpu_by_id(kvm, id);
-			if (vcpu)
-				svm_ir_list_del(to_svm(vcpu), &pi);
-		}
-	}
-out:
-	srcu_read_unlock(&kvm->irq_srcu, idx);
-	return ret;
-}

-static inline int
-avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
-{
-	int ret = 0;
-	struct amd_svm_iommu_ir *ir;
 	struct vcpu_svm *svm = to_svm(vcpu);
+	struct kvm_kernel_irqfd *irqfd;

 	lockdep_assert_held(&svm->ir_list_lock);

-	if (!kvm_arch_has_assigned_device(vcpu->kvm))
-		return 0;
-
 	/*
 	 * Here, we go through the per-vcpu ir_list to update all existing
 	 * interrupt remapping table entry targeting this vcpu.
 	 */
 	if (list_empty(&svm->ir_list))
-		return 0;
+		return;

-	list_for_each_entry(ir, &svm->ir_list, node) {
-		ret = amd_iommu_update_ga(cpu, r, ir->data);
-		if (ret)
-			return ret;
+	list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
+		void *data = irqfd->irq_bypass_data;
+
+		if (!(action & AVIC_TOGGLE_ON_OFF))
+			WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
+		else if (cpu >= 0)
+			WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
+		else
+			WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
 	}
-	return 0;
 }

-void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
+			     enum avic_vcpu_action action)
 {
-	u64 entry;
+	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
 	int h_physical_id = kvm_cpu_get_apicid(cpu);
 	struct vcpu_svm *svm = to_svm(vcpu);
 	unsigned long flags;
+	u64 entry;

 	lockdep_assert_preemption_disabled();

 	if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
 		return;

-	/*
-	 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
-	 * is being scheduled in after being preempted.  The CPU entries in the
-	 * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
-	 * If the vCPU was migrated, its new CPU value will be stuffed when the
-	 * vCPU unblocks.
-	 */
-	if (kvm_vcpu_is_blocking(vcpu))
+	if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
 		return;

 	/*
@@ -1063,38 +895,57 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	 */
 	spin_lock_irqsave(&svm->ir_list_lock, flags);

-	entry = READ_ONCE(*(svm->avic_physical_id_cache));
+	entry = svm->avic_physical_id_entry;
 	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);

-	entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+	entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
+		   AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
 	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
 	entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;

-	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
-	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
+	svm->avic_physical_id_entry = entry;
+
+	/*
+	 * If IPI virtualization is disabled, clear IsRunning when updating the
+	 * actual Physical ID table, so that the CPU never sees IsRunning=1.
+	 * Keep the APIC ID up-to-date in the entry to minimize the chances of
+	 * things going sideways if hardware peeks at the ID.
+	 */
+	if (!enable_ipiv)
+		entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+	WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);

 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
 }

-void avic_vcpu_put(struct kvm_vcpu *vcpu)
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	u64 entry;
+	/*
+	 * No need to update anything if the vCPU is blocking, i.e. if the vCPU
+	 * is being scheduled in after being preempted.  The CPU entries in the
+	 * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
+	 * If the vCPU was migrated, its new CPU value will be stuffed when the
+	 * vCPU unblocks.
+	 */
+	if (kvm_vcpu_is_blocking(vcpu))
+		return;
+
+	__avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
+}
+
+static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
+{
+	struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
 	struct vcpu_svm *svm = to_svm(vcpu);
 	unsigned long flags;
+	u64 entry = svm->avic_physical_id_entry;

 	lockdep_assert_preemption_disabled();

-	/*
-	 * Note, reading the Physical ID entry outside of ir_list_lock is safe
-	 * as only the pCPU that has loaded (or is loading) the vCPU is allowed
-	 * to modify the entry, and preemption is disabled.  I.e. the vCPU
-	 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
-	 * recursively.
-	 */
-	entry = READ_ONCE(*(svm->avic_physical_id_cache));
-
-	/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
-	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
+	if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
 		return;

 	/*
@@ -1107,13 +958,62 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
 	 */
 	spin_lock_irqsave(&svm->ir_list_lock, flags);

-	avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+	avic_update_iommu_vcpu_affinity(vcpu, -1, action);

+	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
+
+	/*
+	 * Keep the previous APIC ID in the entry so that a rogue doorbell from
+	 * hardware is at least restricted to a CPU associated with the vCPU.
+	 */
 	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+
+	if (enable_ipiv)
+		WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+	/*
+	 * Note!  Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
+	 * it's a synthetic flag that usurps an unused should-be-zero bit.
+	 */
+	if (action & AVIC_START_BLOCKING)
+		entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
+
+	svm->avic_physical_id_entry = entry;
+
 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Note, reading the Physical ID entry outside of ir_list_lock is safe
+	 * as only the pCPU that has loaded (or is loading) the vCPU is allowed
+	 * to modify the entry, and preemption is disabled.  I.e. the vCPU
+	 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
+	 * recursively.
+	 */
+	u64 entry = to_svm(vcpu)->avic_physical_id_entry;
+
+	/*
+	 * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
+	 * vCPU is preempted while its in the process of blocking.  WARN if the
+	 * vCPU wasn't running and isn't blocking, KVM shouldn't attempt to put
+	 * the AVIC if it wasn't previously loaded.
+	 */
+	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
+		if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
+			return;
+
+		/*
+		 * The vCPU was preempted while blocking, ensure its IRTEs are
+		 * configured to generate GA Log Interrupts.
+		 */
+		if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
+			return;
+	}
+
+	__avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
+							   AVIC_STOP_RUNNING);
 }

 void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)

@@ -1142,19 +1042,18 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)

 void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
 {
-	bool activated = kvm_vcpu_apicv_active(vcpu);
-
 	if (!enable_apicv)
 		return;

+	/* APICv should only be toggled on/off while the vCPU is running. */
+	WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));
+
 	avic_refresh_virtual_apic_mode(vcpu);

-	if (activated)
-		avic_vcpu_load(vcpu, vcpu->cpu);
+	if (kvm_vcpu_apicv_active(vcpu))
+		__avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
 	else
-		avic_vcpu_put(vcpu);
-
-	avic_set_pi_irte_mode(vcpu, activated);
+		__avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
 }

 void avic_vcpu_blocking(struct kvm_vcpu *vcpu)

@@ -1162,20 +1061,25 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
 	if (!kvm_vcpu_apicv_active(vcpu))
 		return;

 	/*
-	 * Unload the AVIC when the vCPU is about to block, _before_
-	 * the vCPU actually blocks.
+	 * Unload the AVIC when the vCPU is about to block, _before_ the vCPU
+	 * actually blocks.
 	 *
-	 * Any IRQs that arrive before IsRunning=0 will not cause an
-	 * incomplete IPI vmexit on the source, therefore vIRR will also
-	 * be checked by kvm_vcpu_check_block() before blocking.  The
-	 * memory barrier implicit in set_current_state orders writing
-	 * IsRunning=0 before reading the vIRR.  The processor needs a
-	 * matching memory barrier on interrupt delivery between writing
-	 * IRR and reading IsRunning; the lack of this barrier might be
-	 * the cause of errata #1235).
+	 * Note, any IRQs that arrive before IsRunning=0 will not cause an
+	 * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
+	 * this by checking vIRR one last time before blocking.  The memory
+	 * barrier implicit in set_current_state orders writing IsRunning=0
+	 * before reading the vIRR.  The processor needs a matching memory
+	 * barrier on interrupt delivery between writing IRR and reading
+	 * IsRunning; the lack of this barrier might be the cause of errata #1235).
+	 *
+	 * Clear IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM
+	 * doesn't need to detect events for scheduling purposes.  The doorbell
+	 * used to signal running vCPUs cannot be blocked, i.e. will perturb the
+	 * CPU and cause noisy neighbor problems if the VM is sending interrupts
+	 * to the vCPU while it's scheduled out.
 	 */
-	avic_vcpu_put(vcpu);
+	__avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
 }

 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)

@@ -1228,6 +1132,14 @@ bool avic_hardware_setup(void)
 	if (x2avic_enabled)
 		pr_info("x2AVIC enabled\n");

+	/*
+	 * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
+	 * due to erratum 1235, which results in missed VM-Exits on the sender
+	 * and thus missed wake events for blocking vCPUs due to the CPU
+	 * failing to see a software update to clear IsRunning.
+	 */
+	enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;
+
 	amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);

 	return true;
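As an aside, here is a minimal sketch of how the Physical APIC ID table entry bits manipulated in the AVIC hunks above fit together. The helper name is hypothetical and is not part of the patch; the masks are the kernel's existing AVIC definitions used verbatim above.

/* Illustrative sketch only: pack a host APIC ID and IsRunning into an entry. */
static inline u64 example_avic_physid_entry(u64 entry, int h_physical_id, bool is_running)
{
	entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
	entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
	if (is_running)
		entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
	else
		entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
	return entry;
}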
@@ -232,6 +232,7 @@ module_param(tsc_scaling, int, 0444);
  */
 static bool avic;
 module_param(avic, bool, 0444);
+module_param(enable_ipiv, bool, 0444);

 module_param(enable_device_posted_irqs, bool, 0444);

@@ -1490,6 +1491,8 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);

+	WARN_ON_ONCE(!list_empty(&svm->ir_list));
+
 	svm_leave_nested(vcpu);
 	svm_free_nested(svm);

@@ -5581,6 +5584,7 @@ static __init int svm_hardware_setup(void)
 	enable_apicv = avic = avic && avic_hardware_setup();

 	if (!enable_apicv) {
+		enable_ipiv = false;
 		svm_x86_ops.vcpu_blocking = NULL;
 		svm_x86_ops.vcpu_unblocking = NULL;
 		svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;

@@ -123,8 +123,8 @@ struct kvm_svm {

 	/* Struct members for AVIC */
 	u32 avic_vm_id;
-	struct page *avic_logical_id_table_page;
-	struct page *avic_physical_id_table_page;
+	u32 *avic_logical_id_table;
+	u64 *avic_physical_id_table;
 	struct hlist_node hnode;

 	struct kvm_sev_info sev_info;

@@ -306,14 +306,22 @@ struct vcpu_svm {

 	u32 ldr_reg;
 	u32 dfr_reg;
-	struct page *avic_backing_page;
-	u64 *avic_physical_id_cache;
+
+	/* This is essentially a shadow of the vCPU's actual entry in the
+	 * Physical ID table that is programmed into the VMCB, i.e. that is
+	 * seen by the CPU.  If IPI virtualization is disabled, IsRunning is
+	 * only ever set in the shadow, i.e. is never propagated to the "real"
+	 * table, so that hardware never sees IsRunning=1.
+	 */
+	u64 avic_physical_id_entry;

 	/*
-	 * Per-vcpu list of struct amd_svm_iommu_ir:
-	 * This is used mainly to store interrupt remapping information used
-	 * when update the vcpu affinity. This avoids the need to scan for
-	 * IRTE and try to match ga_tag in the IOMMU driver.
+	 * Per-vCPU list of irqfds that are eligible to post IRQs directly to
+	 * the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass).  The list
+	 * is used to reconfigure IRTEs when the vCPU is loaded/put (to set the
+	 * target pCPU), when AVIC is toggled on/off (to (de)activate bypass),
+	 * and if the irqfd becomes ineligible for posting (to put the IRTE
+	 * back into remapped mode).
 	 */
 	struct list_head ir_list;
 	spinlock_t ir_list_lock;

@@ -721,7 +729,8 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
 	BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) |	\
 	BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |	\
 	BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) |	\
-	BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED)	\
+	BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) |	\
+	BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG)	\
 )

 bool avic_hardware_setup(void);

@@ -736,8 +745,9 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 void avic_vcpu_put(struct kvm_vcpu *vcpu);
 void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
 void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
-int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
-			uint32_t guest_irq, bool set);
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+			unsigned int host_irq, uint32_t guest_irq,
+			struct kvm_vcpu *vcpu, u32 vector);
 void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
 void avic_ring_doorbell(struct kvm_vcpu *vcpu);

@@ -260,6 +260,86 @@ TRACE_EVENT(kvm_cpuid,
 		  __entry->used_max_basic ? ", used max basic" : "")
 );

+#define kvm_deliver_mode	\
+	{0x0, "Fixed"},		\
+	{0x1, "LowPrio"},	\
+	{0x2, "SMI"},		\
+	{0x3, "Res3"},		\
+	{0x4, "NMI"},		\
+	{0x5, "INIT"},		\
+	{0x6, "SIPI"},		\
+	{0x7, "ExtINT"}
+
+#ifdef CONFIG_KVM_IOAPIC
+TRACE_EVENT(kvm_ioapic_set_irq,
+	    TP_PROTO(__u64 e, int pin, bool coalesced),
+	    TP_ARGS(e, pin, coalesced),
+
+	TP_STRUCT__entry(
+		__field(	__u64,		e		)
+		__field(	int,		pin		)
+		__field(	bool,		coalesced	)
+	),
+
+	TP_fast_assign(
+		__entry->e		= e;
+		__entry->pin		= pin;
+		__entry->coalesced	= coalesced;
+	),
+
+	TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
+		  __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
+		  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+		  (__entry->e & (1<<11)) ? "logical" : "physical",
+		  (__entry->e & (1<<15)) ? "level" : "edge",
+		  (__entry->e & (1<<16)) ? "|masked" : "",
+		  __entry->coalesced ? " (coalesced)" : "")
+);
+
+TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
+	    TP_PROTO(__u64 e),
+	    TP_ARGS(e),
+
+	TP_STRUCT__entry(
+		__field(	__u64,		e	)
+	),
+
+	TP_fast_assign(
+		__entry->e	= e;
+	),
+
+	TP_printk("dst %x vec %u (%s|%s|%s%s)",
+		  (u8)(__entry->e >> 56), (u8)__entry->e,
+		  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+		  (__entry->e & (1<<11)) ? "logical" : "physical",
+		  (__entry->e & (1<<15)) ? "level" : "edge",
+		  (__entry->e & (1<<16)) ? "|masked" : "")
+);
+#endif
+
+TRACE_EVENT(kvm_msi_set_irq,
+	    TP_PROTO(__u64 address, __u64 data),
+	    TP_ARGS(address, data),
+
+	TP_STRUCT__entry(
+		__field(	__u64,		address	)
+		__field(	__u64,		data	)
+	),
+
+	TP_fast_assign(
+		__entry->address	= address;
+		__entry->data		= data;
+	),
+
+	TP_printk("dst %llx vec %u (%s|%s|%s%s)",
+		  (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
+		  (u8)__entry->data,
+		  __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
+		  (__entry->address & (1<<2)) ? "logical" : "physical",
+		  (__entry->data & (1<<15)) ? "level" : "edge",
+		  (__entry->address & (1<<3)) ? "|rh" : "")
+);
+
 #define AREG(x) { APIC_##x, "APIC_" #x }

 #define kvm_trace_symbol_apic			\

@@ -1096,37 +1176,32 @@ TRACE_EVENT(kvm_smm_transition,
  * Tracepoint for VT-d posted-interrupts and AMD-Vi Guest Virtual APIC.
  */
 TRACE_EVENT(kvm_pi_irte_update,
-	TP_PROTO(unsigned int host_irq, unsigned int vcpu_id,
-		 unsigned int gsi, unsigned int gvec,
-		 u64 pi_desc_addr, bool set),
-	TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set),
+	TP_PROTO(unsigned int host_irq, struct kvm_vcpu *vcpu,
+		 unsigned int gsi, unsigned int gvec, bool set),
+	TP_ARGS(host_irq, vcpu, gsi, gvec, set),

	TP_STRUCT__entry(
		__field(	unsigned int,	host_irq	)
-		__field(	unsigned int,	vcpu_id		)
+		__field(	int,		vcpu_id		)
		__field(	unsigned int,	gsi		)
		__field(	unsigned int,	gvec		)
-		__field(	u64,		pi_desc_addr	)
		__field(	bool,		set		)
	),

	TP_fast_assign(
		__entry->host_irq	= host_irq;
-		__entry->vcpu_id	= vcpu_id;
+		__entry->vcpu_id	= vcpu ? vcpu->vcpu_id : -1;
		__entry->gsi		= gsi;
		__entry->gvec		= gvec;
-		__entry->pi_desc_addr	= pi_desc_addr;
		__entry->set		= set;
	),

-	TP_printk("PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
-		  "gvec: 0x%x, pi_desc_addr: 0x%llx",
+	TP_printk("PI is %s for irq %u, vcpu %d, gsi: 0x%x, gvec: 0x%x",
		  __entry->set ? "enabled and being updated" : "disabled",
		  __entry->host_irq,
		  __entry->vcpu_id,
		  __entry->gsi,
-		  __entry->gvec,
-		  __entry->pi_desc_addr)
+		  __entry->gvec)
 );

 /*
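The kvm_deliver_mode table added above keys the printed name off bits 10:8 of the routing entry or MSI data. A hedged, standalone sketch of that same decode follows; the helper is hypothetical and only restates what the __print_symbolic() calls in the new tracepoints do.

/* Illustrative only: name the APIC delivery mode the new tracepoints print. */
static inline const char *example_deliver_mode_name(u64 msi_data)
{
	static const char * const names[] = {
		"Fixed", "LowPrio", "SMI", "Res3", "NMI", "INIT", "SIPI", "ExtINT",
	};

	return names[(msi_data >> 8) & 0x7];
}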
@@ -15,7 +15,6 @@ extern bool __read_mostly enable_ept;
 extern bool __read_mostly enable_unrestricted_guest;
 extern bool __read_mostly enable_ept_ad_bits;
 extern bool __read_mostly enable_pml;
-extern bool __read_mostly enable_ipiv;
 extern int __read_mostly pt_mode;

 #define PT_MODE_SYSTEM		0

@@ -1014,7 +1014,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 	.nested_ops = &vmx_nested_ops,

 	.pi_update_irte = vmx_pi_update_irte,
-	.pi_start_assignment = vmx_pi_start_assignment,
+	.pi_start_bypass = vmx_pi_start_bypass,

 #ifdef CONFIG_X86_64
 	.set_hv_timer = vt_op(set_hv_timer),

@@ -2,6 +2,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

 #include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>

 #include <asm/irq_remapping.h>
 #include <asm/cpu.h>

@@ -72,13 +73,10 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	/*
 	 * If the vCPU wasn't on the wakeup list and wasn't migrated, then the
 	 * full update can be skipped as neither the vector nor the destination
-	 * needs to be changed.
+	 * needs to be changed.  Clear SN even if there is no assigned device,
+	 * again for simplicity.
 	 */
 	if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
-		/*
-		 * Clear SN if it was set due to being preempted.  Again, do
-		 * this even if there is no assigned device for simplicity.
-		 */
 		if (pi_test_and_clear_sn(pi_desc))
 			goto after_clear_sn;
 		return;

@@ -148,8 +146,13 @@ after_clear_sn:

 static bool vmx_can_use_vtd_pi(struct kvm *kvm)
 {
+	/*
+	 * Note, reading the number of possible bypass IRQs can race with a
+	 * bypass IRQ being attached to the VM.  vmx_pi_start_bypass() ensures
+	 * blockng vCPUs will see an elevated count or get KVM_REQ_UNBLOCK.
+	 */
 	return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() &&
-	       kvm_arch_has_assigned_device(kvm);
+	       READ_ONCE(kvm->arch.nr_possible_bypass_irqs);
 }

 /*

@@ -224,17 +227,23 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
 	if (!vmx_needs_pi_wakeup(vcpu))
 		return;

-	if (kvm_vcpu_is_blocking(vcpu) &&
+	/*
+	 * If the vCPU is blocking with IRQs enabled and ISN'T being preempted,
+	 * enable the wakeup handler so that notification IRQ wakes the vCPU as
+	 * expected.  There is no need to enable the wakeup handler if the vCPU
+	 * is preempted between setting its wait state and manually scheduling
+	 * out, as the task is still runnable, i.e. doesn't need a wake event
+	 * from KVM to be scheduled in.
+	 *
+	 * If the wakeup handler isn't being enabled, Suppress Notifications as
+	 * the cost of propagating PIR.IRR to PID.ON is negligible compared to
+	 * the cost of a spurious IRQ, and vCPU put/load is a slow path.
+	 */
+	if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) &&
 	    ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) ||
 	     (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu))))
 		pi_enable_wakeup_handler(vcpu);
-
-	/*
-	 * Set SN when the vCPU is preempted.  Note, the vCPU can both be seen
-	 * as blocking and preempted, e.g. if it's preempted between setting
-	 * its wait state and manually scheduling out.
-	 */
-	if (vcpu->preempted)
+	else
 		pi_set_sn(pi_desc);
 }

@@ -281,99 +290,30 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)

 /*
- * Bail out of the block loop if the VM has an assigned
- * device, but the blocking vCPU didn't reconfigure the
- * PI.NV to the wakeup vector, i.e. the assigned device
- * came along after the initial check in vmx_vcpu_pi_put().
+ * Kick all vCPUs when the first possible bypass IRQ is attached to a VM, as
+ * blocking vCPUs may scheduled out without reconfiguring PID.NV to the wakeup
+ * vector, i.e. if the bypass IRQ came along after vmx_vcpu_pi_put().
  */
-void vmx_pi_start_assignment(struct kvm *kvm)
+void vmx_pi_start_bypass(struct kvm *kvm)
 {
-	if (!kvm_arch_has_irq_bypass())
+	if (WARN_ON_ONCE(!vmx_can_use_vtd_pi(kvm)))
 		return;

 	kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
 }

-/*
- * vmx_pi_update_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
-		       uint32_t guest_irq, bool set)
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+		       unsigned int host_irq, uint32_t guest_irq,
+		       struct kvm_vcpu *vcpu, u32 vector)
 {
-	struct kvm_kernel_irq_routing_entry *e;
-	struct kvm_irq_routing_table *irq_rt;
-	bool enable_remapped_mode = true;
-	struct kvm_lapic_irq irq;
-	struct kvm_vcpu *vcpu;
-	struct vcpu_data vcpu_info;
-	int idx, ret = 0;
-
-	if (!vmx_can_use_vtd_pi(kvm))
-		return 0;
-
-	idx = srcu_read_lock(&kvm->irq_srcu);
-	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-	if (guest_irq >= irq_rt->nr_rt_entries ||
-	    hlist_empty(&irq_rt->map[guest_irq])) {
-		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
-			     guest_irq, irq_rt->nr_rt_entries);
-		goto out;
-	}
-
-	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
-		if (e->type != KVM_IRQ_ROUTING_MSI)
-			continue;
-		/*
-		 * VT-d PI cannot support posting multicast/broadcast
-		 * interrupts to a vCPU, we still use interrupt remapping
-		 * for these kind of interrupts.
-		 *
-		 * For lowest-priority interrupts, we only support
-		 * those with single CPU as the destination, e.g. user
-		 * configures the interrupts via /proc/irq or uses
-		 * irqbalance to make the interrupts single-CPU.
-		 *
-		 * We will support full lowest-priority interrupt later.
-		 *
-		 * In addition, we can only inject generic interrupts using
-		 * the PI mechanism, refuse to route others through it.
-		 */
-
-		kvm_set_msi_irq(kvm, e, &irq);
-		if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
-		    !kvm_irq_is_postable(&irq))
-			continue;
-
-		vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
-		vcpu_info.vector = irq.vector;
-
-		trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
-					 vcpu_info.vector, vcpu_info.pi_desc_addr, set);
-
-		if (!set)
-			continue;
-
-		enable_remapped_mode = false;
-
-		ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
-		if (ret < 0) {
-			printk(KERN_INFO "%s: failed to update PI IRTE\n",
-			       __func__);
-			goto out;
-		}
-	}
-
-	if (enable_remapped_mode)
-		ret = irq_set_vcpu_affinity(host_irq, NULL);
-
-	ret = 0;
-out:
-	srcu_read_unlock(&kvm->irq_srcu, idx);
-	return ret;
+	if (vcpu) {
+		struct intel_iommu_pi_data pi_data = {
+			.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)),
+			.vector = vector,
+		};
+
+		return irq_set_vcpu_affinity(host_irq, &pi_data);
+	} else {
+		return irq_set_vcpu_affinity(host_irq, NULL);
+	}
 }
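With the reworked prototype above, the vcpu argument is what selects between posting and falling back to remapped mode. A hedged sketch of the two call shapes; the wrapper function is hypothetical, only the vmx_pi_update_irte() calls reflect the patch.

/* Illustrative only: post to a single target vCPU, or fall back to remapping. */
static int example_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
			       unsigned int host_irq, uint32_t guest_irq,
			       struct kvm_vcpu *target, u32 vector)
{
	if (target)
		return vmx_pi_update_irte(irqfd, kvm, host_irq, guest_irq,
					  target, vector);

	return vmx_pi_update_irte(irqfd, kvm, host_irq, guest_irq, NULL, 0);
}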
@@ -3,6 +3,9 @@
 #define __KVM_X86_VMX_POSTED_INTR_H

 #include <linux/bitmap.h>
+#include <linux/find.h>
+#include <linux/kvm_host.h>

 #include <asm/posted_intr.h>

 void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);

@@ -11,9 +14,10 @@ void pi_wakeup_handler(void);
 void __init pi_init_cpu(int cpu);
 void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
 bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
-		       uint32_t guest_irq, bool set);
-void vmx_pi_start_assignment(struct kvm *kvm);
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+		       unsigned int host_irq, uint32_t guest_irq,
+		       struct kvm_vcpu *vcpu, u32 vector);
+void vmx_pi_start_bypass(struct kvm *kvm);

 static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
 {

@@ -113,8 +113,6 @@ static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, 0444);

 module_param(enable_apicv, bool, 0444);

-bool __read_mostly enable_ipiv = true;
 module_param(enable_ipiv, bool, 0444);

 module_param(enable_device_posted_irqs, bool, 0444);

@@ -226,6 +226,9 @@ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
 bool __read_mostly enable_apicv = true;
 EXPORT_SYMBOL_GPL(enable_apicv);

+bool __read_mostly enable_ipiv = true;
+EXPORT_SYMBOL_GPL(enable_ipiv);
+
 bool __read_mostly enable_device_posted_irqs = true;
 EXPORT_SYMBOL_GPL(enable_device_posted_irqs);

@@ -4634,17 +4637,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
	case KVM_CAP_EXT_CPUID:
	case KVM_CAP_EXT_EMUL_CPUID:
	case KVM_CAP_CLOCKSOURCE:
+#ifdef CONFIG_KVM_IOAPIC
	case KVM_CAP_PIT:
+	case KVM_CAP_PIT2:
+	case KVM_CAP_PIT_STATE2:
+	case KVM_CAP_REINJECT_CONTROL:
+#endif
	case KVM_CAP_NOP_IO_DELAY:
	case KVM_CAP_MP_STATE:
	case KVM_CAP_SYNC_MMU:
	case KVM_CAP_USER_NMI:
-	case KVM_CAP_REINJECT_CONTROL:
	case KVM_CAP_IRQ_INJECT_STATUS:
	case KVM_CAP_IOEVENTFD:
	case KVM_CAP_IOEVENTFD_NO_LENGTH:
-	case KVM_CAP_PIT2:
-	case KVM_CAP_PIT_STATE2:
	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
	case KVM_CAP_VCPU_EVENTS:
 #ifdef CONFIG_KVM_HYPERV

@@ -6401,135 +6407,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
	return 0;
 }

-static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
-	struct kvm_pic *pic = kvm->arch.vpic;
-	int r;
-
-	r = 0;
-	switch (chip->chip_id) {
-	case KVM_IRQCHIP_PIC_MASTER:
-		memcpy(&chip->chip.pic, &pic->pics[0],
-			sizeof(struct kvm_pic_state));
-		break;
-	case KVM_IRQCHIP_PIC_SLAVE:
-		memcpy(&chip->chip.pic, &pic->pics[1],
-			sizeof(struct kvm_pic_state));
-		break;
-	case KVM_IRQCHIP_IOAPIC:
-		kvm_get_ioapic(kvm, &chip->chip.ioapic);
-		break;
-	default:
-		r = -EINVAL;
-		break;
-	}
-	return r;
-}
-
-static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
-	struct kvm_pic *pic = kvm->arch.vpic;
-	int r;
-
-	r = 0;
-	switch (chip->chip_id) {
-	case KVM_IRQCHIP_PIC_MASTER:
-		spin_lock(&pic->lock);
-		memcpy(&pic->pics[0], &chip->chip.pic,
-			sizeof(struct kvm_pic_state));
-		spin_unlock(&pic->lock);
-		break;
-	case KVM_IRQCHIP_PIC_SLAVE:
-		spin_lock(&pic->lock);
-		memcpy(&pic->pics[1], &chip->chip.pic,
-			sizeof(struct kvm_pic_state));
-		spin_unlock(&pic->lock);
-		break;
-	case KVM_IRQCHIP_IOAPIC:
-		kvm_set_ioapic(kvm, &chip->chip.ioapic);
-		break;
-	default:
-		r = -EINVAL;
-		break;
-	}
-	kvm_pic_update_irq(pic);
-	return r;
-}
-
-static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
-	struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
-
-	BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
-
-	mutex_lock(&kps->lock);
-	memcpy(ps, &kps->channels, sizeof(*ps));
-	mutex_unlock(&kps->lock);
-	return 0;
-}
-
-static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
-	int i;
-	struct kvm_pit *pit = kvm->arch.vpit;
-
-	mutex_lock(&pit->pit_state.lock);
-	memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
-	for (i = 0; i < 3; i++)
-		kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
-	mutex_unlock(&pit->pit_state.lock);
-	return 0;
-}
-
-static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
-	mutex_lock(&kvm->arch.vpit->pit_state.lock);
-	memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
-		sizeof(ps->channels));
-	ps->flags = kvm->arch.vpit->pit_state.flags;
-	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
-	memset(&ps->reserved, 0, sizeof(ps->reserved));
-	return 0;
-}
-
-static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
-	int start = 0;
-	int i;
-	u32 prev_legacy, cur_legacy;
-	struct kvm_pit *pit = kvm->arch.vpit;
-
-	mutex_lock(&pit->pit_state.lock);
-	prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
-	cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
-	if (!prev_legacy && cur_legacy)
-		start = 1;
-	memcpy(&pit->pit_state.channels, &ps->channels,
-	       sizeof(pit->pit_state.channels));
-	pit->pit_state.flags = ps->flags;
-	for (i = 0; i < 3; i++)
-		kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
-				   start && i == 0);
-	mutex_unlock(&pit->pit_state.lock);
-	return 0;
-}
-
-static int kvm_vm_ioctl_reinject(struct kvm *kvm,
-				 struct kvm_reinject_control *control)
-{
-	struct kvm_pit *pit = kvm->arch.vpit;
-
-	/* pit->pit_state.lock was overloaded to prevent userspace from getting
-	 * an inconsistent state after running multiple KVM_REINJECT_CONTROL
-	 * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
-	 */
-	mutex_lock(&pit->pit_state.lock);
-	kvm_pit_set_reinject(pit, control->pit_reinject);
-	mutex_unlock(&pit->pit_state.lock);
-
-	return 0;
-}
-
 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {

@@ -6549,18 +6426,6 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
	kvm_vcpu_kick(vcpu);
 }

-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
-			  bool line_status)
-{
-	if (!irqchip_in_kernel(kvm))
-		return -ENXIO;
-
-	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
-					irq_event->irq, irq_event->level,
-					line_status);
-	return 0;
-}
-
 int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
			    struct kvm_enable_cap *cap)
 {

@@ -7072,9 +6937,11 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r = -ENOTTY;

+#ifdef CONFIG_KVM_IOAPIC
	/*
	 * This union makes it completely explicit to gcc-3.x
-	 * that these two variables' stack usage should be
+	 * that these three variables' stack usage should be
	 * combined, not added together.
	 */
	union {

@@ -7082,6 +6949,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
		struct kvm_pit_state2 ps2;
		struct kvm_pit_config pit_config;
	} u;
+#endif

	switch (ioctl) {
	case KVM_SET_TSS_ADDR:

@@ -7105,6 +6973,7 @@ set_identity_unlock:
	case KVM_SET_NR_MMU_PAGES:
		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
		break;
+#ifdef CONFIG_KVM_IOAPIC
	case KVM_CREATE_IRQCHIP: {
		mutex_lock(&kvm->lock);

@@ -7126,7 +6995,7 @@ set_identity_unlock:
			goto create_irqchip_unlock;
		}

-		r = kvm_setup_default_irq_routing(kvm);
+		r = kvm_setup_default_ioapic_and_pic_routing(kvm);
		if (r) {
			kvm_ioapic_destroy(kvm);
			kvm_pic_destroy(kvm);

@@ -7174,7 +7043,7 @@ set_identity_unlock:
		}

		r = -ENXIO;
-		if (!irqchip_kernel(kvm))
+		if (!irqchip_full(kvm))
			goto get_irqchip_out;
		r = kvm_vm_ioctl_get_irqchip(kvm, chip);
		if (r)

@@ -7198,7 +7067,7 @@ set_identity_unlock:
		}

		r = -ENXIO;
-		if (!irqchip_kernel(kvm))
+		if (!irqchip_full(kvm))
			goto set_irqchip_out;
		r = kvm_vm_ioctl_set_irqchip(kvm, chip);
 set_irqchip_out:

@@ -7271,6 +7140,7 @@ set_pit2_out:
		r = kvm_vm_ioctl_reinject(kvm, &control);
		break;
	}
+#endif
	case KVM_SET_BOOT_CPU_ID:
		r = 0;
		mutex_lock(&kvm->lock);

@@ -10730,8 +10600,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)

	if (irqchip_split(vcpu->kvm))
		kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
+#ifdef CONFIG_KVM_IOAPIC
	else if (ioapic_in_kernel(vcpu->kvm))
		kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+#endif

	if (is_guest_mode(vcpu))
		vcpu->arch.load_eoi_exitmap_pending = true;

@@ -12801,15 +12673,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
	if (ret)
		goto out_uninit_mmu;

-	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
	atomic_set(&kvm->arch.noncoherent_dma_count, 0);

-	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
-	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
-	/* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
-	set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
-		&kvm->arch.irq_sources_bitmap);
-
	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
	mutex_init(&kvm->arch.apic_map_lock);
	seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);

@@ -12940,7 +12805,9 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
	cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
	cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);

+#ifdef CONFIG_KVM_IOAPIC
	kvm_free_pit(kvm);
+#endif

	kvm_mmu_pre_destroy_vm(kvm);
	static_call_cond(kvm_x86_vm_pre_destroy)(kvm);

@@ -12964,8 +12831,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
	}
	kvm_destroy_vcpus(kvm);
	kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
+#ifdef CONFIG_KVM_IOAPIC
	kvm_pic_destroy(kvm);
	kvm_ioapic_destroy(kvm);
+#endif
	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
	kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
	kvm_mmu_uninit_vm(kvm);

@@ -13577,8 +13446,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)

 void kvm_arch_start_assignment(struct kvm *kvm)
 {
-	if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
-		kvm_x86_call(pi_start_assignment)(kvm);
+	atomic_inc(&kvm->arch.assigned_device_count);
 }
 EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);

@@ -13629,77 +13497,6 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);

-int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
-				      struct irq_bypass_producer *prod)
-{
-	struct kvm_kernel_irqfd *irqfd =
-		container_of(cons, struct kvm_kernel_irqfd, consumer);
-	struct kvm *kvm = irqfd->kvm;
-	int ret;
-
-	kvm_arch_start_assignment(irqfd->kvm);
-
-	spin_lock_irq(&kvm->irqfds.lock);
-	irqfd->producer = prod;
-
-	ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
-					   prod->irq, irqfd->gsi, 1);
-	if (ret)
-		kvm_arch_end_assignment(irqfd->kvm);
-
-	spin_unlock_irq(&kvm->irqfds.lock);
-
-	return ret;
-}
-
-void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
-				      struct irq_bypass_producer *prod)
-{
-	int ret;
-	struct kvm_kernel_irqfd *irqfd =
-		container_of(cons, struct kvm_kernel_irqfd, consumer);
-	struct kvm *kvm = irqfd->kvm;
-
-	WARN_ON(irqfd->producer != prod);
-
-	/*
-	 * When producer of consumer is unregistered, we change back to
-	 * remapped mode, so we can re-use the current implementation
-	 * when the irq is masked/disabled or the consumer side (KVM
-	 * int this case doesn't want to receive the interrupts.
-	 */
-	spin_lock_irq(&kvm->irqfds.lock);
-	irqfd->producer = NULL;
-
-	ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
-					   prod->irq, irqfd->gsi, 0);
-	if (ret)
-		printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
-		       " fails: %d\n", irqfd->consumer.token, ret);
-
-	spin_unlock_irq(&kvm->irqfds.lock);
-
-	kvm_arch_end_assignment(irqfd->kvm);
-}
-
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
-				  uint32_t guest_irq, bool set)
-{
-	return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set);
-}
-
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
-				  struct kvm_kernel_irq_routing_entry *new)
-{
-	if (old->type != KVM_IRQ_ROUTING_MSI ||
-	    new->type != KVM_IRQ_ROUTING_MSI)
-		return true;
-
-	return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
-}
-
 bool kvm_vector_hashing_enabled(void)
 {
	return vector_hashing;

@@ -14099,7 +13896,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);

@@ -368,6 +368,14 @@ static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh,
		container_of(polltbl, struct mshv_irqfd, irqfd_polltbl);

	irqfd->irqfd_wqh = wqh;

+	/*
+	 * TODO: Ensure there isn't already an exclusive, priority waiter, e.g.
+	 * that the irqfd isn't already bound to another partition.  Only the
+	 * first exclusive waiter encountered will be notified, and
+	 * add_wait_queue_priority() doesn't enforce exclusivity.
+	 */
+	irqfd->irqfd_wait.flags |= WQ_FLAG_EXCLUSIVE;
	add_wait_queue_priority(wqh, &irqfd->irqfd_wait);
 }
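The mshv hunk above combines WQ_FLAG_EXCLUSIVE with add_wait_queue_priority() so that at most one waiter is notified per event. A hedged sketch of that registration pattern in isolation; the function and its arguments are hypothetical, the waitqueue APIs are the existing kernel ones.

/* Illustrative only: register an exclusive, priority waiter on a waitqueue. */
static void example_register_priority_waiter(wait_queue_head_t *wqh,
					     wait_queue_entry_t *wait,
					     wait_queue_func_t wake_fn)
{
	init_waitqueue_func_entry(wait, wake_fn);
	wait->flags |= WQ_FLAG_EXCLUSIVE;	/* only the first priority waiter is woken */
	add_wait_queue_priority(wqh, wait);
}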
@@ -1054,7 +1054,6 @@ struct irq_2_irte {
 };

 struct amd_ir_data {
-	u32 cached_ga_tag;
	struct amd_iommu *iommu;
	struct irq_2_irte irq_2_irte;
	struct msi_msg msi_entry;
@@ -3804,13 +3804,70 @@ static const struct irq_domain_ops amd_ir_domain_ops = {
	.deactivate = irq_remapping_deactivate,
 };

-int amd_iommu_activate_guest_mode(void *data)
+static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu,
+				  bool ga_log_intr)
+{
+	if (cpu >= 0) {
+		entry->lo.fields_vapic.destination =
+					APICID_TO_IRTE_DEST_LO(cpu);
+		entry->hi.fields.destination =
+					APICID_TO_IRTE_DEST_HI(cpu);
+		entry->lo.fields_vapic.is_run = true;
+		entry->lo.fields_vapic.ga_log_intr = false;
+	} else {
+		entry->lo.fields_vapic.is_run = false;
+		entry->lo.fields_vapic.ga_log_intr = ga_log_intr;
+	}
+}
+
+/*
+ * Update the pCPU information for an IRTE that is configured to post IRQs to
+ * a vCPU, without issuing an IOMMU invalidation for the IRTE.
+ *
+ * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination
+ * with the pCPU's APIC ID, set IsRun, and clear GALogIntr.  If the vCPU isn't
+ * associated with a pCPU (@cpu < 0), clear IsRun and set/clear GALogIntr based
+ * on input from the caller (e.g. KVM only requests GALogIntr when the vCPU is
+ * blocking and requires a notification wake event).  I.e. treat vCPUs that are
+ * associated with a pCPU as running.  This API is intended to be used when a
+ * vCPU is scheduled in/out (or stops running for any reason), to do a fast
+ * update of IsRun, GALogIntr, and (conditionally) Destination.
+ *
+ * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached
+ * and thus don't require an invalidation to ensure the IOMMU consumes fresh
+ * information.
+ */
+int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr)
+{
+	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
+	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
+
+	if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
+		return -EINVAL;
+
+	if (!entry || !entry->lo.fields_vapic.guest_mode)
+		return 0;
+
+	if (!ir_data->iommu)
+		return -ENODEV;
+
+	__amd_iommu_update_ga(entry, cpu, ga_log_intr);
+
+	return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
+				ir_data->irq_2_irte.index, entry);
+}
+EXPORT_SYMBOL(amd_iommu_update_ga);
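A hedged usage sketch for the helper above; the caller is hypothetical, but the argument meanings follow the comment block in the patch and mirror how the AVIC hunks earlier invoke it.

/* Illustrative only: fast-path IRTE update when a vCPU is scheduled in or out. */
static void example_vcpu_sched_update(void *irq_bypass_data, int pcpu, bool vcpu_is_blocking)
{
	/*
	 * pcpu >= 0: the vCPU runs on that pCPU, so IsRun is set and GALogIntr
	 * is cleared.  pcpu < 0: IsRun is cleared and GALogIntr is set only
	 * when the (blocking) vCPU needs a wake event.
	 */
	WARN_ON_ONCE(amd_iommu_update_ga(irq_bypass_data, pcpu, vcpu_is_blocking));
}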
+int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr)
 {
	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
	u64 valid;

-	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry)
+	if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
+		return -EINVAL;
+
+	if (!entry)
		return 0;

	valid = entry->lo.fields_vapic.valid;

@@ -3820,11 +3877,12 @@ int amd_iommu_activate_guest_mode(void *data)

	entry->lo.fields_vapic.valid       = valid;
	entry->lo.fields_vapic.guest_mode  = 1;
-	entry->lo.fields_vapic.ga_log_intr = 1;
	entry->hi.fields.ga_root_ptr       = ir_data->ga_root_ptr;
	entry->hi.fields.vector            = ir_data->ga_vector;
	entry->lo.fields_vapic.ga_tag      = ir_data->ga_tag;

+	__amd_iommu_update_ga(entry, cpu, ga_log_intr);
+
	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
			      ir_data->irq_2_irte.index, entry);
 }

@@ -3837,8 +3895,10 @@ int amd_iommu_deactivate_guest_mode(void *data)
	struct irq_cfg *cfg = ir_data->cfg;
	u64 valid;

-	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
-	    !entry || !entry->lo.fields_vapic.guest_mode)
+	if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
+		return -EINVAL;
+
+	if (!entry || !entry->lo.fields_vapic.guest_mode)
		return 0;

	valid = entry->lo.fields_remap.valid;

@@ -3860,11 +3920,10 @@ int amd_iommu_deactivate_guest_mode(void *data)
 }
 EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);

-static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
+static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info)
 {
	int ret;
-	struct amd_iommu_pi_data *pi_data = vcpu_info;
-	struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
+	struct amd_iommu_pi_data *pi_data = info;
	struct amd_ir_data *ir_data = data->chip_data;
	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
	struct iommu_dev_data *dev_data;

@@ -3885,25 +3944,20 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
		return -EINVAL;

	ir_data->cfg = irqd_cfg(data);
-	pi_data->ir_data = ir_data;

-	pi_data->prev_ga_tag = ir_data->cached_ga_tag;
-	if (pi_data->is_guest_mode) {
-		ir_data->ga_root_ptr = (pi_data->base >> 12);
-		ir_data->ga_vector = vcpu_pi_info->vector;
+	if (pi_data) {
+		pi_data->ir_data = ir_data;
+
+		ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12);
+		ir_data->ga_vector = pi_data->vector;
		ir_data->ga_tag = pi_data->ga_tag;
-		ret = amd_iommu_activate_guest_mode(ir_data);
-		if (!ret)
-			ir_data->cached_ga_tag = pi_data->ga_tag;
+		if (pi_data->is_guest_mode)
+			ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu,
+							    pi_data->ga_log_intr);
|
else
|
||||||
|
ret = amd_iommu_deactivate_guest_mode(ir_data);
|
||||||
} else {
|
} else {
|
||||||
ret = amd_iommu_deactivate_guest_mode(ir_data);
|
ret = amd_iommu_deactivate_guest_mode(ir_data);
|
||||||
|
|
||||||
/*
|
|
||||||
* This communicates the ga_tag back to the caller
|
|
||||||
* so that it can do all the necessary clean up.
|
|
||||||
*/
|
|
||||||
if (!ret)
|
|
||||||
ir_data->cached_ga_tag = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
|
|
@ -3995,29 +4049,4 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int amd_iommu_update_ga(int cpu, bool is_run, void *data)
|
|
||||||
{
|
|
||||||
struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
|
|
||||||
struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
|
|
||||||
|
|
||||||
if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
|
|
||||||
!entry || !entry->lo.fields_vapic.guest_mode)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
if (!ir_data->iommu)
|
|
||||||
return -ENODEV;
|
|
||||||
|
|
||||||
if (cpu >= 0) {
|
|
||||||
entry->lo.fields_vapic.destination =
|
|
||||||
APICID_TO_IRTE_DEST_LO(cpu);
|
|
||||||
entry->hi.fields.destination =
|
|
||||||
APICID_TO_IRTE_DEST_HI(cpu);
|
|
||||||
}
|
|
||||||
entry->lo.fields_vapic.is_run = is_run;
|
|
||||||
|
|
||||||
return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
|
|
||||||
ir_data->irq_2_irte.index, entry);
|
|
||||||
}
|
|
||||||
EXPORT_SYMBOL(amd_iommu_update_ga);
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
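Illustrative sketch (not part of the diff): how a hypervisor-side caller might drive the reworked amd_iommu_update_ga() on vCPU sched-in/sched-out. The wrapper name and the is_blocking parameter are assumptions for illustration; only the amd_iommu_update_ga() signature comes from the change above.

    #include <linux/amd-iommu.h>

    /* Hypothetical helper; @ir_data is the opaque cookie the IOMMU hands back to KVM. */
    static void example_vcpu_update_iommu(void *ir_data, int cpu, bool is_blocking)
    {
        /*
         * cpu >= 0: the vCPU is running on that pCPU, so Destination is rewritten,
         * IsRun is set, and GALogIntr is cleared.  cpu < 0: the vCPU is not running;
         * request a GA log interrupt only when the vCPU is blocking and needs a wake.
         */
        WARN_ON_ONCE(amd_iommu_update_ga(ir_data, cpu, is_blocking));
    }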
@@ -1244,10 +1244,10 @@ static void intel_ir_compose_msi_msg(struct irq_data *irq_data,
 static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
 {
 	struct intel_ir_data *ir_data = data->chip_data;
-	struct vcpu_data *vcpu_pi_info = info;
+	struct intel_iommu_pi_data *pi_data = info;
 
 	/* stop posting interrupts, back to the default mode */
-	if (!vcpu_pi_info) {
+	if (!pi_data) {
 		__intel_ir_reconfigure_irte(data, true);
 	} else {
 		struct irte irte_pi;
@@ -1265,10 +1265,10 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
 		/* Update the posted mode fields */
 		irte_pi.p_pst = 1;
 		irte_pi.p_urgent = 0;
-		irte_pi.p_vector = vcpu_pi_info->vector;
-		irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >>
+		irte_pi.p_vector = pi_data->vector;
+		irte_pi.pda_l = (pi_data->pi_desc_addr >>
 				(32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT);
-		irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) &
+		irte_pi.pda_h = (pi_data->pi_desc_addr >> 32) &
 				~(-1UL << PDA_HIGH_BIT);
 
 		ir_data->irq_2_iommu.posted_vcpu = true;
@@ -342,10 +342,10 @@ int its_get_vlpi(int irq, struct its_vlpi_map *map)
 	return irq_set_vcpu_affinity(irq, &info);
 }
 
-int its_unmap_vlpi(int irq)
+void its_unmap_vlpi(int irq)
 {
 	irq_clear_status_flags(irq, IRQ_DISABLE_UNLAZY);
-	return irq_set_vcpu_affinity(irq, NULL);
+	WARN_ON_ONCE(irq_set_vcpu_affinity(irq, NULL));
 }
 
 int its_prop_update_vlpi(int irq, u8 config, bool inv)
@@ -505,15 +505,11 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev,
 	if (ret)
 		goto out_put_eventfd_ctx;
 
-	ctx->producer.token = trigger;
-	ctx->producer.irq = irq;
-	ret = irq_bypass_register_producer(&ctx->producer);
+	ret = irq_bypass_register_producer(&ctx->producer, trigger, irq);
 	if (unlikely(ret)) {
 		dev_info(&pdev->dev,
-		"irq bypass producer (token %p) registration fails: %d\n",
-		ctx->producer.token, ret);
+		"irq bypass producer (eventfd %p) registration fails: %d\n",
+		trigger, ret);
 
-		ctx->producer.token = NULL;
 	}
 	ctx->trigger = trigger;
@@ -212,11 +212,11 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid)
 	if (!vq->call_ctx.ctx)
 		return;
 
-	vq->call_ctx.producer.irq = irq;
-	ret = irq_bypass_register_producer(&vq->call_ctx.producer);
+	ret = irq_bypass_register_producer(&vq->call_ctx.producer,
+					   vq->call_ctx.ctx, irq);
 	if (unlikely(ret))
-		dev_info(&v->dev, "vq %u, irq bypass producer (token %p) registration fails, ret = %d\n",
-			 qid, vq->call_ctx.producer.token, ret);
+		dev_info(&v->dev, "vq %u, irq bypass producer (eventfd %p) registration fails, ret = %d\n",
+			 qid, vq->call_ctx.ctx, ret);
 }
 
 static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, u16 qid)
@@ -712,7 +712,6 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
 			if (ops->get_status(vdpa) &
 			    VIRTIO_CONFIG_S_DRIVER_OK)
 				vhost_vdpa_unsetup_vq_irq(v, idx);
-			vq->call_ctx.producer.token = NULL;
 		}
 		break;
 	}
@@ -753,7 +752,6 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
 		cb.callback = vhost_vdpa_virtqueue_cb;
 		cb.private = vq;
 		cb.trigger = vq->call_ctx.ctx;
-		vq->call_ctx.producer.token = vq->call_ctx.ctx;
 		if (ops->get_status(vdpa) &
 		    VIRTIO_CONFIG_S_DRIVER_OK)
 			vhost_vdpa_setup_vq_irq(v, idx);
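Illustrative sketch (not part of the diff): the producer side of the reworked registration, as a driver like VFIO or vhost-vdpa would now use it. The example_* names are hypothetical; only the irq_bypass_register_producer()/irq_bypass_unregister_producer() signatures come from the changes above.

    #include <linux/irqbypass.h>
    #include <linux/eventfd.h>

    static struct irq_bypass_producer example_producer;

    /* Register the device's trigger eventfd and host IRQ as a bypass producer. */
    static int example_wire_producer(struct eventfd_ctx *trigger, int irq)
    {
        /* On success, the manager pairs us with any consumer bound to @trigger. */
        return irq_bypass_register_producer(&example_producer, trigger, irq);
    }

    static void example_unwire_producer(void)
    {
        irq_bypass_unregister_producer(&example_producer);
    }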
@@ -434,7 +434,7 @@ struct kvm_kernel_irq_routing_entry;
 int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int irq,
 			       struct kvm_kernel_irq_routing_entry *irq_entry);
 
-int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq);
+void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq);
 
 int vgic_v4_load(struct kvm_vcpu *vcpu);
 void vgic_v4_commit(struct kvm_vcpu *vcpu);
@@ -12,20 +12,6 @@
 
 struct amd_iommu;
 
-/*
- * This is mainly used to communicate information back-and-forth
- * between SVM and IOMMU for setting up and tearing down posted
- * interrupt
- */
-struct amd_iommu_pi_data {
-	u32 ga_tag;
-	u32 prev_ga_tag;
-	u64 base;
-	bool is_guest_mode;
-	struct vcpu_data *vcpu_data;
-	void *ir_data;
-};
-
 #ifdef CONFIG_AMD_IOMMU
 
 struct task_struct;
@@ -44,10 +30,8 @@ static inline void amd_iommu_detect(void) { }
 /* IOMMU AVIC Function */
 extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32));
 
-extern int
-amd_iommu_update_ga(int cpu, bool is_run, void *data);
-
-extern int amd_iommu_activate_guest_mode(void *data);
+extern int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr);
+extern int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr);
 extern int amd_iommu_deactivate_guest_mode(void *data);
 
 #else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */
@@ -58,13 +42,12 @@ amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
 	return 0;
 }
 
-static inline int
-amd_iommu_update_ga(int cpu, bool is_run, void *data)
+static inline int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr)
 {
 	return 0;
 }
 
-static inline int amd_iommu_activate_guest_mode(void *data)
+static inline int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr)
 {
 	return 0;
 }
@@ -10,6 +10,7 @@
 
 #include <linux/list.h>
 
+struct eventfd_ctx;
 struct irq_bypass_consumer;
 
 /*
@@ -18,20 +19,22 @@ struct irq_bypass_consumer;
  * The IRQ bypass manager is a simple set of lists and callbacks that allows
  * IRQ producers (ex. physical interrupt sources) to be matched to IRQ
  * consumers (ex. virtualization hardware that allows IRQ bypass or offload)
- * via a shared token (ex. eventfd_ctx).  Producers and consumers register
- * independently.  When a token match is found, the optional @stop callback
- * will be called for each participant.  The pair will then be connected via
- * the @add_* callbacks, and finally the optional @start callback will allow
- * any final coordination.  When either participant is unregistered, the
- * process is repeated using the @del_* callbacks in place of the @add_*
- * callbacks.  Match tokens must be unique per producer/consumer, 1:N pairings
- * are not supported.
+ * via a shared eventfd_ctx.  Producers and consumers register independently.
+ * When a producer and consumer are paired, i.e. an eventfd match is found, the
+ * optional @stop callback will be called for each participant.  The pair will
+ * then be connected via the @add_* callbacks, and finally the optional @start
+ * callback will allow any final coordination.  When either participant is
+ * unregistered, the process is repeated using the @del_* callbacks in place of
+ * the @add_* callbacks.  eventfds must be unique per producer/consumer, 1:N
+ * pairings are not supported.
  */
 
+struct irq_bypass_consumer;
+
 /**
  * struct irq_bypass_producer - IRQ bypass producer definition
- * @node: IRQ bypass manager private list management
- * @token: opaque token to match between producer and consumer (non-NULL)
+ * @eventfd: eventfd context used to match producers and consumers
+ * @consumer: The connected consumer (NULL if no connection)
  * @irq: Linux IRQ number for the producer device
  * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional)
  * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional)
@@ -43,8 +46,8 @@ struct irq_bypass_consumer;
  * for a physical device assigned to a VM.
  */
 struct irq_bypass_producer {
-	struct list_head node;
-	void *token;
+	struct eventfd_ctx *eventfd;
+	struct irq_bypass_consumer *consumer;
 	int irq;
 	int (*add_consumer)(struct irq_bypass_producer *,
 			    struct irq_bypass_consumer *);
@@ -56,8 +59,8 @@ struct irq_bypass_producer {
 
 /**
  * struct irq_bypass_consumer - IRQ bypass consumer definition
- * @node: IRQ bypass manager private list management
- * @token: opaque token to match between producer and consumer (non-NULL)
+ * @eventfd: eventfd context used to match producers and consumers
+ * @producer: The connected producer (NULL if no connection)
  * @add_producer: Connect the IRQ consumer to an IRQ producer
 * @del_producer: Disconnect the IRQ consumer from an IRQ producer
 * @stop: Perform any quiesce operations necessary prior to add/del (optional)
@@ -69,8 +72,9 @@ struct irq_bypass_producer {
 * portions of the interrupt handling to the VM.
 */
 struct irq_bypass_consumer {
-	struct list_head node;
-	void *token;
+	struct eventfd_ctx *eventfd;
+	struct irq_bypass_producer *producer;
+
 	int (*add_producer)(struct irq_bypass_consumer *,
 			    struct irq_bypass_producer *);
 	void (*del_producer)(struct irq_bypass_consumer *,
@@ -79,9 +83,11 @@ struct irq_bypass_consumer {
 	void (*start)(struct irq_bypass_consumer *);
 };
 
-int irq_bypass_register_producer(struct irq_bypass_producer *);
-void irq_bypass_unregister_producer(struct irq_bypass_producer *);
-int irq_bypass_register_consumer(struct irq_bypass_consumer *);
-void irq_bypass_unregister_consumer(struct irq_bypass_consumer *);
+int irq_bypass_register_producer(struct irq_bypass_producer *producer,
+				 struct eventfd_ctx *eventfd, int irq);
+void irq_bypass_unregister_producer(struct irq_bypass_producer *producer);
+int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer,
+				 struct eventfd_ctx *eventfd);
+void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer);
 
 #endif /* IRQBYPASS_H */
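Illustrative sketch (not part of the diff): a minimal consumer using the new eventfd-keyed API declared above. All example_* names are made up; the callback and registration signatures are taken from the header as changed here.

    #include <linux/irqbypass.h>
    #include <linux/eventfd.h>

    static int example_add_producer(struct irq_bypass_consumer *cons,
                                    struct irq_bypass_producer *prod)
    {
        /* Wire prod->irq into the virtualization hardware here. */
        return 0;
    }

    static void example_del_producer(struct irq_bypass_consumer *cons,
                                     struct irq_bypass_producer *prod)
    {
        /* Tear down the posted-interrupt mapping for prod->irq here. */
    }

    static struct irq_bypass_consumer example_consumer = {
        .add_producer = example_add_producer,
        .del_producer = example_del_producer,
    };

    /* @eventfd obtained elsewhere, e.g. via eventfd_ctx_fdget() on the irqfd's fd. */
    static int example_register_consumer(struct eventfd_ctx *eventfd)
    {
        return irq_bypass_register_consumer(&example_consumer, eventfd);
    }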
@@ -146,7 +146,7 @@ int its_commit_vpe(struct its_vpe *vpe);
 int its_invall_vpe(struct its_vpe *vpe);
 int its_map_vlpi(int irq, struct its_vlpi_map *map);
 int its_get_vlpi(int irq, struct its_vlpi_map *map);
-int its_unmap_vlpi(int irq);
+void its_unmap_vlpi(int irq);
 int its_prop_update_vlpi(int irq, u8 config, bool inv);
 int its_prop_update_vsgi(int irq, u8 priority, bool group);
@@ -190,6 +190,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
+#define KVM_PIT_IRQ_SOURCE_ID			2
 
 extern struct mutex kvm_lock;
 extern struct list_head vm_list;
@@ -1022,16 +1023,12 @@ void kvm_unlock_all_vcpus(struct kvm *kvm);
 void vcpu_load(struct kvm_vcpu *vcpu);
 void vcpu_put(struct kvm_vcpu *vcpu);
 
-#ifdef __KVM_HAVE_IOAPIC
+#ifdef CONFIG_KVM_IOAPIC
 void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm);
-void kvm_arch_post_irq_routing_update(struct kvm *kvm);
 #else
 static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
 {
 }
-static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm)
-{
-}
 #endif
 
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
@@ -1788,8 +1785,6 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian);
 void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 				     struct kvm_irq_ack_notifier *kian);
-int kvm_request_irq_source_id(struct kvm *kvm);
-void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
 
 /*
@@ -2406,6 +2401,8 @@ struct kvm_vcpu *kvm_get_running_vcpu(void);
 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
 
 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
+struct kvm_kernel_irqfd;
+
 bool kvm_arch_has_irq_bypass(void);
 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *,
 				   struct irq_bypass_producer *);
@@ -2413,10 +2410,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *,
 				   struct irq_bypass_producer *);
 void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *);
 void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *);
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
-				  uint32_t guest_irq, bool set);
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *,
-				  struct kvm_kernel_irq_routing_entry *);
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+				   struct kvm_kernel_irq_routing_entry *old,
+				   struct kvm_kernel_irq_routing_entry *new);
 #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
 
 #ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS
@@ -55,10 +55,13 @@ struct kvm_kernel_irqfd {
 	/* Used for setup/shutdown */
 	struct eventfd_ctx *eventfd;
 	struct list_head list;
-	poll_table pt;
 	struct work_struct shutdown;
 	struct irq_bypass_consumer consumer;
 	struct irq_bypass_producer *producer;
+
+	struct kvm_vcpu *irq_bypass_vcpu;
+	struct list_head vcpu_list;
+	void *irq_bypass_data;
 };
 
 #endif /* __LINUX_KVM_IRQFD_H */
@@ -164,6 +164,8 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
 extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+extern int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head,
+					     struct wait_queue_entry *wq_entry);
 extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 
 static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
@@ -82,95 +82,15 @@ TRACE_EVENT(kvm_set_irq,
 	TP_printk("gsi %u level %d source %d",
 		  __entry->gsi, __entry->level, __entry->irq_source_id)
 );
-#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */
 
-#if defined(__KVM_HAVE_IOAPIC)
+#ifdef CONFIG_KVM_IOAPIC
-#define kvm_deliver_mode		\
-	{0x0, "Fixed"},			\
-	{0x1, "LowPrio"},		\
-	{0x2, "SMI"},			\
-	{0x3, "Res3"},			\
-	{0x4, "NMI"},			\
-	{0x5, "INIT"},			\
-	{0x6, "SIPI"},			\
-	{0x7, "ExtINT"}
-
-TRACE_EVENT(kvm_ioapic_set_irq,
-	    TP_PROTO(__u64 e, int pin, bool coalesced),
-	    TP_ARGS(e, pin, coalesced),
-
-	TP_STRUCT__entry(
-		__field(__u64, e)
-		__field(int, pin)
-		__field(bool, coalesced)
-	),
-
-	TP_fast_assign(
-		__entry->e = e;
-		__entry->pin = pin;
-		__entry->coalesced = coalesced;
-	),
-
-	TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
-		  __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
-		  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
-		  (__entry->e & (1<<11)) ? "logical" : "physical",
-		  (__entry->e & (1<<15)) ? "level" : "edge",
-		  (__entry->e & (1<<16)) ? "|masked" : "",
-		  __entry->coalesced ? " (coalesced)" : "")
-);
-
-TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
-	    TP_PROTO(__u64 e),
-	    TP_ARGS(e),
-
-	TP_STRUCT__entry(
-		__field(__u64, e)
-	),
-
-	TP_fast_assign(
-		__entry->e = e;
-	),
-
-	TP_printk("dst %x vec %u (%s|%s|%s%s)",
-		  (u8)(__entry->e >> 56), (u8)__entry->e,
-		  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
-		  (__entry->e & (1<<11)) ? "logical" : "physical",
-		  (__entry->e & (1<<15)) ? "level" : "edge",
-		  (__entry->e & (1<<16)) ? "|masked" : "")
-);
-
-TRACE_EVENT(kvm_msi_set_irq,
-	    TP_PROTO(__u64 address, __u64 data),
-	    TP_ARGS(address, data),
-
-	TP_STRUCT__entry(
-		__field(__u64, address)
-		__field(__u64, data)
-	),
-
-	TP_fast_assign(
-		__entry->address = address;
-		__entry->data = data;
-	),
-
-	TP_printk("dst %llx vec %u (%s|%s|%s%s)",
-		  (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
-		  (u8)__entry->data,
-		  __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
-		  (__entry->address & (1<<2)) ? "logical" : "physical",
-		  (__entry->data & (1<<15)) ? "level" : "edge",
-		  (__entry->address & (1<<3)) ? "|rh" : "")
-);
-
 #define kvm_irqchips						\
 	{KVM_IRQCHIP_PIC_MASTER,	"PIC master"},		\
 	{KVM_IRQCHIP_PIC_SLAVE,		"PIC slave"},		\
 	{KVM_IRQCHIP_IOAPIC,		"IOAPIC"}
 
-#endif /* defined(__KVM_HAVE_IOAPIC) */
-
-#if defined(CONFIG_HAVE_KVM_IRQCHIP)
+#endif /* CONFIG_KVM_IOAPIC */
 
 #ifdef kvm_irqchips
 #define kvm_ack_irq_string "irqchip %s pin %u"
@@ -40,13 +40,31 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
 	unsigned long flags;
 
-	wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+	wq_entry->flags |= WQ_FLAG_PRIORITY;
 	spin_lock_irqsave(&wq_head->lock, flags);
 	__add_wait_queue(wq_head, wq_entry);
 	spin_unlock_irqrestore(&wq_head->lock, flags);
 }
 EXPORT_SYMBOL_GPL(add_wait_queue_priority);
 
+int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head,
+				      struct wait_queue_entry *wq_entry)
+{
+	struct list_head *head = &wq_head->head;
+
+	wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+
+	guard(spinlock_irqsave)(&wq_head->lock);
+
+	if (!list_empty(head) &&
+	    (list_first_entry(head, typeof(*wq_entry), entry)->flags & WQ_FLAG_PRIORITY))
+		return -EBUSY;
+
+	list_add(&wq_entry->entry, head);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(add_wait_queue_priority_exclusive);
+
 void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
 	unsigned long flags;
@@ -64,7 +82,7 @@ EXPORT_SYMBOL(remove_wait_queue);
 * the non-exclusive tasks. Normally, exclusive tasks will be at the end of
 * the list and any non-exclusive tasks will be woken first. A priority task
 * may be at the head of the list, and can consume the event without any other
- * tasks being woken.
+ * tasks being woken if it's also an exclusive task.
 *
 * There are circumstances in which we can try to wake a task which has already
 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
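Illustrative sketch (not part of the diff): how a waiter can use the new helper to claim sole priority-waiter status, which is the property KVM relies on to enforce a 1:1 eventfd:irqfd binding. The example_claim_* wrapper is hypothetical; the kernel APIs used are real.

    #include <linux/wait.h>

    static int example_claim_priority_waiter(struct wait_queue_head *wqh,
                                             struct wait_queue_entry *wait,
                                             wait_queue_func_t func)
    {
        init_waitqueue_func_entry(wait, func);

        /*
         * Returns -EBUSY if another priority waiter is already at the head of
         * the queue, i.e. if someone else already "owns" exclusive delivery.
         */
        return add_wait_queue_priority_exclusive(wqh, wait);
    }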
@@ -59,6 +59,7 @@ TEST_PROGS_x86 += x86/nx_huge_pages_test.sh
 TEST_GEN_PROGS_COMMON = demand_paging_test
 TEST_GEN_PROGS_COMMON += dirty_log_test
 TEST_GEN_PROGS_COMMON += guest_print_test
+TEST_GEN_PROGS_COMMON += irqfd_test
 TEST_GEN_PROGS_COMMON += kvm_binary_stats_test
 TEST_GEN_PROGS_COMMON += kvm_create_max_vcpus
 TEST_GEN_PROGS_COMMON += kvm_page_table_test
@@ -620,18 +620,12 @@ static void kvm_routing_and_irqfd_check(struct kvm_vm *vm,
 	 * that no actual interrupt was injected for those cases.
 	 */
 
-	for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) {
-		fd[f] = eventfd(0, 0);
-		TEST_ASSERT(fd[f] != -1, __KVM_SYSCALL_ERROR("eventfd()", fd[f]));
-	}
+	for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++)
+		fd[f] = kvm_new_eventfd();
 
 	for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) {
-		struct kvm_irqfd irqfd = {
-			.fd = fd[f],
-			.gsi = i - MIN_SPI,
-		};
 		assert(i <= (uint64_t)UINT_MAX);
-		vm_ioctl(vm, KVM_IRQFD, &irqfd);
+		kvm_assign_irqfd(vm, i - MIN_SPI, fd[f]);
 	}
 
 	for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) {
@@ -1,5 +1,6 @@
 CONFIG_KVM=y
 CONFIG_KVM_INTEL=y
 CONFIG_KVM_AMD=y
+CONFIG_EVENTFD=y
 CONFIG_USERFAULTFD=y
 CONFIG_IDLE_PAGE_TRACKING=y
@@ -18,6 +18,7 @@
 #include <asm/atomic.h>
 #include <asm/kvm.h>
 
+#include <sys/eventfd.h>
 #include <sys/ioctl.h>
 
 #include "kvm_util_arch.h"
@@ -502,6 +503,45 @@ static inline int vm_get_stats_fd(struct kvm_vm *vm)
 	return fd;
 }
 
+static inline int __kvm_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd,
+			      uint32_t flags)
+{
+	struct kvm_irqfd irqfd = {
+		.fd = eventfd,
+		.gsi = gsi,
+		.flags = flags,
+		.resamplefd = -1,
+	};
+
+	return __vm_ioctl(vm, KVM_IRQFD, &irqfd);
+}
+
+static inline void kvm_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd,
+			     uint32_t flags)
+{
+	int ret = __kvm_irqfd(vm, gsi, eventfd, flags);
+
+	TEST_ASSERT_VM_VCPU_IOCTL(!ret, KVM_IRQFD, ret, vm);
+}
+
+static inline void kvm_assign_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd)
+{
+	kvm_irqfd(vm, gsi, eventfd, 0);
+}
+
+static inline void kvm_deassign_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd)
+{
+	kvm_irqfd(vm, gsi, eventfd, KVM_IRQFD_FLAG_DEASSIGN);
+}
+
+static inline int kvm_new_eventfd(void)
+{
+	int fd = eventfd(0, 0);
+
+	TEST_ASSERT(fd >= 0, __KVM_SYSCALL_ERROR("eventfd()", fd));
+	return fd;
+}
+
 static inline void read_stats_header(int stats_fd, struct kvm_stats_header *header)
 {
 	ssize_t ret;
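Illustrative sketch (not part of the diff): using the new selftest helpers end to end. The GSI number and the wire_one_irqfd() wrapper are arbitrary choices for the example.

    #include <unistd.h>

    #include "kvm_util.h"

    static void wire_one_irqfd(struct kvm_vm *vm)
    {
        int fd = kvm_new_eventfd();

        kvm_assign_irqfd(vm, 24, fd);     /* bind the eventfd to GSI 24 */
        /* ... signal the eventfd from a device model or another thread ... */
        kvm_deassign_irqfd(vm, 24, fd);   /* unbind before closing */
        close(fd);
    }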
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <errno.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <stdint.h>
+#include <sys/sysinfo.h>
+
+#include "kvm_util.h"
+
+static struct kvm_vm *vm1;
+static struct kvm_vm *vm2;
+static int __eventfd;
+static bool done;
+
+/*
+ * KVM de-assigns based on eventfd *and* GSI, but requires unique eventfds when
+ * assigning (the API isn't symmetrical).  Abuse the oddity and use a per-task
+ * GSI base to avoid false failures due to cross-task de-assign, i.e. so that
+ * the secondary doesn't de-assign the primary's eventfd and cause assign to
+ * unexpectedly succeed on the primary.
+ */
+#define GSI_BASE_PRIMARY	0x20
+#define GSI_BASE_SECONDARY	0x30
+
+static void juggle_eventfd_secondary(struct kvm_vm *vm, int eventfd)
+{
+	int r, i;
+
+	/*
+	 * The secondary task can encounter EBADF since the primary can close
+	 * the eventfd at any time.  And because the primary can recreate the
+	 * eventfd, at the same fd in the file table, the secondary can also
+	 * encounter "unexpected" success, e.g. if the close+recreate happens
+	 * between the first and second assignments.  The secondary's role is
+	 * mostly to antagonize KVM, not to detect bugs.
+	 */
+	for (i = 0; i < 2; i++) {
+		r = __kvm_irqfd(vm, GSI_BASE_SECONDARY, eventfd, 0);
+		TEST_ASSERT(!r || errno == EBUSY || errno == EBADF,
+			    "Wanted success, EBUSY, or EBADF, r = %d, errno = %d",
+			    r, errno);
+
+		/* De-assign should succeed unless the eventfd was closed. */
+		r = __kvm_irqfd(vm, GSI_BASE_SECONDARY + i, eventfd, KVM_IRQFD_FLAG_DEASSIGN);
+		TEST_ASSERT(!r || errno == EBADF,
+			    "De-assign should succeed unless the fd was closed");
+	}
+}
+
+static void *secondary_irqfd_juggler(void *ign)
+{
+	while (!READ_ONCE(done)) {
+		juggle_eventfd_secondary(vm1, READ_ONCE(__eventfd));
+		juggle_eventfd_secondary(vm2, READ_ONCE(__eventfd));
+	}
+
+	return NULL;
+}
+
+static void juggle_eventfd_primary(struct kvm_vm *vm, int eventfd)
+{
+	int r1, r2;
+
+	/*
+	 * At least one of the assigns should fail.  KVM disallows assigning a
+	 * single eventfd to multiple GSIs (or VMs), so it's possible that both
+	 * assignments can fail, too.
+	 */
+	r1 = __kvm_irqfd(vm, GSI_BASE_PRIMARY, eventfd, 0);
+	TEST_ASSERT(!r1 || errno == EBUSY,
+		    "Wanted success or EBUSY, r = %d, errno = %d", r1, errno);
+
+	r2 = __kvm_irqfd(vm, GSI_BASE_PRIMARY + 1, eventfd, 0);
+	TEST_ASSERT(r1 || (r2 && errno == EBUSY),
+		    "Wanted failure (EBUSY), r1 = %d, r2 = %d, errno = %d",
+		    r1, r2, errno);
+
+	/*
+	 * De-assign should always succeed, even if the corresponding assign
+	 * failed.
+	 */
+	kvm_irqfd(vm, GSI_BASE_PRIMARY, eventfd, KVM_IRQFD_FLAG_DEASSIGN);
+	kvm_irqfd(vm, GSI_BASE_PRIMARY + 1, eventfd, KVM_IRQFD_FLAG_DEASSIGN);
+}
+
+int main(int argc, char *argv[])
+{
+	pthread_t racing_thread;
+	int r, i;
+
+	/* Create "full" VMs, as KVM_IRQFD requires an in-kernel IRQ chip. */
+	vm1 = vm_create(1);
+	vm2 = vm_create(1);
+
+	WRITE_ONCE(__eventfd, kvm_new_eventfd());
+
+	kvm_irqfd(vm1, 10, __eventfd, 0);
+
+	r = __kvm_irqfd(vm1, 11, __eventfd, 0);
+	TEST_ASSERT(r && errno == EBUSY,
+		    "Wanted EBUSY, r = %d, errno = %d", r, errno);
+
+	r = __kvm_irqfd(vm2, 12, __eventfd, 0);
+	TEST_ASSERT(r && errno == EBUSY,
+		    "Wanted EBUSY, r = %d, errno = %d", r, errno);
+
+	/*
+	 * De-assign all eventfds, along with multiple eventfds that were never
+	 * assigned.  KVM's ABI is that de-assign is allowed so long as the
+	 * eventfd itself is valid.
+	 */
+	kvm_irqfd(vm1, 11, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
+	kvm_irqfd(vm1, 12, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
+	kvm_irqfd(vm1, 13, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
+	kvm_irqfd(vm1, 14, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
+	kvm_irqfd(vm1, 10, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
+
+	close(__eventfd);
+
+	pthread_create(&racing_thread, NULL, secondary_irqfd_juggler, vm2);
+
+	for (i = 0; i < 10000; i++) {
+		WRITE_ONCE(__eventfd, kvm_new_eventfd());
+
+		juggle_eventfd_primary(vm1, __eventfd);
+		juggle_eventfd_primary(vm2, __eventfd);
+		close(__eventfd);
+	}
+
+	WRITE_ONCE(done, true);
+	pthread_join(racing_thread, NULL);
+}
@@ -1716,7 +1716,18 @@ void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa)
 /* Create an interrupt controller chip for the specified VM. */
 void vm_create_irqchip(struct kvm_vm *vm)
 {
-	vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL);
+	int r;
+
+	/*
+	 * Allocate a fully in-kernel IRQ chip by default, but fall back to a
+	 * split model (x86 only) if that fails (KVM x86 allows compiling out
+	 * support for KVM_CREATE_IRQCHIP).
+	 */
+	r = __vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL);
+	if (r && errno == ENOTTY && kvm_has_cap(KVM_CAP_SPLIT_IRQCHIP))
+		vm_enable_cap(vm, KVM_CAP_SPLIT_IRQCHIP, 24);
+	else
+		TEST_ASSERT_VM_VCPU_IOCTL(!r, KVM_CREATE_IRQCHIP, r, vm);
 
 	vm->has_irqchip = true;
 }
@@ -547,15 +547,9 @@ int main(int argc, char *argv[])
 	int irq_fd[2] = { -1, -1 };
 
 	if (do_eventfd_tests) {
-		irq_fd[0] = eventfd(0, 0);
-		irq_fd[1] = eventfd(0, 0);
+		irq_fd[0] = kvm_new_eventfd();
+		irq_fd[1] = kvm_new_eventfd();
 
-		/* Unexpected, but not a KVM failure */
-		if (irq_fd[0] == -1 || irq_fd[1] == -1)
-			do_evtchn_tests = do_eventfd_tests = false;
-	}
-
-	if (do_eventfd_tests) {
 		irq_routes.info.nr = 2;
 
 		irq_routes.entries[0].gsi = 32;
@@ -572,15 +566,8 @@ int main(int argc, char *argv[])
 
 		vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes.info);
 
-		struct kvm_irqfd ifd = { };
-
-		ifd.fd = irq_fd[0];
-		ifd.gsi = 32;
-		vm_ioctl(vm, KVM_IRQFD, &ifd);
-
-		ifd.fd = irq_fd[1];
-		ifd.gsi = 33;
-		vm_ioctl(vm, KVM_IRQFD, &ifd);
+		kvm_assign_irqfd(vm, 32, irq_fd[0]);
+		kvm_assign_irqfd(vm, 33, irq_fd[1]);
 
 		struct sigaction sa = { };
 		sa.sa_handler = handle_alrm;
@@ -204,6 +204,11 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 	int ret = 0;
 
 	if (flags & EPOLLIN) {
+		/*
+		 * WARNING: Do NOT take irqfds.lock in any path except EPOLLHUP,
+		 * as KVM holds irqfds.lock when registering the irqfd with the
+		 * eventfd.
+		 */
 		u64 cnt;
 		eventfd_ctx_do_read(irqfd->eventfd, &cnt);
 
@@ -225,6 +230,11 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 		/* The eventfd is closing, detach from KVM */
 		unsigned long iflags;
 
+		/*
+		 * Taking irqfds.lock is safe here, as KVM holds a reference to
+		 * the eventfd when registering the irqfd, i.e. this path can't
+		 * be reached while kvm_irqfd_add() is running.
+		 */
 		spin_lock_irqsave(&kvm->irqfds.lock, iflags);
 
 		/*
@@ -245,22 +255,14 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 	return ret;
 }
 
-static void
-irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
-			poll_table *pt)
-{
-	struct kvm_kernel_irqfd *irqfd =
-		container_of(pt, struct kvm_kernel_irqfd, pt);
-	add_wait_queue_priority(wqh, &irqfd->wait);
-}
-
-/* Must be called under irqfds.lock */
 static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
 {
 	struct kvm_kernel_irq_routing_entry *e;
 	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
 	int n_entries;
 
+	lockdep_assert_held(&kvm->irqfds.lock);
+
 	n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);
 
 	write_seqcount_begin(&irqfd->irq_entry_sc);
@@ -274,6 +276,63 @@ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
 	write_seqcount_end(&irqfd->irq_entry_sc);
 }
 
+struct kvm_irqfd_pt {
+	struct kvm_kernel_irqfd *irqfd;
+	struct kvm *kvm;
+	poll_table pt;
+	int ret;
+};
+
+static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh,
+			       poll_table *pt)
+{
+	struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt);
+	struct kvm_kernel_irqfd *irqfd = p->irqfd;
+	struct kvm *kvm = p->kvm;
+
+	/*
+	 * Note, irqfds.lock protects the irqfd's irq_entry, i.e. its routing,
+	 * and irqfds.items.  It does NOT protect registering with the eventfd.
+	 */
+	spin_lock_irq(&kvm->irqfds.lock);
+
+	/*
+	 * Initialize the routing information prior to adding the irqfd to the
+	 * eventfd's waitqueue, as irqfd_wakeup() can be invoked as soon as the
+	 * irqfd is registered.
+	 */
+	irqfd_update(kvm, irqfd);
+
+	/*
+	 * Add the irqfd as a priority waiter on the eventfd, with a custom
+	 * wake-up handler, so that KVM *and only KVM* is notified whenever the
+	 * underlying eventfd is signaled.
+	 */
+	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
+
+	/*
+	 * Temporarily lie to lockdep about holding irqfds.lock to avoid a
+	 * false positive regarding potential deadlock with irqfd_wakeup()
+	 * (see irqfd_wakeup() for details).
+	 *
+	 * Adding to the wait queue will fail if there is already a priority
+	 * waiter, i.e. if the eventfd is associated with another irqfd (in any
+	 * VM).  Note, kvm_irqfd_deassign() waits for all in-flight shutdown
+	 * jobs to complete, i.e. ensures the irqfd has been removed from the
+	 * eventfd's waitqueue before returning to userspace.
+	 */
+	spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_);
+	p->ret = add_wait_queue_priority_exclusive(wqh, &irqfd->wait);
+	spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_);
+	if (p->ret)
+		goto out;
+
+	list_add_tail(&irqfd->list, &kvm->irqfds.items);
+
+out:
+	spin_unlock_irq(&kvm->irqfds.lock);
+}
+
 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
 void __attribute__((weak)) kvm_arch_irq_bypass_stop(
 				struct irq_bypass_consumer *cons)
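Illustrative sketch (not part of the diff): the generic vfs_poll()+poll_table trick that kvm_irqfd_register() builds on, reduced to its skeleton. struct example_pt and the callback are hypothetical names; init_poll_funcptr() and vfs_poll() are the real kernel APIs.

    #include <linux/poll.h>
    #include <linux/fs.h>

    struct example_pt {
        poll_table pt;
        int ret;
    };

    /* Invoked synchronously from vfs_poll() with the file's waitqueue in hand. */
    static void example_register_cb(struct file *file, wait_queue_head_t *wqh,
                                    poll_table *pt)
    {
        struct example_pt *p = container_of(pt, struct example_pt, pt);

        /* Add a custom waiter to @wqh here; record success/failure for the caller. */
        p->ret = 0;
    }

    static __poll_t example_poll(struct file *f)
    {
        struct example_pt p = { .ret = 0 };

        init_poll_funcptr(&p.pt, example_register_cb);
        return vfs_poll(f, &p.pt);
    }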
@@ -285,26 +344,20 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start(
 {
 }
 
-int __attribute__((weak)) kvm_arch_update_irqfd_routing(
-				struct kvm *kvm, unsigned int host_irq,
-				uint32_t guest_irq, bool set)
+void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+					  struct kvm_kernel_irq_routing_entry *old,
+					  struct kvm_kernel_irq_routing_entry *new)
 {
-	return 0;
-}
-
-bool __attribute__((weak)) kvm_arch_irqfd_route_changed(
-				struct kvm_kernel_irq_routing_entry *old,
-				struct kvm_kernel_irq_routing_entry *new)
-{
-	return true;
 }
 #endif
 
 static int
 kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 {
-	struct kvm_kernel_irqfd *irqfd, *tmp;
+	struct kvm_kernel_irqfd *irqfd;
 	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
+	struct kvm_irqfd_pt irqfd_pt;
 	int ret;
 	__poll_t events;
 	int idx;
@@ -390,57 +443,54 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	}
 
 	/*
-	 * Install our own custom wake-up handling so we are notified via
-	 * a callback whenever someone signals the underlying eventfd
+	 * Set the irqfd routing and add it to KVM's list before registering
+	 * the irqfd with the eventfd, so that the routing information is valid
+	 * and stays valid, e.g. if there are GSI routing changes, prior to
+	 * making the irqfd visible, i.e. before it might be signaled.
+	 *
+	 * Note, holding SRCU ensures a stable read of routing information, and
+	 * also prevents irqfd_shutdown() from freeing the irqfd before it's
+	 * fully initialized.
 	 */
-	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
-	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
-
-	spin_lock_irq(&kvm->irqfds.lock);
-
-	ret = 0;
-	list_for_each_entry(tmp, &kvm->irqfds.items, list) {
-		if (irqfd->eventfd != tmp->eventfd)
-			continue;
-		/* This fd is used for another irq already. */
-		ret = -EBUSY;
-		spin_unlock_irq(&kvm->irqfds.lock);
-		goto fail;
-	}
-
 	idx = srcu_read_lock(&kvm->irq_srcu);
-	irqfd_update(kvm, irqfd);
-
-	list_add_tail(&irqfd->list, &kvm->irqfds.items);
-
-	spin_unlock_irq(&kvm->irqfds.lock);
 
 	/*
-	 * Check if there was an event already pending on the eventfd
-	 * before we registered, and trigger it as if we didn't miss it.
+	 * Register the irqfd with the eventfd by polling on the eventfd, and
+	 * simultaneously add the irqfd to KVM's list.  If there was an event
+	 * pending on the eventfd prior to registering, manually trigger IRQ
+	 * injection.
 	 */
-	events = vfs_poll(fd_file(f), &irqfd->pt);
+	irqfd_pt.irqfd = irqfd;
+	irqfd_pt.kvm = kvm;
+	init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register);
+
+	events = vfs_poll(fd_file(f), &irqfd_pt.pt);
+
+	ret = irqfd_pt.ret;
+	if (ret)
+		goto fail_poll;
 
 	if (events & EPOLLIN)
 		schedule_work(&irqfd->inject);
 
 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
 	if (kvm_arch_has_irq_bypass()) {
-		irqfd->consumer.token = (void *)irqfd->eventfd;
 		irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
 		irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
 		irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
 		irqfd->consumer.start = kvm_arch_irq_bypass_start;
-		ret = irq_bypass_register_consumer(&irqfd->consumer);
+		ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd);
 		if (ret)
-			pr_info("irq bypass consumer (token %p) registration fails: %d\n",
-				irqfd->consumer.token, ret);
+			pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n",
+				irqfd->eventfd, ret);
 	}
 #endif
 
 	srcu_read_unlock(&kvm->irq_srcu, idx);
 	return 0;
 
+fail_poll:
+	srcu_read_unlock(&kvm->irq_srcu, idx);
 fail:
 	if (irqfd->resampler)
 		irqfd_resampler_shutdown(irqfd);
@@ -617,13 +667,8 @@ void kvm_irq_routing_update(struct kvm *kvm)
 		irqfd_update(kvm, irqfd);
 
 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
-		if (irqfd->producer &&
-		    kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
-			int ret = kvm_arch_update_irqfd_routing(
-					irqfd->kvm, irqfd->producer->irq,
-					irqfd->gsi, 1);
-			WARN_ON(ret);
-		}
+		if (irqfd->producer)
+			kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry);
 #endif
 	}
@@ -222,8 +222,6 @@ int kvm_set_irq_routing(struct kvm *kvm,
 	kvm_arch_irq_routing_update(kvm);
 	mutex_unlock(&kvm->irq_lock);
 
-	kvm_arch_post_irq_routing_update(kvm);
-
 	synchronize_srcu_expedited(&kvm->irq_srcu);
 
 	new = old;
@ -22,8 +22,8 @@
|
||||||
MODULE_LICENSE("GPL v2");
|
MODULE_LICENSE("GPL v2");
|
||||||
MODULE_DESCRIPTION("IRQ bypass manager utility module");
|
MODULE_DESCRIPTION("IRQ bypass manager utility module");
|
||||||
|
|
||||||
static LIST_HEAD(producers);
|
static DEFINE_XARRAY(producers);
|
||||||
static LIST_HEAD(consumers);
|
static DEFINE_XARRAY(consumers);
|
||||||
static DEFINE_MUTEX(lock);
|
static DEFINE_MUTEX(lock);
|
||||||
|
|
||||||
/* @lock must be held when calling connect */
|
/* @lock must be held when calling connect */
|
||||||
|
|
@@ -51,6 +51,10 @@ static int __connect(struct irq_bypass_producer *prod,
 	if (prod->start)
 		prod->start(prod);
 
+	if (!ret) {
+		prod->consumer = cons;
+		cons->producer = prod;
+	}
 	return ret;
 }
 
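Note (illustrative sketch only): the lines added to __connect() record the producer/consumer pairing only when the connection actually succeeded. The payoff shows up in the unregister paths in the later hunks, which can reach the peer through the back-pointer instead of searching the opposite registry, roughly in this direction of use:

static void example_teardown(struct irq_bypass_producer *producer)
{
	/* Follow the back-pointer recorded by __connect() on success. */
	if (producer->consumer)
		__disconnect(producer, producer->consumer);
}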
@@ -72,56 +76,49 @@ static void __disconnect(struct irq_bypass_producer *prod,
 		cons->start(cons);
 	if (prod->start)
 		prod->start(prod);
+
+	prod->consumer = NULL;
+	cons->producer = NULL;
 }
 
 /**
  * irq_bypass_register_producer - register IRQ bypass producer
  * @producer: pointer to producer structure
+ * @eventfd: pointer to the eventfd context associated with the producer
+ * @irq: Linux IRQ number of the underlying producer device
  *
- * Add the provided IRQ producer to the list of producers and connect
- * with any matching token found on the IRQ consumers list.
+ * Add the provided IRQ producer to the set of producers and connect with the
+ * consumer with a matching eventfd, if one exists.
  */
-int irq_bypass_register_producer(struct irq_bypass_producer *producer)
+int irq_bypass_register_producer(struct irq_bypass_producer *producer,
+				 struct eventfd_ctx *eventfd, int irq)
 {
-	struct irq_bypass_producer *tmp;
+	unsigned long index = (unsigned long)eventfd;
 	struct irq_bypass_consumer *consumer;
 	int ret;
 
-	if (!producer->token)
+	if (WARN_ON_ONCE(producer->eventfd))
 		return -EINVAL;
 
-	might_sleep();
+	producer->irq = irq;
 
-	if (!try_module_get(THIS_MODULE))
-		return -ENODEV;
+	guard(mutex)(&lock);
 
-	mutex_lock(&lock);
+	ret = xa_insert(&producers, index, producer, GFP_KERNEL);
+	if (ret)
+		return ret;
 
-	list_for_each_entry(tmp, &producers, node) {
-		if (tmp->token == producer->token) {
-			ret = -EBUSY;
-			goto out_err;
+	consumer = xa_load(&consumers, index);
+	if (consumer) {
+		ret = __connect(producer, consumer);
+		if (ret) {
+			WARN_ON_ONCE(xa_erase(&producers, index) != producer);
+			return ret;
 		}
 	}
 
-	list_for_each_entry(consumer, &consumers, node) {
-		if (consumer->token == producer->token) {
-			ret = __connect(producer, consumer);
-			if (ret)
-				goto out_err;
-			break;
-		}
-	}
-
-	list_add(&producer->node, &producers);
-
-	mutex_unlock(&lock);
-
+	producer->eventfd = eventfd;
 	return 0;
-out_err:
-	mutex_unlock(&lock);
-	module_put(THIS_MODULE);
-	return ret;
 }
 EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
 
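Note (hedged usage sketch, not from the patch): a producer-side caller now hands the eventfd context and the Linux IRQ number straight to the registration call, instead of pre-populating the token (and, with this change, the irq) fields on the structure. The my_* names below are invented for illustration:

static struct irq_bypass_producer my_producer;	/* illustration only */

static int my_enable_bypass(struct eventfd_ctx *trigger, int irq)
{
	int ret = irq_bypass_register_producer(&my_producer, trigger, irq);

	if (ret)
		pr_info("irq bypass producer registration failed: %d\n", ret);
	return ret;
}

static void my_disable_bypass(void)
{
	/* Safe even if the registration above failed. */
	irq_bypass_unregister_producer(&my_producer);
}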
@@ -129,95 +126,65 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
  * irq_bypass_unregister_producer - unregister IRQ bypass producer
  * @producer: pointer to producer structure
  *
- * Remove a previously registered IRQ producer from the list of producers
- * and disconnect it from any connected IRQ consumer.
+ * Remove a previously registered IRQ producer (note, it's safe to call this
+ * even if registration was unsuccessful).  Disconnect from the associated
+ * consumer, if one exists.
  */
 void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
 {
-	struct irq_bypass_producer *tmp;
-	struct irq_bypass_consumer *consumer;
+	unsigned long index = (unsigned long)producer->eventfd;
 
-	if (!producer->token)
+	if (!producer->eventfd)
 		return;
 
-	might_sleep();
+	guard(mutex)(&lock);
 
-	if (!try_module_get(THIS_MODULE))
-		return; /* nothing in the list anyway */
+	if (producer->consumer)
+		__disconnect(producer, producer->consumer);
 
-	mutex_lock(&lock);
-
-	list_for_each_entry(tmp, &producers, node) {
-		if (tmp->token != producer->token)
-			continue;
-
-		list_for_each_entry(consumer, &consumers, node) {
-			if (consumer->token == producer->token) {
-				__disconnect(producer, consumer);
-				break;
-			}
-		}
-
-		list_del(&producer->node);
-		module_put(THIS_MODULE);
-		break;
-	}
-
-	mutex_unlock(&lock);
-
-	module_put(THIS_MODULE);
+	WARN_ON_ONCE(xa_erase(&producers, index) != producer);
+	producer->eventfd = NULL;
 }
 EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);
 
 /**
  * irq_bypass_register_consumer - register IRQ bypass consumer
  * @consumer: pointer to consumer structure
+ * @eventfd: pointer to the eventfd context associated with the consumer
  *
- * Add the provided IRQ consumer to the list of consumers and connect
- * with any matching token found on the IRQ producer list.
+ * Add the provided IRQ consumer to the set of consumers and connect with the
+ * producer with a matching eventfd, if one exists.
  */
-int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
+int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer,
+				 struct eventfd_ctx *eventfd)
 {
-	struct irq_bypass_consumer *tmp;
+	unsigned long index = (unsigned long)eventfd;
 	struct irq_bypass_producer *producer;
 	int ret;
 
-	if (!consumer->token ||
-	    !consumer->add_producer || !consumer->del_producer)
+	if (WARN_ON_ONCE(consumer->eventfd))
 		return -EINVAL;
 
-	might_sleep();
+	if (!consumer->add_producer || !consumer->del_producer)
+		return -EINVAL;
 
-	if (!try_module_get(THIS_MODULE))
-		return -ENODEV;
+	guard(mutex)(&lock);
 
-	mutex_lock(&lock);
+	ret = xa_insert(&consumers, index, consumer, GFP_KERNEL);
+	if (ret)
+		return ret;
 
-	list_for_each_entry(tmp, &consumers, node) {
-		if (tmp->token == consumer->token || tmp == consumer) {
-			ret = -EBUSY;
-			goto out_err;
+	producer = xa_load(&producers, index);
+	if (producer) {
+		ret = __connect(producer, consumer);
+		if (ret) {
+			WARN_ON_ONCE(xa_erase(&consumers, index) != consumer);
+			return ret;
 		}
 	}
 
-	list_for_each_entry(producer, &producers, node) {
-		if (producer->token == consumer->token) {
-			ret = __connect(producer, consumer);
-			if (ret)
-				goto out_err;
-			break;
-		}
-	}
-
-	list_add(&consumer->node, &consumers);
-
-	mutex_unlock(&lock);
-
+	consumer->eventfd = eventfd;
 	return 0;
-out_err:
-	mutex_unlock(&lock);
-	module_put(THIS_MODULE);
-	return ret;
 }
 EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
 
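Note (illustration with invented names): the mutex_lock()/mutex_unlock() pairs and the out_err: unwind labels disappear because guard(mutex)(&lock) uses the kernel's scope-based cleanup machinery from linux/cleanup.h: the mutex is released automatically when execution leaves the enclosing scope, so the early "return ret;" statements need no explicit unlock. A minimal sketch:

static DEFINE_MUTEX(example_lock);

static int example_locked_op(bool busy)
{
	guard(mutex)(&example_lock);	/* unlocked automatically on return */

	if (busy)
		return -EBUSY;		/* no explicit mutex_unlock() needed */

	return 0;
}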
@@ -225,42 +192,23 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
  * irq_bypass_unregister_consumer - unregister IRQ bypass consumer
  * @consumer: pointer to consumer structure
  *
- * Remove a previously registered IRQ consumer from the list of consumers
- * and disconnect it from any connected IRQ producer.
+ * Remove a previously registered IRQ consumer (note, it's safe to call this
+ * even if registration was unsuccessful).  Disconnect from the associated
+ * producer, if one exists.
  */
 void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
 {
-	struct irq_bypass_consumer *tmp;
-	struct irq_bypass_producer *producer;
+	unsigned long index = (unsigned long)consumer->eventfd;
 
-	if (!consumer->token)
+	if (!consumer->eventfd)
 		return;
 
-	might_sleep();
+	guard(mutex)(&lock);
 
-	if (!try_module_get(THIS_MODULE))
-		return; /* nothing in the list anyway */
+	if (consumer->producer)
+		__disconnect(consumer->producer, consumer);
 
-	mutex_lock(&lock);
-
-	list_for_each_entry(tmp, &consumers, node) {
-		if (tmp != consumer)
-			continue;
-
-		list_for_each_entry(producer, &producers, node) {
-			if (producer->token == consumer->token) {
-				__disconnect(producer, consumer);
-				break;
-			}
-		}
-
-		list_del(&consumer->node);
-		module_put(THIS_MODULE);
-		break;
-	}
-
-	mutex_unlock(&lock);
-
-	module_put(THIS_MODULE);
+	WARN_ON_ONCE(xa_erase(&consumers, index) != consumer);
+	consumer->eventfd = NULL;
 }
 EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);
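Note (illustrative sketch with an invented example_consumer): the reworded kernel-doc above is backed by the code, since ->eventfd is assigned only after registration fully succeeds and is cleared again on unregistration, the unregister helpers degrade to a no-op for a structure that never registered or has already been unregistered:

static struct irq_bypass_consumer example_consumer;	/* illustration only */

static void example_cleanup(void)
{
	/* No-op if registration never happened or failed... */
	irq_bypass_unregister_consumer(&example_consumer);
	/* ...and calling it a second time is equally harmless. */
	irq_bypass_unregister_consumer(&example_consumer);
}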