summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/arm64/kvm/arm.c20
-rw-r--r--arch/arm64/kvm/vgic/vgic-its.c2
-rw-r--r--arch/arm64/kvm/vgic/vgic-v4.c10
-rw-r--r--arch/x86/include/asm/irq_remapping.h17
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h45
-rw-r--r--arch/x86/include/asm/svm.h13
-rw-r--r--arch/x86/kvm/Kconfig10
-rw-r--r--arch/x86/kvm/Makefile7
-rw-r--r--arch/x86/kvm/hyperv.c10
-rw-r--r--arch/x86/kvm/hyperv.h3
-rw-r--r--arch/x86/kvm/i8254.c90
-rw-r--r--arch/x86/kvm/i8254.h17
-rw-r--r--arch/x86/kvm/i8259.c17
-rw-r--r--arch/x86/kvm/ioapic.c55
-rw-r--r--arch/x86/kvm/ioapic.h24
-rw-r--r--arch/x86/kvm/irq.c567
-rw-r--r--arch/x86/kvm/irq.h35
-rw-r--r--arch/x86/kvm/irq_comm.c469
-rw-r--r--arch/x86/kvm/lapic.c7
-rw-r--r--arch/x86/kvm/svm/avic.c688
-rw-r--r--arch/x86/kvm/svm/svm.c4
-rw-r--r--arch/x86/kvm/svm/svm.h32
-rw-r--r--arch/x86/kvm/trace.h99
-rw-r--r--arch/x86/kvm/vmx/capabilities.h1
-rw-r--r--arch/x86/kvm/vmx/main.c2
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c140
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h10
-rw-r--r--arch/x86/kvm/vmx/vmx.c2
-rw-r--r--arch/x86/kvm/x86.c254
-rw-r--r--drivers/hv/mshv_eventfd.c8
-rw-r--r--drivers/iommu/amd/amd_iommu_types.h1
-rw-r--r--drivers/iommu/amd/iommu.c125
-rw-r--r--drivers/iommu/intel/irq_remapping.c10
-rw-r--r--drivers/irqchip/irq-gic-v4.c4
-rw-r--r--drivers/vfio/pci/vfio_pci_intrs.c10
-rw-r--r--drivers/vhost/vdpa.c10
-rw-r--r--include/kvm/arm_vgic.h2
-rw-r--r--include/linux/amd-iommu.h25
-rw-r--r--include/linux/irqbypass.h46
-rw-r--r--include/linux/irqchip/arm-gic-v4.h2
-rw-r--r--include/linux/kvm_host.h18
-rw-r--r--include/linux/kvm_irqfd.h5
-rw-r--r--include/linux/wait.h2
-rw-r--r--include/trace/events/kvm.h84
-rw-r--r--kernel/sched/wait.c22
-rw-r--r--tools/testing/selftests/kvm/Makefile.kvm1
-rw-r--r--tools/testing/selftests/kvm/arm64/vgic_irq.c12
-rw-r--r--tools/testing/selftests/kvm/config1
-rw-r--r--tools/testing/selftests/kvm/include/kvm_util.h40
-rw-r--r--tools/testing/selftests/kvm/irqfd_test.c135
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util.c13
-rw-r--r--tools/testing/selftests/kvm/x86/xen_shinfo_test.c21
-rw-r--r--virt/kvm/eventfd.c159
-rw-r--r--virt/kvm/irqchip.c2
-rw-r--r--virt/lib/irqbypass.c190
56 files changed, 1841 insertions, 1759 deletions
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 23dd3f3fc3eb..f946926716b0 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -2765,19 +2765,15 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq);
}
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
- struct kvm_kernel_irq_routing_entry *new)
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
{
- if (old->type != KVM_IRQ_ROUTING_MSI ||
- new->type != KVM_IRQ_ROUTING_MSI)
- return true;
-
- return memcmp(&old->msi, &new->msi, sizeof(new->msi));
-}
+ if (old->type == KVM_IRQ_ROUTING_MSI &&
+ new->type == KVM_IRQ_ROUTING_MSI &&
+ !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
+ return;
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
/*
* Remapping the vLPI requires taking the its_lock mutex to resolve
* the new translation. We're in spinlock land at this point, so no
@@ -2785,7 +2781,7 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
*
* Unmap the vLPI and fall back to software LPI injection.
*/
- return kvm_vgic_v4_unset_forwarding(kvm, host_irq);
+ return kvm_vgic_v4_unset_forwarding(irqfd->kvm, irqfd->producer->irq);
}
void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 534049c7c94b..98630dae910d 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -758,7 +758,7 @@ static void its_free_ite(struct kvm *kvm, struct its_ite *ite)
if (irq) {
scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
if (irq->hw)
- WARN_ON(its_unmap_vlpi(ite->irq->host_irq));
+ its_unmap_vlpi(ite->irq->host_irq);
irq->hw = false;
}
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 193946108192..ef3481963122 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -527,28 +527,26 @@ static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq)
return NULL;
}
-int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
+void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
{
struct vgic_irq *irq;
unsigned long flags;
- int ret = 0;
if (!vgic_supports_direct_msis(kvm))
- return 0;
+ return;
irq = __vgic_host_irq_get_vlpi(kvm, host_irq);
if (!irq)
- return 0;
+ return;
raw_spin_lock_irqsave(&irq->irq_lock, flags);
WARN_ON(irq->hw && irq->host_irq != host_irq);
if (irq->hw) {
atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
irq->hw = false;
- ret = its_unmap_vlpi(host_irq);
+ its_unmap_vlpi(host_irq);
}
raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(kvm, irq);
- return ret;
}
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 5036f13ab69f..5a0d42464d44 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -26,7 +26,22 @@ enum {
IRQ_REMAP_X2APIC_MODE,
};
-struct vcpu_data {
+/*
+ * This is mainly used to communicate information back-and-forth
+ * between SVM and IOMMU for setting up and tearing down posted
+ * interrupt
+ */
+struct amd_iommu_pi_data {
+ u64 vapic_addr; /* Physical address of the vCPU's vAPIC. */
+ u32 ga_tag;
+ u32 vector; /* Guest vector of the interrupt */
+ int cpu;
+ bool ga_log_intr;
+ bool is_guest_mode;
+ void *ir_data;
+};
+
+struct intel_iommu_pi_data {
u64 pi_desc_addr; /* Physical address of PI Descriptor */
u32 vector; /* Guest vector of the interrupt */
};
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 8d50e3e0a19b..8897f509860c 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -112,7 +112,7 @@ KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
KVM_X86_OP_OPTIONAL(vcpu_blocking)
KVM_X86_OP_OPTIONAL(vcpu_unblocking)
KVM_X86_OP_OPTIONAL(pi_update_irte)
-KVM_X86_OP_OPTIONAL(pi_start_assignment)
+KVM_X86_OP_OPTIONAL(pi_start_bypass)
KVM_X86_OP_OPTIONAL(apicv_pre_state_restore)
KVM_X86_OP_OPTIONAL(apicv_post_state_restore)
KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f7af967aa16f..69d845824bf2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -297,6 +297,7 @@ enum x86_intercept_stage;
*/
#define KVM_APIC_PV_EOI_PENDING 1
+struct kvm_kernel_irqfd;
struct kvm_kernel_irq_routing_entry;
/*
@@ -1320,6 +1321,12 @@ enum kvm_apicv_inhibit {
*/
APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
+ /*
+ * AVIC is disabled because the vCPU's APIC ID is beyond the max
+ * supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable.
+ */
+ APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG,
+
NR_APICV_INHIBIT_REASONS,
};
@@ -1338,7 +1345,8 @@ enum kvm_apicv_inhibit {
__APICV_INHIBIT_REASON(IRQWIN), \
__APICV_INHIBIT_REASON(PIT_REINJ), \
__APICV_INHIBIT_REASON(SEV), \
- __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED)
+ __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED), \
+ __APICV_INHIBIT_REASON(PHYSICAL_ID_TOO_BIG)
struct kvm_arch {
unsigned long n_used_mmu_pages;
@@ -1381,9 +1389,13 @@ struct kvm_arch {
atomic_t noncoherent_dma_count;
#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
atomic_t assigned_device_count;
+ unsigned long nr_possible_bypass_irqs;
+
+#ifdef CONFIG_KVM_IOAPIC
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
+#endif
atomic_t vapics_in_nmi_mode;
struct mutex apic_map_lock;
struct kvm_apic_map __rcu *apic_map;
@@ -1403,7 +1415,6 @@ struct kvm_arch {
bool pause_in_guest;
bool cstate_in_guest;
- unsigned long irq_sources_bitmap;
s64 kvmclock_offset;
/*
@@ -1432,9 +1443,6 @@ struct kvm_arch {
struct delayed_work kvmclock_update_work;
struct delayed_work kvmclock_sync_work;
- /* reads protected by irq_srcu, writes by irq_lock */
- struct hlist_head mask_notifier_list;
-
#ifdef CONFIG_KVM_HYPERV
struct kvm_hv hyperv;
#endif
@@ -1853,9 +1861,10 @@ struct kvm_x86_ops {
void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
- int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
- void (*pi_start_assignment)(struct kvm *kvm);
+ int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
+ void (*pi_start_bypass)(struct kvm *kvm);
void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu);
void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
@@ -1950,6 +1959,7 @@ struct kvm_arch_async_pf {
extern u32 __read_mostly kvm_nr_uret_msrs;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern bool __read_mostly enable_apicv;
+extern bool __read_mostly enable_ipiv;
extern bool __read_mostly enable_device_posted_irqs;
extern struct kvm_x86_ops kvm_x86_ops;
@@ -2044,19 +2054,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
-struct kvm_irq_mask_notifier {
- void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
- int irq;
- struct hlist_node link;
-};
-
-void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn);
-void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn);
-void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
- bool mask);
-
extern bool tdp_enabled;
u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
@@ -2215,9 +2212,6 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
return !!(*irq_state);
}
-int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
-void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
-
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
@@ -2394,9 +2388,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
-void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
- struct kvm_lapic_irq *irq);
-
static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
{
/* We can only post Fixed and LowPrio IRQs */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index ad954a1a6656..ffc27f676243 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -252,16 +252,21 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+/*
+ * GA_LOG_INTR is a synthetic flag that's never propagated to hardware-visible
+ * tables. GA_LOG_INTR is set if the vCPU needs device posted IRQs to generate
+ * GA log interrupts to wake the vCPU (because it's blocking or about to block).
+ */
+#define AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR BIT_ULL(61)
+
#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
-#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK GENMASK_ULL(51, 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
#define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFFULL)
#define AVIC_DOORBELL_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
-#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-
#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
@@ -290,8 +295,6 @@ enum avic_ipi_failure_cause {
static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID);
static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID);
-#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
-
#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0)
#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3)
#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2eeffcec5382..2c86673155c9 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -166,6 +166,16 @@ config KVM_AMD_SEV
Encrypted State (SEV-ES), and Secure Encrypted Virtualization with
Secure Nested Paging (SEV-SNP) technologies on AMD processors.
+config KVM_IOAPIC
+ bool "I/O APIC, PIC, and PIT emulation"
+ default y
+ depends on KVM
+ help
+ Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e.
+ for full in-kernel APIC emulation.
+
+ If unsure, say Y.
+
config KVM_SMM
bool "System Management Mode emulation"
default y
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index a5d362c7b504..c4b8950c7abe 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,12 +5,11 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror
include $(srctree)/virt/kvm/Makefile.kvm
-kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
- i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
- debugfs.o mmu/mmu.o mmu/page_track.o \
- mmu/spte.o
+kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \
+ debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
+kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o
kvm-$(CONFIG_KVM_HYPERV) += hyperv.o
kvm-$(CONFIG_KVM_XEN) += xen.o
kvm-$(CONFIG_KVM_SMM) += smm.o
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index ee27064dd72f..72b19a88a776 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -497,15 +497,19 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
return ret;
}
-int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
struct kvm_vcpu_hv_synic *synic;
- synic = synic_get(kvm, vpidx);
+ if (!level)
+ return -1;
+
+ synic = synic_get(kvm, e->hv_sint.vcpu);
if (!synic)
return -EINVAL;
- return synic_set_irq(synic, sint);
+ return synic_set_irq(synic, e->hv_sint.sint);
}
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 913bfc96959c..6ce160ffa678 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -103,7 +103,8 @@ static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
void kvm_hv_irq_routing_update(struct kvm *kvm);
-int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 739aa6c0d0c3..d1b79b418c05 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -248,8 +248,8 @@ static void pit_do_work(struct kthread_work *work)
if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
return;
- kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false);
- kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false);
+ kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 1, false);
+ kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 0, false);
/*
* Provides NMI watchdog support via Virtual Wire mode.
@@ -288,7 +288,7 @@ static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
atomic_set(&pit->pit_state.irq_ack, 1);
}
-void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
+static void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
{
struct kvm_kpit_state *ps = &pit->pit_state;
struct kvm *kvm = pit->kvm;
@@ -400,8 +400,8 @@ static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
}
}
-void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
- int hpet_legacy_start)
+static void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start)
{
u8 saved_mode;
@@ -649,6 +649,79 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
kvm_pit_reset_reinject(pit);
}
+int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+
+ BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+
+ mutex_lock(&kps->lock);
+ memcpy(ps, &kps->channels, sizeof(*ps));
+ mutex_unlock(&kps->lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int i;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+ sizeof(ps->channels));
+ ps->flags = kvm->arch.vpit->pit_state.flags;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ memset(&ps->reserved, 0, sizeof(ps->reserved));
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int start = 0;
+ int i;
+ u32 prev_legacy, cur_legacy;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ if (!prev_legacy && cur_legacy)
+ start = 1;
+ memcpy(&pit->pit_state.channels, &ps->channels,
+ sizeof(pit->pit_state.channels));
+ pit->pit_state.flags = ps->flags;
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
+ start && i == 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control)
+{
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ /* pit->pit_state.lock was overloaded to prevent userspace from getting
+ * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+ * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
+ */
+ mutex_lock(&pit->pit_state.lock);
+ kvm_pit_set_reinject(pit, control->pit_reinject);
+ mutex_unlock(&pit->pit_state.lock);
+
+ return 0;
+}
+
static const struct kvm_io_device_ops pit_dev_ops = {
.read = pit_ioport_read,
.write = pit_ioport_write,
@@ -671,10 +744,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
if (!pit)
return NULL;
- pit->irq_source_id = kvm_request_irq_source_id(kvm);
- if (pit->irq_source_id < 0)
- goto fail_request;
-
mutex_init(&pit->pit_state.lock);
pid = get_pid(task_tgid(current));
@@ -726,8 +795,6 @@ fail_register_pit:
kvm_pit_set_reinject(pit, false);
kthread_destroy_worker(pit->worker);
fail_kthread:
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
-fail_request:
kfree(pit);
return NULL;
}
@@ -744,7 +811,6 @@ void kvm_free_pit(struct kvm *kvm)
kvm_pit_set_reinject(pit, false);
hrtimer_cancel(&pit->pit_state.timer);
kthread_destroy_worker(pit->worker);
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
kfree(pit);
}
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index a768212ba821..60fa499d2f8a 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -6,6 +6,11 @@
#include <kvm/iodev.h>
+#include <uapi/asm/kvm.h>
+
+#include "ioapic.h"
+
+#ifdef CONFIG_KVM_IOAPIC
struct kvm_kpit_channel_state {
u32 count; /* can be 65536 */
u16 latched_count;
@@ -42,7 +47,6 @@ struct kvm_pit {
struct kvm_io_device speaker_dev;
struct kvm *kvm;
struct kvm_kpit_state pit_state;
- int irq_source_id;
struct kvm_irq_mask_notifier mask_notifier;
struct kthread_worker *worker;
struct kthread_work expired;
@@ -55,11 +59,14 @@ struct kvm_pit {
#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
#define KVM_PIT_CHANNEL_MASK 0x3
+int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps);
+int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps);
+int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
+int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
+int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control);
+
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
void kvm_free_pit(struct kvm *kvm);
-
-void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
- int hpet_legacy_start);
-void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject);
+#endif /* CONFIG_KVM_IOAPIC */
#endif
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a8fb19940975..2ac7f1678c46 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -31,6 +31,8 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/bitops.h>
+
+#include "ioapic.h"
#include "irq.h"
#include <linux/kvm_host.h>
@@ -185,8 +187,11 @@ void kvm_pic_update_irq(struct kvm_pic *s)
pic_unlock(s);
}
-int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
+int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
+ struct kvm_pic *s = kvm->arch.vpic;
+ int irq = e->irqchip.pin;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
@@ -203,16 +208,6 @@ int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
return ret;
}
-void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
-{
- int i;
-
- pic_lock(s);
- for (i = 0; i < PIC_NUM_PINS; i++)
- __clear_bit(irq_source_id, &s->irq_states[i]);
- pic_unlock(s);
-}
-
/*
* acknowledge interrupt 'irq'
*/
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 45dae2d5d2f1..2b5d389bca5f 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -41,11 +41,11 @@
#include <asm/processor.h>
#include <asm/page.h>
#include <asm/current.h>
-#include <trace/events/kvm.h>
#include "ioapic.h"
#include "lapic.h"
#include "irq.h"
+#include "trace.h"
static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
bool line_status);
@@ -310,6 +310,42 @@ void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
kvm_make_scan_ioapic_request(kvm);
}
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ mutex_lock(&kvm->irq_lock);
+ kimn->irq = irq;
+ hlist_add_head_rcu(&kimn->link, &ioapic->mask_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_rcu(&kimn->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_srcu(&kvm->irq_srcu);
+}
+
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ struct kvm_irq_mask_notifier *kimn;
+ int idx, gsi;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kimn, &ioapic->mask_notifier_list, link)
+ if (kimn->irq == gsi)
+ kimn->func(kimn, mask);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
{
unsigned index;
@@ -479,9 +515,11 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
return ret;
}
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
- int level, bool line_status)
+int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ int irq = e->irqchip.pin;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
@@ -496,16 +534,6 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
return ret;
}
-void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
-{
- int i;
-
- spin_lock(&ioapic->lock);
- for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
- __clear_bit(irq_source_id, &ioapic->irq_states[i]);
- spin_unlock(&ioapic->lock);
-}
-
static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
{
int i;
@@ -718,6 +746,7 @@ int kvm_ioapic_init(struct kvm *kvm)
return -ENOMEM;
spin_lock_init(&ioapic->lock);
INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
+ INIT_HLIST_HEAD(&ioapic->mask_notifier_list);
kvm->arch.vioapic = ioapic;
kvm_ioapic_reset(ioapic);
kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index aa8cb4ac0479..bf28dbc11ff6 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -86,8 +86,24 @@ struct kvm_ioapic {
struct delayed_work eoi_inject;
u32 irq_eoi[IOAPIC_NUM_PINS];
u32 irr_delivered;
+
+ /* reads protected by irq_srcu, writes by irq_lock */
+ struct hlist_head mask_notifier_list;
+};
+
+struct kvm_irq_mask_notifier {
+ void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
+ int irq;
+ struct hlist_node link;
};
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask);
+
#ifdef DEBUG
#define ASSERT(x) \
do { \
@@ -103,7 +119,7 @@ do { \
static inline int ioapic_in_kernel(struct kvm *kvm)
{
- return irqchip_kernel(kvm);
+ return irqchip_full(kvm);
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@@ -111,9 +127,9 @@ void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
int trigger_mode);
int kvm_ioapic_init(struct kvm *kvm);
void kvm_ioapic_destroy(struct kvm *kvm);
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
- int level, bool line_status);
-void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
+int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
+
void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 97d68d837929..4c3f80e8b13f 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -11,9 +11,12 @@
#include <linux/export.h>
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
+#include "hyperv.h"
+#include "ioapic.h"
#include "irq.h"
-#include "i8254.h"
+#include "trace.h"
#include "x86.h"
#include "xen.h"
@@ -41,6 +44,14 @@ static int pending_userspace_extint(struct kvm_vcpu *v)
return v->arch.pending_external_vector != -1;
}
+static int get_userspace_extint(struct kvm_vcpu *vcpu)
+{
+ int vector = vcpu->arch.pending_external_vector;
+
+ vcpu->arch.pending_external_vector = -1;
+ return vector;
+}
+
/*
* check if there is pending interrupt from
* non-APIC source without intack.
@@ -67,10 +78,13 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v)
if (!kvm_apic_accept_pic_intr(v))
return 0;
- if (irqchip_split(v->kvm))
- return pending_userspace_extint(v);
- else
+#ifdef CONFIG_KVM_IOAPIC
+ if (pic_in_kernel(v->kvm))
return v->kvm->arch.vpic->output;
+#endif
+
+ WARN_ON_ONCE(!irqchip_split(v->kvm));
+ return pending_userspace_extint(v);
}
/*
@@ -126,13 +140,13 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v)
return v->kvm->arch.xen.upcall_vector;
#endif
- if (irqchip_split(v->kvm)) {
- int vector = v->arch.pending_external_vector;
-
- v->arch.pending_external_vector = -1;
- return vector;
- } else
+#ifdef CONFIG_KVM_IOAPIC
+ if (pic_in_kernel(v->kvm))
return kvm_pic_read_irq(v->kvm); /* PIC */
+#endif
+
+ WARN_ON_ONCE(!irqchip_split(v->kvm));
+ return get_userspace_extint(v);
}
EXPORT_SYMBOL_GPL(kvm_cpu_get_extint);
@@ -163,7 +177,9 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
__kvm_migrate_apic_timer(vcpu);
+#ifdef CONFIG_KVM_IOAPIC
__kvm_migrate_pit_timer(vcpu);
+#endif
kvm_x86_call(migrate_timers)(vcpu);
}
@@ -171,10 +187,539 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
{
bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE;
- return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm);
+ return resample ? irqchip_full(kvm) : irqchip_in_kernel(kvm);
}
bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
{
return irqchip_in_kernel(kvm);
}
+
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, struct dest_map *dest_map)
+{
+ int r = -1;
+ struct kvm_vcpu *vcpu, *lowest = NULL;
+ unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned int dest_vcpus = 0;
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
+ return r;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL &&
+ irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
+ pr_info("apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (!kvm_lowest_prio_delivery(irq)) {
+ if (r < 0)
+ r = 0;
+ r += kvm_apic_set_irq(vcpu, irq, dest_map);
+ } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
+ if (!kvm_vector_hashing_enabled()) {
+ if (!lowest)
+ lowest = vcpu;
+ else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+ lowest = vcpu;
+ } else {
+ __set_bit(i, dest_vcpu_bitmap);
+ dest_vcpus++;
+ }
+ }
+ }
+
+ if (dest_vcpus != 0) {
+ int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+ dest_vcpu_bitmap, KVM_MAX_VCPUS);
+
+ lowest = kvm_get_vcpu(kvm, idx);
+ }
+
+ if (lowest)
+ r = kvm_apic_set_irq(lowest, irq, dest_map);
+
+ return r;
+}
+
+static void kvm_msi_to_lapic_irq(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
+{
+ struct msi_msg msg = { .address_lo = e->msi.address_lo,
+ .address_hi = e->msi.address_hi,
+ .data = e->msi.data };
+
+ trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
+ (u64)msg.address_hi << 32 : 0), msg.data);
+
+ irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
+ irq->vector = msg.arch_data.vector;
+ irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
+ irq->trig_mode = msg.arch_data.is_level;
+ irq->delivery_mode = msg.arch_data.delivery_mode << 8;
+ irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
+ irq->level = 1;
+ irq->shorthand = APIC_DEST_NOSHORT;
+}
+
+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e)
+{
+ return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+ struct kvm_lapic_irq irq;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ if (!level)
+ return -1;
+
+ kvm_msi_to_lapic_irq(kvm, e, &irq);
+
+ return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
+}
+
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_lapic_irq irq;
+ int r;
+
+ switch (e->type) {
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_IRQ_ROUTING_HV_SINT:
+ return kvm_hv_synic_set_irq(e, kvm, irq_source_id, level,
+ line_status);
+#endif
+
+ case KVM_IRQ_ROUTING_MSI:
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ kvm_msi_to_lapic_irq(kvm, e, &irq);
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
+ return r;
+ break;
+
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ if (!level)
+ return -1;
+
+ return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
+#endif
+ default:
+ break;
+ }
+
+ return -EWOULDBLOCK;
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+ bool line_status)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level,
+ line_status);
+ return 0;
+}
+
+bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ /* We can't check irqchip_in_kernel() here as some callers are
+ * currently initializing the irqchip. Other callers should therefore
+ * check kvm_arch_can_set_irq_routing() before calling this function.
+ */
+ switch (ue->type) {
+#ifdef CONFIG_KVM_IOAPIC
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ if (irqchip_split(kvm))
+ return -EINVAL;
+ e->irqchip.pin = ue->u.irqchip.pin;
+ switch (ue->u.irqchip.irqchip) {
+ case KVM_IRQCHIP_PIC_SLAVE:
+ e->irqchip.pin += PIC_NUM_PINS / 2;
+ fallthrough;
+ case KVM_IRQCHIP_PIC_MASTER:
+ if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
+ return -EINVAL;
+ e->set = kvm_pic_set_irq;
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+ return -EINVAL;
+ e->set = kvm_ioapic_set_irq;
+ break;
+ default:
+ return -EINVAL;
+ }
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ break;
+#endif
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+ break;
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_IRQ_ROUTING_HV_SINT:
+ e->set = kvm_hv_synic_set_irq;
+ e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
+ e->hv_sint.sint = ue->u.hv_sint.sint;
+ break;
+#endif
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ int r = 0;
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+
+ if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
+ return true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (++r == 2)
+ return false;
+
+ *dest_vcpu = vcpu;
+ }
+
+ return r == 1;
+}
+EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
+
+void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
+ u8 vector, unsigned long *ioapic_handled_vectors)
+{
+ /*
+ * Intercept EOI if the vCPU is the target of the new IRQ routing, or
+ * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU
+ * may receive a level-triggered IRQ in the future, or already received
+ * level-triggered IRQ. The EOI needs to be intercepted and forwarded
+ * to I/O APIC emulation so that the IRQ can be de-asserted.
+ */
+ if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) {
+ __set_bit(vector, ioapic_handled_vectors);
+ } else if (kvm_apic_pending_eoi(vcpu, vector)) {
+ __set_bit(vector, ioapic_handled_vectors);
+
+ /*
+ * Track the highest pending EOI for which the vCPU is NOT the
+ * target in the new routing. Only the EOI for the IRQ that is
+ * in-flight (for the old routing) needs to be intercepted, any
+ * future IRQs that arrive on this vCPU will be coincidental to
+ * the level-triggered routing and don't need to be intercepted.
+ */
+ if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi)
+ vcpu->arch.highest_stale_pending_ioapic_eoi = vector;
+ }
+}
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kernel_irq_routing_entry *entry;
+ struct kvm_irq_routing_table *table;
+ u32 i, nr_ioapic_pins;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
+ kvm->arch.nr_reserved_ioapic_pins);
+ for (i = 0; i < nr_ioapic_pins; ++i) {
+ hlist_for_each_entry(entry, &table->map[i], link) {
+ struct kvm_lapic_irq irq;
+
+ if (entry->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ kvm_msi_to_lapic_irq(vcpu->kvm, entry, &irq);
+
+ if (!irq.trig_mode)
+ continue;
+
+ kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode,
+ irq.vector, ioapic_handled_vectors);
+ }
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_HYPERV
+ kvm_hv_irq_routing_update(kvm);
+#endif
+
+ if (irqchip_split(kvm))
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *entry)
+{
+ unsigned int host_irq = irqfd->producer->irq;
+ struct kvm *kvm = irqfd->kvm;
+ struct kvm_vcpu *vcpu = NULL;
+ struct kvm_lapic_irq irq;
+ int r;
+
+ if (WARN_ON_ONCE(!irqchip_in_kernel(kvm) || !kvm_arch_has_irq_bypass()))
+ return -EINVAL;
+
+ if (entry && entry->type == KVM_IRQ_ROUTING_MSI) {
+ kvm_msi_to_lapic_irq(kvm, entry, &irq);
+
+ /*
+ * Force remapped mode if hardware doesn't support posting the
+ * virtual interrupt to a vCPU. Only IRQs are postable (NMIs,
+ * SMIs, etc. are not), and neither AMD nor Intel IOMMUs support
+ * posting multicast/broadcast IRQs. If the interrupt can't be
+ * posted, the device MSI needs to be routed to the host so that
+ * the guest's desired interrupt can be synthesized by KVM.
+ *
+ * This means that KVM can only post lowest-priority interrupts
+ * if they have a single CPU as the destination, e.g. only if
+ * the guest has affined the interrupt to a single vCPU.
+ */
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq))
+ vcpu = NULL;
+ }
+
+ if (!irqfd->irq_bypass_vcpu && !vcpu)
+ return 0;
+
+ r = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, host_irq, irqfd->gsi,
+ vcpu, irq.vector);
+ if (r) {
+ WARN_ON_ONCE(irqfd->irq_bypass_vcpu && !vcpu);
+ irqfd->irq_bypass_vcpu = NULL;
+ return r;
+ }
+
+ irqfd->irq_bypass_vcpu = vcpu;
+
+ trace_kvm_pi_irte_update(host_irq, vcpu, irqfd->gsi, irq.vector, !!vcpu);
+ return 0;
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+ int ret = 0;
+
+ kvm_arch_start_assignment(irqfd->kvm);
+
+ spin_lock_irq(&kvm->irqfds.lock);
+ irqfd->producer = prod;
+
+ if (!kvm->arch.nr_possible_bypass_irqs++)
+ kvm_x86_call(pi_start_bypass)(kvm);
+
+ if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+ ret = kvm_pi_update_irte(irqfd, &irqfd->irq_entry);
+ if (ret) {
+ kvm->arch.nr_possible_bypass_irqs--;
+ kvm_arch_end_assignment(irqfd->kvm);
+ }
+ }
+ spin_unlock_irq(&kvm->irqfds.lock);
+
+ return ret;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+ int ret;
+
+ WARN_ON(irqfd->producer != prod);
+
+ /*
+ * If the producer of an IRQ that is currently being posted to a vCPU
+ * is unregistered, change the associated IRTE back to remapped mode as
+ * the IRQ has been released (or repurposed) by the device driver, i.e.
+ * KVM must relinquish control of the IRTE.
+ */
+ spin_lock_irq(&kvm->irqfds.lock);
+
+ if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+ ret = kvm_pi_update_irte(irqfd, NULL);
+ if (ret)
+ pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n",
+ irqfd->consumer.eventfd, ret);
+ }
+ irqfd->producer = NULL;
+
+ kvm->arch.nr_possible_bypass_irqs--;
+
+ spin_unlock_irq(&kvm->irqfds.lock);
+
+
+ kvm_arch_end_assignment(irqfd->kvm);
+}
+
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
+{
+ if (new->type != KVM_IRQ_ROUTING_MSI &&
+ old->type != KVM_IRQ_ROUTING_MSI)
+ return;
+
+ if (old->type == KVM_IRQ_ROUTING_MSI &&
+ new->type == KVM_IRQ_ROUTING_MSI &&
+ !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
+ return;
+
+ kvm_pi_update_irte(irqfd, new);
+}
+
+#ifdef CONFIG_KVM_IOAPIC
+#define IOAPIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
+
+#define PIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
+#define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
+
+static const struct kvm_irq_routing_entry default_routing[] = {
+ ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
+ ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
+ ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
+ ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
+ ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
+ ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
+ ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
+ ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
+ ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
+ ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
+ ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
+ ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
+};
+
+int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, default_routing,
+ ARRAY_SIZE(default_routing), 0);
+}
+
+int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ memcpy(&chip->chip.pic, &pic->pics[0],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ memcpy(&chip->chip.pic, &pic->pics[1],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_get_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
+int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[0], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[1], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_set_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ kvm_pic_update_irq(pic);
+ return r;
+}
+#endif
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 76d46b2f41dd..5e62c1f79ce6 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -18,6 +18,8 @@
#include <kvm/iodev.h>
#include "lapic.h"
+#ifdef CONFIG_KVM_IOAPIC
+
#define PIC_NUM_PINS 16
#define SELECT_PIC(irq) \
((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
@@ -63,17 +65,15 @@ int kvm_pic_init(struct kvm *kvm);
void kvm_pic_destroy(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
+int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
-static inline int irqchip_split(struct kvm *kvm)
-{
- int mode = kvm->arch.irqchip_mode;
+int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm);
- /* Matches smp_wmb() when setting irqchip_mode */
- smp_rmb();
- return mode == KVM_IRQCHIP_SPLIT;
-}
+int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
+int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
-static inline int irqchip_kernel(struct kvm *kvm)
+static inline int irqchip_full(struct kvm *kvm)
{
int mode = kvm->arch.irqchip_mode;
@@ -81,10 +81,26 @@ static inline int irqchip_kernel(struct kvm *kvm)
smp_rmb();
return mode == KVM_IRQCHIP_KERNEL;
}
+#else /* CONFIG_KVM_IOAPIC */
+static __always_inline int irqchip_full(struct kvm *kvm)
+{
+ return false;
+}
+#endif
static inline int pic_in_kernel(struct kvm *kvm)
{
- return irqchip_kernel(kvm);
+ return irqchip_full(kvm);
+}
+
+
+static inline int irqchip_split(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_SPLIT;
}
static inline int irqchip_in_kernel(struct kvm *kvm)
@@ -105,7 +121,6 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
int apic_has_pending_timer(struct kvm_vcpu *vcpu);
-int kvm_setup_default_irq_routing(struct kvm *kvm);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
deleted file mode 100644
index d6d792b5d1bd..000000000000
--- a/arch/x86/kvm/irq_comm.c
+++ /dev/null
@@ -1,469 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * irq_comm.c: Common API for in kernel interrupt controller
- * Copyright (c) 2007, Intel Corporation.
- *
- * Authors:
- * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kvm_host.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include <linux/rculist.h>
-
-#include <trace/events/kvm.h>
-
-#include "irq.h"
-
-#include "ioapic.h"
-
-#include "lapic.h"
-
-#include "hyperv.h"
-#include "x86.h"
-#include "xen.h"
-
-static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
-}
-
-static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- struct kvm_ioapic *ioapic = kvm->arch.vioapic;
- return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
- line_status);
-}
-
-int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq, struct dest_map *dest_map)
-{
- int r = -1;
- struct kvm_vcpu *vcpu, *lowest = NULL;
- unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
- unsigned int dest_vcpus = 0;
-
- if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
- return r;
-
- if (irq->dest_mode == APIC_DEST_PHYSICAL &&
- irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
- pr_info("apic: phys broadcast and lowest prio\n");
- irq->delivery_mode = APIC_DM_FIXED;
- }
-
- memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!kvm_apic_present(vcpu))
- continue;
-
- if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
- irq->dest_id, irq->dest_mode))
- continue;
-
- if (!kvm_lowest_prio_delivery(irq)) {
- if (r < 0)
- r = 0;
- r += kvm_apic_set_irq(vcpu, irq, dest_map);
- } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
- if (!kvm_vector_hashing_enabled()) {
- if (!lowest)
- lowest = vcpu;
- else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
- lowest = vcpu;
- } else {
- __set_bit(i, dest_vcpu_bitmap);
- dest_vcpus++;
- }
- }
- }
-
- if (dest_vcpus != 0) {
- int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
- dest_vcpu_bitmap, KVM_MAX_VCPUS);
-
- lowest = kvm_get_vcpu(kvm, idx);
- }
-
- if (lowest)
- r = kvm_apic_set_irq(lowest, irq, dest_map);
-
- return r;
-}
-
-void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
- struct kvm_lapic_irq *irq)
-{
- struct msi_msg msg = { .address_lo = e->msi.address_lo,
- .address_hi = e->msi.address_hi,
- .data = e->msi.data };
-
- trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
- (u64)msg.address_hi << 32 : 0), msg.data);
-
- irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
- irq->vector = msg.arch_data.vector;
- irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
- irq->trig_mode = msg.arch_data.is_level;
- irq->delivery_mode = msg.arch_data.delivery_mode << 8;
- irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
- irq->level = 1;
- irq->shorthand = APIC_DEST_NOSHORT;
-}
-EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
-
-static inline bool kvm_msi_route_invalid(struct kvm *kvm,
- struct kvm_kernel_irq_routing_entry *e)
-{
- return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
-}
-
-int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level, bool line_status)
-{
- struct kvm_lapic_irq irq;
-
- if (kvm_msi_route_invalid(kvm, e))
- return -EINVAL;
-
- if (!level)
- return -1;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
-}
-
-#ifdef CONFIG_KVM_HYPERV
-static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- if (!level)
- return -1;
-
- return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
-}
-#endif
-
-int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- struct kvm_lapic_irq irq;
- int r;
-
- switch (e->type) {
-#ifdef CONFIG_KVM_HYPERV
- case KVM_IRQ_ROUTING_HV_SINT:
- return kvm_hv_set_sint(e, kvm, irq_source_id, level,
- line_status);
-#endif
-
- case KVM_IRQ_ROUTING_MSI:
- if (kvm_msi_route_invalid(kvm, e))
- return -EINVAL;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
- return r;
- break;
-
-#ifdef CONFIG_KVM_XEN
- case KVM_IRQ_ROUTING_XEN_EVTCHN:
- if (!level)
- return -1;
-
- return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
-#endif
- default:
- break;
- }
-
- return -EWOULDBLOCK;
-}
-
-int kvm_request_irq_source_id(struct kvm *kvm)
-{
- unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
- int irq_source_id;
-
- mutex_lock(&kvm->irq_lock);
- irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
-
- if (irq_source_id >= BITS_PER_LONG) {
- pr_warn("exhausted allocatable IRQ sources!\n");
- irq_source_id = -EFAULT;
- goto unlock;
- }
-
- ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
- ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
- set_bit(irq_source_id, bitmap);
-unlock:
- mutex_unlock(&kvm->irq_lock);
-
- return irq_source_id;
-}
-
-void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
-{
- ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
- ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
-
- mutex_lock(&kvm->irq_lock);
- if (irq_source_id < 0 ||
- irq_source_id >= BITS_PER_LONG) {
- pr_err("IRQ source ID out of range!\n");
- goto unlock;
- }
- clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
- if (!irqchip_kernel(kvm))
- goto unlock;
-
- kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
- kvm_pic_clear_all(kvm->arch.vpic, irq_source_id);
-unlock:
- mutex_unlock(&kvm->irq_lock);
-}
-
-void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn)
-{
- mutex_lock(&kvm->irq_lock);
- kimn->irq = irq;
- hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
- mutex_unlock(&kvm->irq_lock);
-}
-
-void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn)
-{
- mutex_lock(&kvm->irq_lock);
- hlist_del_rcu(&kimn->link);
- mutex_unlock(&kvm->irq_lock);
- synchronize_srcu(&kvm->irq_srcu);
-}
-
-void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
- bool mask)
-{
- struct kvm_irq_mask_notifier *kimn;
- int idx, gsi;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
- if (gsi != -1)
- hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link)
- if (kimn->irq == gsi)
- kimn->func(kimn, mask);
- srcu_read_unlock(&kvm->irq_srcu, idx);
-}
-
-bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
-{
- return irqchip_in_kernel(kvm);
-}
-
-int kvm_set_routing_entry(struct kvm *kvm,
- struct kvm_kernel_irq_routing_entry *e,
- const struct kvm_irq_routing_entry *ue)
-{
- /* We can't check irqchip_in_kernel() here as some callers are
- * currently initializing the irqchip. Other callers should therefore
- * check kvm_arch_can_set_irq_routing() before calling this function.
- */
- switch (ue->type) {
- case KVM_IRQ_ROUTING_IRQCHIP:
- if (irqchip_split(kvm))
- return -EINVAL;
- e->irqchip.pin = ue->u.irqchip.pin;
- switch (ue->u.irqchip.irqchip) {
- case KVM_IRQCHIP_PIC_SLAVE:
- e->irqchip.pin += PIC_NUM_PINS / 2;
- fallthrough;
- case KVM_IRQCHIP_PIC_MASTER:
- if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
- return -EINVAL;
- e->set = kvm_set_pic_irq;
- break;
- case KVM_IRQCHIP_IOAPIC:
- if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
- return -EINVAL;
- e->set = kvm_set_ioapic_irq;
- break;
- default:
- return -EINVAL;
- }
- e->irqchip.irqchip = ue->u.irqchip.irqchip;
- break;
- case KVM_IRQ_ROUTING_MSI:
- e->set = kvm_set_msi;
- e->msi.address_lo = ue->u.msi.address_lo;
- e->msi.address_hi = ue->u.msi.address_hi;
- e->msi.data = ue->u.msi.data;
-
- if (kvm_msi_route_invalid(kvm, e))
- return -EINVAL;
- break;
-#ifdef CONFIG_KVM_HYPERV
- case KVM_IRQ_ROUTING_HV_SINT:
- e->set = kvm_hv_set_sint;
- e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
- e->hv_sint.sint = ue->u.hv_sint.sint;
- break;
-#endif
-#ifdef CONFIG_KVM_XEN
- case KVM_IRQ_ROUTING_XEN_EVTCHN:
- return kvm_xen_setup_evtchn(kvm, e, ue);
-#endif
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
- struct kvm_vcpu **dest_vcpu)
-{
- int r = 0;
- unsigned long i;
- struct kvm_vcpu *vcpu;
-
- if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
- return true;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!kvm_apic_present(vcpu))
- continue;
-
- if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
- irq->dest_id, irq->dest_mode))
- continue;
-
- if (++r == 2)
- return false;
-
- *dest_vcpu = vcpu;
- }
-
- return r == 1;
-}
-EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
-
-#define IOAPIC_ROUTING_ENTRY(irq) \
- { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
- .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
-#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
-
-#define PIC_ROUTING_ENTRY(irq) \
- { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
- .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
-#define ROUTING_ENTRY2(irq) \
- IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
-
-static const struct kvm_irq_routing_entry default_routing[] = {
- ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
- ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
- ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
- ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
- ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
- ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
- ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
- ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
- ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
- ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
- ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
- ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
-};
-
-int kvm_setup_default_irq_routing(struct kvm *kvm)
-{
- return kvm_set_irq_routing(kvm, default_routing,
- ARRAY_SIZE(default_routing), 0);
-}
-
-void kvm_arch_post_irq_routing_update(struct kvm *kvm)
-{
- if (!irqchip_split(kvm))
- return;
- kvm_make_scan_ioapic_request(kvm);
-}
-
-void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
- u8 vector, unsigned long *ioapic_handled_vectors)
-{
- /*
- * Intercept EOI if the vCPU is the target of the new IRQ routing, or
- * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU
- * may receive a level-triggered IRQ in the future, or already received
- * level-triggered IRQ. The EOI needs to be intercepted and forwarded
- * to I/O APIC emulation so that the IRQ can be de-asserted.
- */
- if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) {
- __set_bit(vector, ioapic_handled_vectors);
- } else if (kvm_apic_pending_eoi(vcpu, vector)) {
- __set_bit(vector, ioapic_handled_vectors);
-
- /*
- * Track the highest pending EOI for which the vCPU is NOT the
- * target in the new routing. Only the EOI for the IRQ that is
- * in-flight (for the old routing) needs to be intercepted, any
- * future IRQs that arrive on this vCPU will be coincidental to
- * the level-triggered routing and don't need to be intercepted.
- */
- if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi)
- vcpu->arch.highest_stale_pending_ioapic_eoi = vector;
- }
-}
-
-void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
- ulong *ioapic_handled_vectors)
-{
- struct kvm *kvm = vcpu->kvm;
- struct kvm_kernel_irq_routing_entry *entry;
- struct kvm_irq_routing_table *table;
- u32 i, nr_ioapic_pins;
- int idx;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
- kvm->arch.nr_reserved_ioapic_pins);
- for (i = 0; i < nr_ioapic_pins; ++i) {
- hlist_for_each_entry(entry, &table->map[i], link) {
- struct kvm_lapic_irq irq;
-
- if (entry->type != KVM_IRQ_ROUTING_MSI)
- continue;
-
- kvm_set_msi_irq(vcpu->kvm, entry, &irq);
-
- if (!irq.trig_mode)
- continue;
-
- kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode,
- irq.vector, ioapic_handled_vectors);
- }
- }
- srcu_read_unlock(&kvm->irq_srcu, idx);
-}
-
-void kvm_arch_irq_routing_update(struct kvm *kvm)
-{
-#ifdef CONFIG_KVM_HYPERV
- kvm_hv_irq_routing_update(kvm);
-#endif
-}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73418dc0ebb2..4cf8c1f753d3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1455,7 +1455,7 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{
- int trigger_mode;
+ int __maybe_unused trigger_mode;
/* Eoi the ioapic only if the ioapic doesn't own the vector. */
if (!kvm_ioapic_handles_vector(apic, vector))
@@ -1476,12 +1476,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
return;
}
+#ifdef CONFIG_KVM_IOAPIC
if (apic_test_vector(vector, apic->regs + APIC_TMR))
trigger_mode = IOAPIC_LEVEL_TRIG;
else
trigger_mode = IOAPIC_EDGE_TRIG;
kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+#endif
}
static int apic_set_eoi(struct kvm_lapic *apic)
@@ -3146,8 +3148,11 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+#ifdef CONFIG_KVM_IOAPIC
if (ioapic_in_kernel(vcpu->kvm))
kvm_rtc_eoi_tracking_restore_one(vcpu);
+#endif
vcpu->arch.apic_arb_prio = 0;
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 067f8e3f5a0d..a34c5c3b164e 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -18,6 +18,7 @@
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
#include <asm/irq_remapping.h>
#include <asm/msr.h>
@@ -29,36 +30,39 @@
#include "svm.h"
/*
- * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e the vCPU ID,
- * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry
- * if an interrupt can't be delivered, e.g. because the vCPU isn't running.
+ * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
+ * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
+ * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index
+ * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
+ * lookup on the index, where as vCPUs whose index doesn't match their ID need
+ * to walk the entire xarray of vCPUs in the worst case scenario.
*
- * For the vCPU ID, use however many bits are currently allowed for the max
+ * For the vCPU index, use however many bits are currently allowed for the max
* guest physical APIC ID (limited by the size of the physical ID table), and
* use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the
* size of the GATag is defined by hardware (32 bits), but is an opaque value
* as far as hardware is concerned.
*/
-#define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
+#define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
#define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
#define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)
#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
-#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
+#define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK)
-#define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
- ((vcpu_id) & AVIC_VCPU_ID_MASK))
-#define AVIC_GATAG(vm_id, vcpu_id) \
+#define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
+ ((vcpu_idx) & AVIC_VCPU_IDX_MASK))
+#define AVIC_GATAG(vm_id, vcpu_idx) \
({ \
- u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \
+ u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \
\
- WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \
+ WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \
WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \
ga_tag; \
})
-static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
+static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);
static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);
@@ -75,14 +79,6 @@ static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
bool x2avic_enabled;
-/*
- * This is a wrapper of struct amd_iommu_ir_data.
- */
-struct amd_svm_iommu_ir {
- struct list_head node; /* Used by SVM for per-vcpu ir_list */
- void *data; /* Storing pointer to struct amd_ir_data */
-};
-
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
@@ -147,16 +143,16 @@ int avic_ga_log_notifier(u32 ga_tag)
struct kvm_svm *kvm_svm;
struct kvm_vcpu *vcpu = NULL;
u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
- u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+ u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);
- pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
- trace_kvm_avic_ga_log(vm_id, vcpu_id);
+ pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
+ trace_kvm_avic_ga_log(vm_id, vcpu_idx);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
if (kvm_svm->avic_vm_id != vm_id)
continue;
- vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
+ vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
break;
}
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
@@ -180,10 +176,8 @@ void avic_vm_destroy(struct kvm *kvm)
if (!enable_apicv)
return;
- if (kvm_svm->avic_logical_id_table_page)
- __free_page(kvm_svm->avic_logical_id_table_page);
- if (kvm_svm->avic_physical_id_table_page)
- __free_page(kvm_svm->avic_physical_id_table_page);
+ free_page((unsigned long)kvm_svm->avic_logical_id_table);
+ free_page((unsigned long)kvm_svm->avic_physical_id_table);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_del(&kvm_svm->hnode);
@@ -196,27 +190,19 @@ int avic_vm_init(struct kvm *kvm)
int err = -ENOMEM;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
struct kvm_svm *k2;
- struct page *p_page;
- struct page *l_page;
u32 vm_id;
if (!enable_apicv)
return 0;
- /* Allocating physical APIC ID table (4KB) */
- p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!p_page)
+ kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!kvm_svm->avic_physical_id_table)
goto free_avic;
- kvm_svm->avic_physical_id_table_page = p_page;
-
- /* Allocating logical APIC ID table (4KB) */
- l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!l_page)
+ kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!kvm_svm->avic_logical_id_table)
goto free_avic;
- kvm_svm->avic_logical_id_table_page = l_page;
-
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
@@ -242,17 +228,19 @@ free_avic:
return err;
}
+static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
+{
+ return __sme_set(__pa(svm->vcpu.arch.apic->regs));
+}
+
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
- phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
- phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
- phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
- vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
- vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
- vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
- vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
+ vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
+ vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
+ vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
+ vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;
if (kvm_apicv_activated(svm->vcpu.kvm))
avic_activate_vmcb(svm);
@@ -260,32 +248,31 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
avic_deactivate_vmcb(svm);
}
-static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
- unsigned int index)
-{
- u64 *avic_physical_id_table;
- struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
-
- if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
- (index > X2AVIC_MAX_PHYSICAL_ID))
- return NULL;
-
- avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
-
- return &avic_physical_id_table[index];
-}
-
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
- u64 *entry, new_entry;
- int id = vcpu->vcpu_id;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
struct vcpu_svm *svm = to_svm(vcpu);
+ u32 id = vcpu->vcpu_id;
+ u64 new_entry;
+ /*
+ * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
+ * hardware. Immediately clear apicv_active, i.e. don't wait until the
+ * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
+ * avic_vcpu_load() expects to be called if and only if the vCPU has
+ * fully initialized AVIC.
+ */
if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
- (id > X2AVIC_MAX_PHYSICAL_ID))
- return -EINVAL;
+ (id > X2AVIC_MAX_PHYSICAL_ID)) {
+ kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
+ vcpu->arch.apic->apicv_active = false;
+ return 0;
+ }
+
+ BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
+ (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);
- if (!vcpu->arch.apic->regs)
+ if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
return -EINVAL;
if (kvm_apicv_activated(vcpu->kvm)) {
@@ -302,19 +289,21 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return ret;
}
- svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
+ /* Note, fls64() returns the bit position, +1. */
+ BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
+ fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));
/* Setting AVIC backing page address in the phy APIC ID table */
- entry = avic_get_physical_id_entry(vcpu, id);
- if (!entry)
- return -EINVAL;
+ new_entry = avic_get_backing_page_address(svm) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+ svm->avic_physical_id_entry = new_entry;
- new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
- AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
- AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
- WRITE_ONCE(*entry, new_entry);
-
- svm->avic_physical_id_cache = entry;
+ /*
+ * Initialize the real table, as vCPUs must have a valid entry in order
+ * for broadcast IPIs to function correctly (broadcast IPIs ignore
+ * invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
+ */
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);
return 0;
}
@@ -448,7 +437,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
if (apic_x2apic_mode(source))
avic_logical_id_table = NULL;
else
- avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
+ avic_logical_id_table = kvm_svm->avic_logical_id_table;
/*
* AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
@@ -550,7 +539,6 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
- u32 *logical_apic_id_table;
u32 cluster, index;
ldr = GET_APIC_LOGICAL_ID(ldr);
@@ -571,9 +559,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
return NULL;
index += (cluster << 2);
- logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
-
- return &logical_apic_id_table[index];
+ return &kvm_svm->avic_logical_id_table[index];
}
static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
@@ -722,6 +708,9 @@ int avic_init_vcpu(struct vcpu_svm *svm)
int ret;
struct kvm_vcpu *vcpu = &svm->vcpu;
+ INIT_LIST_HEAD(&svm->ir_list);
+ spin_lock_init(&svm->ir_list_lock);
+
if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
return 0;
@@ -729,8 +718,6 @@ int avic_init_vcpu(struct vcpu_svm *svm)
if (ret)
return ret;
- INIT_LIST_HEAD(&svm->ir_list);
- spin_lock_init(&svm->ir_list_lock);
svm->dfr_reg = APIC_DFR_FLAT;
return ret;
@@ -742,316 +729,161 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
-static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
+static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
{
- int ret = 0;
+ struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
unsigned long flags;
- struct amd_svm_iommu_ir *ir;
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm))
- return 0;
- /*
- * Here, we go through the per-vcpu ir_list to update all existing
- * interrupt remapping table entry targeting this vcpu.
- */
- spin_lock_irqsave(&svm->ir_list_lock, flags);
-
- if (list_empty(&svm->ir_list))
- goto out;
+ if (!vcpu)
+ return;
- list_for_each_entry(ir, &svm->ir_list, node) {
- if (activate)
- ret = amd_iommu_activate_guest_mode(ir->data);
- else
- ret = amd_iommu_deactivate_guest_mode(ir->data);
- if (ret)
- break;
- }
-out:
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
- return ret;
+ spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
+ list_del(&irqfd->vcpu_list);
+ spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
}
-static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector)
{
- unsigned long flags;
- struct amd_svm_iommu_ir *cur;
-
- spin_lock_irqsave(&svm->ir_list_lock, flags);
- list_for_each_entry(cur, &svm->ir_list, node) {
- if (cur->data != pi->ir_data)
- continue;
- list_del(&cur->node);
- kfree(cur);
- break;
- }
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-}
-
-static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
-{
- int ret = 0;
- unsigned long flags;
- struct amd_svm_iommu_ir *ir;
- u64 entry;
-
- if (WARN_ON_ONCE(!pi->ir_data))
- return -EINVAL;
-
- /**
- * In some cases, the existing irte is updated and re-set,
- * so we need to check here if it's already been * added
- * to the ir_list.
- */
- if (pi->prev_ga_tag) {
- struct kvm *kvm = svm->vcpu.kvm;
- u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
- struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
- struct vcpu_svm *prev_svm;
-
- if (!prev_vcpu) {
- ret = -EINVAL;
- goto out;
- }
-
- prev_svm = to_svm(prev_vcpu);
- svm_ir_list_del(prev_svm, pi);
- }
-
- /**
- * Allocating new amd_iommu_pi_data, which will get
- * add to the per-vcpu ir_list.
- */
- ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT);
- if (!ir) {
- ret = -ENOMEM;
- goto out;
- }
- ir->data = pi->ir_data;
-
- spin_lock_irqsave(&svm->ir_list_lock, flags);
-
/*
- * Update the target pCPU for IOMMU doorbells if the vCPU is running.
- * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM
- * will update the pCPU info when the vCPU awkened and/or scheduled in.
- * See also avic_vcpu_load().
+ * If the IRQ was affined to a different vCPU, remove the IRTE metadata
+ * from the *previous* vCPU's list.
*/
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
- if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
- amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
- true, pi->ir_data);
-
- list_add(&ir->node, &svm->ir_list);
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-out:
- return ret;
-}
+ svm_ir_list_del(irqfd);
-/*
- * Note:
- * The HW cannot support posting multicast/broadcast
- * interrupts to a vCPU. So, we still use legacy interrupt
- * remapping for these kind of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with single CPU as the destination, e.g. user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- */
-static int
-get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
- struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
-{
- struct kvm_lapic_irq irq;
- struct kvm_vcpu *vcpu = NULL;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq)) {
- pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
- __func__, irq.vector);
- return -1;
- }
-
- pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
- irq.vector);
- *svm = to_svm(vcpu);
- vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
- vcpu_info->vector = irq.vector;
-
- return 0;
-}
-
-/*
- * avic_pi_update_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- struct kvm_kernel_irq_routing_entry *e;
- struct kvm_irq_routing_table *irq_rt;
- bool enable_remapped_mode = true;
- int idx, ret = 0;
-
- if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
- return 0;
-
- pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
- __func__, host_irq, guest_irq, set);
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-
- if (guest_irq >= irq_rt->nr_rt_entries ||
- hlist_empty(&irq_rt->map[guest_irq])) {
- pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
- guest_irq, irq_rt->nr_rt_entries);
- goto out;
- }
-
- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
- struct vcpu_data vcpu_info;
- struct vcpu_svm *svm = NULL;
+ if (vcpu) {
+ /*
+ * Try to enable guest_mode in IRTE, unless AVIC is inhibited,
+ * in which case configure the IRTE for legacy mode, but track
+ * the IRTE metadata so that it can be converted to guest mode
+ * if AVIC is enabled/uninhibited in the future.
+ */
+ struct amd_iommu_pi_data pi_data = {
+ .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
+ vcpu->vcpu_idx),
+ .is_guest_mode = kvm_vcpu_apicv_active(vcpu),
+ .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
+ .vector = vector,
+ };
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 entry;
+ int ret;
- if (e->type != KVM_IRQ_ROUTING_MSI)
- continue;
+ /*
+ * Prevent the vCPU from being scheduled out or migrated until
+ * the IRTE is updated and its metadata has been added to the
+ * list of IRQs being posted to the vCPU, to ensure the IRTE
+ * isn't programmed with stale pCPU/IsRunning information.
+ */
+ guard(spinlock_irqsave)(&svm->ir_list_lock);
- /**
- * Here, we setup with legacy mode in the following cases:
- * 1. When cannot target interrupt to a specific vcpu.
- * 2. Unsetting posted interrupt.
- * 3. APIC virtualization is disabled for the vcpu.
- * 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
+ /*
+ * Update the target pCPU for IOMMU doorbells if the vCPU is
+ * running. If the vCPU is NOT running, i.e. is blocking or
+ * scheduled out, KVM will update the pCPU info when the vCPU
+ * is awakened and/or scheduled in. See also avic_vcpu_load().
*/
- if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
- kvm_vcpu_apicv_active(&svm->vcpu)) {
- struct amd_iommu_pi_data pi;
-
- enable_remapped_mode = false;
-
- /* Try to enable guest_mode in IRTE */
- pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
- AVIC_HPA_MASK);
- pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
- svm->vcpu.vcpu_id);
- pi.is_guest_mode = true;
- pi.vcpu_data = &vcpu_info;
- ret = irq_set_vcpu_affinity(host_irq, &pi);
-
- /**
- * Here, we successfully setting up vcpu affinity in
- * IOMMU guest mode. Now, we need to store the posted
- * interrupt information in a per-vcpu ir_list so that
- * we can reference to them directly when we update vcpu
- * scheduling information in IOMMU irte.
- */
- if (!ret && pi.is_guest_mode)
- svm_ir_list_add(svm, &pi);
+ entry = svm->avic_physical_id_entry;
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
+ pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ } else {
+ pi_data.cpu = -1;
+ pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
}
- if (!ret && svm) {
- trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
- e->gsi, vcpu_info.vector,
- vcpu_info.pi_desc_addr, set);
- }
+ ret = irq_set_vcpu_affinity(host_irq, &pi_data);
+ if (ret)
+ return ret;
- if (ret < 0) {
- pr_err("%s: failed to update PI IRTE\n", __func__);
- goto out;
+ /*
+ * Revert to legacy mode if the IOMMU didn't provide metadata
+ * for the IRTE, which KVM needs to keep the IRTE up-to-date,
+ * e.g. if the vCPU is migrated or AVIC is disabled.
+ */
+ if (WARN_ON_ONCE(!pi_data.ir_data)) {
+ irq_set_vcpu_affinity(host_irq, NULL);
+ return -EIO;
}
- }
- ret = 0;
- if (enable_remapped_mode) {
- /* Use legacy mode in IRTE */
- struct amd_iommu_pi_data pi;
+ irqfd->irq_bypass_data = pi_data.ir_data;
+ list_add(&irqfd->vcpu_list, &svm->ir_list);
+ return 0;
+ }
+ return irq_set_vcpu_affinity(host_irq, NULL);
+}
- /**
- * Here, pi is used to:
- * - Tell IOMMU to use legacy mode for this interrupt.
- * - Retrieve ga_tag of prior interrupt remapping data.
- */
- pi.prev_ga_tag = 0;
- pi.is_guest_mode = false;
- ret = irq_set_vcpu_affinity(host_irq, &pi);
+enum avic_vcpu_action {
+ /*
+ * There is no need to differentiate between activate and deactivate,
+ * as KVM only refreshes AVIC state when the vCPU is scheduled in and
+ * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
+ * being (de)activated.
+ */
+ AVIC_TOGGLE_ON_OFF = BIT(0),
+ AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF,
+ AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF,
- /**
- * Check if the posted interrupt was previously
- * setup with the guest_mode by checking if the ga_tag
- * was cached. If so, we need to clean up the per-vcpu
- * ir_list.
- */
- if (!ret && pi.prev_ga_tag) {
- int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
- struct kvm_vcpu *vcpu;
+ /*
+ * No unique action is required to deal with a vCPU that stops/starts
+ * running. A vCPU that starts running by definition stops blocking as
+ * well, and a vCPU that stops running can't have been blocking, i.e.
+ * doesn't need to toggle GALogIntr.
+ */
+ AVIC_START_RUNNING = 0,
+ AVIC_STOP_RUNNING = 0,
- vcpu = kvm_get_vcpu_by_id(kvm, id);
- if (vcpu)
- svm_ir_list_del(to_svm(vcpu), &pi);
- }
- }
-out:
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
-}
+ /*
+ * When a vCPU starts blocking, KVM needs to set the GALogIntr flag
+ * int all associated IRTEs so that KVM can wake the vCPU if an IRQ is
+ * sent to the vCPU.
+ */
+ AVIC_START_BLOCKING = BIT(1),
+};
-static inline int
-avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
+static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
+ enum avic_vcpu_action action)
{
- int ret = 0;
- struct amd_svm_iommu_ir *ir;
+ bool ga_log_intr = (action & AVIC_START_BLOCKING);
struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_kernel_irqfd *irqfd;
lockdep_assert_held(&svm->ir_list_lock);
- if (!kvm_arch_has_assigned_device(vcpu->kvm))
- return 0;
-
/*
* Here, we go through the per-vcpu ir_list to update all existing
* interrupt remapping table entry targeting this vcpu.
*/
if (list_empty(&svm->ir_list))
- return 0;
+ return;
- list_for_each_entry(ir, &svm->ir_list, node) {
- ret = amd_iommu_update_ga(cpu, r, ir->data);
- if (ret)
- return ret;
+ list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
+ void *data = irqfd->irq_bypass_data;
+
+ if (!(action & AVIC_TOGGLE_ON_OFF))
+ WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
+ else if (cpu >= 0)
+ WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
+ else
+ WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
}
- return 0;
}
-void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
+ enum avic_vcpu_action action)
{
- u64 entry;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;
+ u64 entry;
lockdep_assert_preemption_disabled();
if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
- /*
- * No need to update anything if the vCPU is blocking, i.e. if the vCPU
- * is being scheduled in after being preempted. The CPU entries in the
- * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
- * If the vCPU was migrated, its new CPU value will be stuffed when the
- * vCPU unblocks.
- */
- if (kvm_vcpu_is_blocking(vcpu))
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
return;
/*
@@ -1063,38 +895,57 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*/
spin_lock_irqsave(&svm->ir_list_lock, flags);
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ entry = svm->avic_physical_id_entry;
WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
+ AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
- avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
+ svm->avic_physical_id_entry = entry;
+
+ /*
+ * If IPI virtualization is disabled, clear IsRunning when updating the
+ * actual Physical ID table, so that the CPU never sees IsRunning=1.
+ * Keep the APIC ID up-to-date in the entry to minimize the chances of
+ * things going sideways if hardware peeks at the ID.
+ */
+ if (!enable_ipiv)
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
-void avic_vcpu_put(struct kvm_vcpu *vcpu)
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- u64 entry;
+ /*
+ * No need to update anything if the vCPU is blocking, i.e. if the vCPU
+ * is being scheduled in after being preempted. The CPU entries in the
+ * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
+ * If the vCPU was migrated, its new CPU value will be stuffed when the
+ * vCPU unblocks.
+ */
+ if (kvm_vcpu_is_blocking(vcpu))
+ return;
+
+ __avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
+}
+
+static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;
+ u64 entry = svm->avic_physical_id_entry;
lockdep_assert_preemption_disabled();
- /*
- * Note, reading the Physical ID entry outside of ir_list_lock is safe
- * as only the pCPU that has loaded (or is loading) the vCPU is allowed
- * to modify the entry, and preemption is disabled. I.e. the vCPU
- * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
- * recursively.
- */
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
-
- /* Nothing to do if IsRunning == '0' due to vCPU blocking. */
- if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
return;
/*
@@ -1107,13 +958,62 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
*/
spin_lock_irqsave(&svm->ir_list_lock, flags);
- avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+ avic_update_iommu_vcpu_affinity(vcpu, -1, action);
+
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
+ /*
+ * Keep the previous APIC ID in the entry so that a rogue doorbell from
+ * hardware is at least restricted to a CPU associated with the vCPU.
+ */
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+
+ if (enable_ipiv)
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+ /*
+ * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
+ * it's a synthetic flag that usurps an unused should-be-zero bit.
+ */
+ if (action & AVIC_START_BLOCKING)
+ entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
+
+ svm->avic_physical_id_entry = entry;
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, reading the Physical ID entry outside of ir_list_lock is safe
+ * as only the pCPU that has loaded (or is loading) the vCPU is allowed
+ * to modify the entry, and preemption is disabled. I.e. the vCPU
+ * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
+ * recursively.
+ */
+ u64 entry = to_svm(vcpu)->avic_physical_id_entry;
+
+ /*
+ * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
+ * vCPU is preempted while its in the process of blocking. WARN if the
+ * vCPU wasn't running and isn't blocking, KVM shouldn't attempt to put
+ * the AVIC if it wasn't previously loaded.
+ */
+ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
+ if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
+ return;
+ /*
+ * The vCPU was preempted while blocking, ensure its IRTEs are
+ * configured to generate GA Log Interrupts.
+ */
+ if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
+ return;
+ }
+
+ __avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
+ AVIC_STOP_RUNNING);
}
void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
@@ -1142,19 +1042,18 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
- bool activated = kvm_vcpu_apicv_active(vcpu);
-
if (!enable_apicv)
return;
+ /* APICv should only be toggled on/off while the vCPU is running. */
+ WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));
+
avic_refresh_virtual_apic_mode(vcpu);
- if (activated)
- avic_vcpu_load(vcpu, vcpu->cpu);
+ if (kvm_vcpu_apicv_active(vcpu))
+ __avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
else
- avic_vcpu_put(vcpu);
-
- avic_set_pi_irte_mode(vcpu, activated);
+ __avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
}
void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
@@ -1162,20 +1061,25 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
if (!kvm_vcpu_apicv_active(vcpu))
return;
- /*
- * Unload the AVIC when the vCPU is about to block, _before_
- * the vCPU actually blocks.
- *
- * Any IRQs that arrive before IsRunning=0 will not cause an
- * incomplete IPI vmexit on the source, therefore vIRR will also
- * be checked by kvm_vcpu_check_block() before blocking. The
- * memory barrier implicit in set_current_state orders writing
- * IsRunning=0 before reading the vIRR. The processor needs a
- * matching memory barrier on interrupt delivery between writing
- * IRR and reading IsRunning; the lack of this barrier might be
- * the cause of errata #1235).
- */
- avic_vcpu_put(vcpu);
+ /*
+ * Unload the AVIC when the vCPU is about to block, _before_ the vCPU
+ * actually blocks.
+ *
+ * Note, any IRQs that arrive before IsRunning=0 will not cause an
+ * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
+ * this by checking vIRR one last time before blocking. The memory
+ * barrier implicit in set_current_state orders writing IsRunning=0
+ * before reading the vIRR. The processor needs a matching memory
+ * barrier on interrupt delivery between writing IRR and reading
+ * IsRunning; the lack of this barrier might be the cause of errata #1235).
+ *
+ * Clear IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM
+ * doesn't need to detect events for scheduling purposes. The doorbell
+ * used to signal running vCPUs cannot be blocked, i.e. will perturb the
+ * CPU and cause noisy neighbor problems if the VM is sending interrupts
+ * to the vCPU while it's scheduled out.
+ */
+ __avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
}
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
@@ -1228,6 +1132,14 @@ bool avic_hardware_setup(void)
if (x2avic_enabled)
pr_info("x2AVIC enabled\n");
+ /*
+ * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
+ * due to erratum 1235, which results in missed VM-Exits on the sender
+ * and thus missed wake events for blocking vCPUs due to the CPU
+ * failing to see a software update to clear IsRunning.
+ */
+ enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;
+
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
return true;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index ab9b947dbf4f..7f0df9dbc232 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -232,6 +232,7 @@ module_param(tsc_scaling, int, 0444);
*/
static bool avic;
module_param(avic, bool, 0444);
+module_param(enable_ipiv, bool, 0444);
module_param(enable_device_posted_irqs, bool, 0444);
@@ -1490,6 +1491,8 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ WARN_ON_ONCE(!list_empty(&svm->ir_list));
+
svm_leave_nested(vcpu);
svm_free_nested(svm);
@@ -5581,6 +5584,7 @@ static __init int svm_hardware_setup(void)
enable_apicv = avic = avic && avic_hardware_setup();
if (!enable_apicv) {
+ enable_ipiv = false;
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e6f3c6a153a0..b5cd1927b009 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -123,8 +123,8 @@ struct kvm_svm {
/* Struct members for AVIC */
u32 avic_vm_id;
- struct page *avic_logical_id_table_page;
- struct page *avic_physical_id_table_page;
+ u32 *avic_logical_id_table;
+ u64 *avic_physical_id_table;
struct hlist_node hnode;
struct kvm_sev_info sev_info;
@@ -306,14 +306,22 @@ struct vcpu_svm {
u32 ldr_reg;
u32 dfr_reg;
- struct page *avic_backing_page;
- u64 *avic_physical_id_cache;
+
+ /* This is essentially a shadow of the vCPU's actual entry in the
+ * Physical ID table that is programmed into the VMCB, i.e. that is
+ * seen by the CPU. If IPI virtualization is disabled, IsRunning is
+ * only ever set in the shadow, i.e. is never propagated to the "real"
+ * table, so that hardware never sees IsRunning=1.
+ */
+ u64 avic_physical_id_entry;
/*
- * Per-vcpu list of struct amd_svm_iommu_ir:
- * This is used mainly to store interrupt remapping information used
- * when update the vcpu affinity. This avoids the need to scan for
- * IRTE and try to match ga_tag in the IOMMU driver.
+ * Per-vCPU list of irqfds that are eligible to post IRQs directly to
+ * the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass). The list
+ * is used to reconfigure IRTEs when the vCPU is loaded/put (to set the
+ * target pCPU), when AVIC is toggled on/off (to (de)activate bypass),
+ * and if the irqfd becomes ineligible for posting (to put the IRTE
+ * back into remapped mode).
*/
struct list_head ir_list;
spinlock_t ir_list_lock;
@@ -721,7 +729,8 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \
- BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \
+ BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG) \
)
bool avic_hardware_setup(void);
@@ -736,8 +745,9 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void avic_vcpu_put(struct kvm_vcpu *vcpu);
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
-int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_ring_doorbell(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index ba736cbb0587..57d79fd31df0 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -260,6 +260,86 @@ TRACE_EVENT(kvm_cpuid,
__entry->used_max_basic ? ", used max basic" : "")
);
+#define kvm_deliver_mode \
+ {0x0, "Fixed"}, \
+ {0x1, "LowPrio"}, \
+ {0x2, "SMI"}, \
+ {0x3, "Res3"}, \
+ {0x4, "NMI"}, \
+ {0x5, "INIT"}, \
+ {0x6, "SIPI"}, \
+ {0x7, "ExtINT"}
+
+#ifdef CONFIG_KVM_IOAPIC
+TRACE_EVENT(kvm_ioapic_set_irq,
+ TP_PROTO(__u64 e, int pin, bool coalesced),
+ TP_ARGS(e, pin, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u64, e )
+ __field( int, pin )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->e = e;
+ __entry->pin = pin;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
+ __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
+ __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->e & (1<<11)) ? "logical" : "physical",
+ (__entry->e & (1<<15)) ? "level" : "edge",
+ (__entry->e & (1<<16)) ? "|masked" : "",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
+ TP_PROTO(__u64 e),
+ TP_ARGS(e),
+
+ TP_STRUCT__entry(
+ __field( __u64, e )
+ ),
+
+ TP_fast_assign(
+ __entry->e = e;
+ ),
+
+ TP_printk("dst %x vec %u (%s|%s|%s%s)",
+ (u8)(__entry->e >> 56), (u8)__entry->e,
+ __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->e & (1<<11)) ? "logical" : "physical",
+ (__entry->e & (1<<15)) ? "level" : "edge",
+ (__entry->e & (1<<16)) ? "|masked" : "")
+);
+#endif
+
+TRACE_EVENT(kvm_msi_set_irq,
+ TP_PROTO(__u64 address, __u64 data),
+ TP_ARGS(address, data),
+
+ TP_STRUCT__entry(
+ __field( __u64, address )
+ __field( __u64, data )
+ ),
+
+ TP_fast_assign(
+ __entry->address = address;
+ __entry->data = data;
+ ),
+
+ TP_printk("dst %llx vec %u (%s|%s|%s%s)",
+ (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
+ (u8)__entry->data,
+ __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->address & (1<<2)) ? "logical" : "physical",
+ (__entry->data & (1<<15)) ? "level" : "edge",
+ (__entry->address & (1<<3)) ? "|rh" : "")
+);
+
#define AREG(x) { APIC_##x, "APIC_" #x }
#define kvm_trace_symbol_apic \
@@ -1096,37 +1176,32 @@ TRACE_EVENT(kvm_smm_transition,
* Tracepoint for VT-d posted-interrupts and AMD-Vi Guest Virtual APIC.
*/
TRACE_EVENT(kvm_pi_irte_update,
- TP_PROTO(unsigned int host_irq, unsigned int vcpu_id,
- unsigned int gsi, unsigned int gvec,
- u64 pi_desc_addr, bool set),
- TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set),
+ TP_PROTO(unsigned int host_irq, struct kvm_vcpu *vcpu,
+ unsigned int gsi, unsigned int gvec, bool set),
+ TP_ARGS(host_irq, vcpu, gsi, gvec, set),
TP_STRUCT__entry(
__field( unsigned int, host_irq )
- __field( unsigned int, vcpu_id )
+ __field( int, vcpu_id )
__field( unsigned int, gsi )
__field( unsigned int, gvec )
- __field( u64, pi_desc_addr )
__field( bool, set )
),
TP_fast_assign(
__entry->host_irq = host_irq;
- __entry->vcpu_id = vcpu_id;
+ __entry->vcpu_id = vcpu ? vcpu->vcpu_id : -1;
__entry->gsi = gsi;
__entry->gvec = gvec;
- __entry->pi_desc_addr = pi_desc_addr;
__entry->set = set;
),
- TP_printk("PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
- "gvec: 0x%x, pi_desc_addr: 0x%llx",
+ TP_printk("PI is %s for irq %u, vcpu %d, gsi: 0x%x, gvec: 0x%x",
__entry->set ? "enabled and being updated" : "disabled",
__entry->host_irq,
__entry->vcpu_id,
__entry->gsi,
- __entry->gvec,
- __entry->pi_desc_addr)
+ __entry->gvec)
);
/*
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index cb6588238f46..5316c27f6099 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -15,7 +15,6 @@ extern bool __read_mostly enable_ept;
extern bool __read_mostly enable_unrestricted_guest;
extern bool __read_mostly enable_ept_ad_bits;
extern bool __read_mostly enable_pml;
-extern bool __read_mostly enable_ipiv;
extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index d1e02e567b57..a986fc45145e 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -1014,7 +1014,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.nested_ops = &vmx_nested_ops,
.pi_update_irte = vmx_pi_update_irte,
- .pi_start_assignment = vmx_pi_start_assignment,
+ .pi_start_bypass = vmx_pi_start_bypass,
#ifdef CONFIG_X86_64
.set_hv_timer = vt_op(set_hv_timer),
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 5c615e5845bf..4a6d9a17da23 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -2,6 +2,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
#include <asm/irq_remapping.h>
#include <asm/cpu.h>
@@ -72,13 +73,10 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
/*
* If the vCPU wasn't on the wakeup list and wasn't migrated, then the
* full update can be skipped as neither the vector nor the destination
- * needs to be changed.
+ * needs to be changed. Clear SN even if there is no assigned device,
+ * again for simplicity.
*/
if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
- /*
- * Clear SN if it was set due to being preempted. Again, do
- * this even if there is no assigned device for simplicity.
- */
if (pi_test_and_clear_sn(pi_desc))
goto after_clear_sn;
return;
@@ -148,8 +146,13 @@ after_clear_sn:
static bool vmx_can_use_vtd_pi(struct kvm *kvm)
{
+ /*
+ * Note, reading the number of possible bypass IRQs can race with a
+ * bypass IRQ being attached to the VM. vmx_pi_start_bypass() ensures
+ * blockng vCPUs will see an elevated count or get KVM_REQ_UNBLOCK.
+ */
return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() &&
- kvm_arch_has_assigned_device(kvm);
+ READ_ONCE(kvm->arch.nr_possible_bypass_irqs);
}
/*
@@ -224,17 +227,23 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
if (!vmx_needs_pi_wakeup(vcpu))
return;
- if (kvm_vcpu_is_blocking(vcpu) &&
+ /*
+ * If the vCPU is blocking with IRQs enabled and ISN'T being preempted,
+ * enable the wakeup handler so that notification IRQ wakes the vCPU as
+ * expected. There is no need to enable the wakeup handler if the vCPU
+ * is preempted between setting its wait state and manually scheduling
+ * out, as the task is still runnable, i.e. doesn't need a wake event
+ * from KVM to be scheduled in.
+ *
+ * If the wakeup handler isn't being enabled, Suppress Notifications as
+ * the cost of propagating PIR.IRR to PID.ON is negligible compared to
+ * the cost of a spurious IRQ, and vCPU put/load is a slow path.
+ */
+ if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) &&
((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) ||
(!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu))))
pi_enable_wakeup_handler(vcpu);
-
- /*
- * Set SN when the vCPU is preempted. Note, the vCPU can both be seen
- * as blocking and preempted, e.g. if it's preempted between setting
- * its wait state and manually scheduling out.
- */
- if (vcpu->preempted)
+ else
pi_set_sn(pi_desc);
}
@@ -281,99 +290,30 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
/*
- * Bail out of the block loop if the VM has an assigned
- * device, but the blocking vCPU didn't reconfigure the
- * PI.NV to the wakeup vector, i.e. the assigned device
- * came along after the initial check in vmx_vcpu_pi_put().
+ * Kick all vCPUs when the first possible bypass IRQ is attached to a VM, as
+ * blocking vCPUs may scheduled out without reconfiguring PID.NV to the wakeup
+ * vector, i.e. if the bypass IRQ came along after vmx_vcpu_pi_put().
*/
-void vmx_pi_start_assignment(struct kvm *kvm)
+void vmx_pi_start_bypass(struct kvm *kvm)
{
- if (!kvm_arch_has_irq_bypass())
+ if (WARN_ON_ONCE(!vmx_can_use_vtd_pi(kvm)))
return;
kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
}
-/*
- * vmx_pi_update_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector)
{
- struct kvm_kernel_irq_routing_entry *e;
- struct kvm_irq_routing_table *irq_rt;
- bool enable_remapped_mode = true;
- struct kvm_lapic_irq irq;
- struct kvm_vcpu *vcpu;
- struct vcpu_data vcpu_info;
- int idx, ret = 0;
-
- if (!vmx_can_use_vtd_pi(kvm))
- return 0;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- if (guest_irq >= irq_rt->nr_rt_entries ||
- hlist_empty(&irq_rt->map[guest_irq])) {
- pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
- guest_irq, irq_rt->nr_rt_entries);
- goto out;
+ if (vcpu) {
+ struct intel_iommu_pi_data pi_data = {
+ .pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)),
+ .vector = vector,
+ };
+
+ return irq_set_vcpu_affinity(host_irq, &pi_data);
+ } else {
+ return irq_set_vcpu_affinity(host_irq, NULL);
}
-
- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
- if (e->type != KVM_IRQ_ROUTING_MSI)
- continue;
- /*
- * VT-d PI cannot support posting multicast/broadcast
- * interrupts to a vCPU, we still use interrupt remapping
- * for these kind of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with single CPU as the destination, e.g. user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- *
- * We will support full lowest-priority interrupt later.
- *
- * In addition, we can only inject generic interrupts using
- * the PI mechanism, refuse to route others through it.
- */
-
- kvm_set_msi_irq(kvm, e, &irq);
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq))
- continue;
-
- vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
- vcpu_info.vector = irq.vector;
-
- trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
- vcpu_info.vector, vcpu_info.pi_desc_addr, set);
-
- if (!set)
- continue;
-
- enable_remapped_mode = false;
-
- ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
- if (ret < 0) {
- printk(KERN_INFO "%s: failed to update PI IRTE\n",
- __func__);
- goto out;
- }
- }
-
- if (enable_remapped_mode)
- ret = irq_set_vcpu_affinity(host_irq, NULL);
-
- ret = 0;
-out:
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
}
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 80499ea0e674..a4af39948cf0 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -3,6 +3,9 @@
#define __KVM_X86_VMX_POSTED_INTR_H
#include <linux/bitmap.h>
+#include <linux/find.h>
+#include <linux/kvm_host.h>
+
#include <asm/posted_intr.h>
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
@@ -11,9 +14,10 @@ void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
-void vmx_pi_start_assignment(struct kvm *kvm);
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
+void vmx_pi_start_bypass(struct kvm *kvm);
static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
{
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 191a9ed0da22..5f37248b5d8b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -113,8 +113,6 @@ static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, 0444);
module_param(enable_apicv, bool, 0444);
-
-bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);
module_param(enable_device_posted_irqs, bool, 0444);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 93636f77c42d..ac961153a736 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -226,6 +226,9 @@ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
bool __read_mostly enable_apicv = true;
EXPORT_SYMBOL_GPL(enable_apicv);
+bool __read_mostly enable_ipiv = true;
+EXPORT_SYMBOL_GPL(enable_ipiv);
+
bool __read_mostly enable_device_posted_irqs = true;
EXPORT_SYMBOL_GPL(enable_device_posted_irqs);
@@ -4634,17 +4637,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_EXT_CPUID:
case KVM_CAP_EXT_EMUL_CPUID:
case KVM_CAP_CLOCKSOURCE:
+#ifdef CONFIG_KVM_IOAPIC
case KVM_CAP_PIT:
+ case KVM_CAP_PIT2:
+ case KVM_CAP_PIT_STATE2:
+ case KVM_CAP_REINJECT_CONTROL:
+#endif
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
case KVM_CAP_USER_NMI:
- case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_IOEVENTFD:
case KVM_CAP_IOEVENTFD_NO_LENGTH:
- case KVM_CAP_PIT2:
- case KVM_CAP_PIT_STATE2:
+
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
case KVM_CAP_VCPU_EVENTS:
#ifdef CONFIG_KVM_HYPERV
@@ -6401,135 +6407,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return 0;
}
-static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- memcpy(&chip->chip.pic, &pic->pics[0],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- memcpy(&chip->chip.pic, &pic->pics[1],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_IOAPIC:
- kvm_get_ioapic(kvm, &chip->chip.ioapic);
- break;
- default:
- r = -EINVAL;
- break;
- }
- return r;
-}
-
-static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- spin_lock(&pic->lock);
- memcpy(&pic->pics[0], &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- spin_unlock(&pic->lock);
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- spin_lock(&pic->lock);
- memcpy(&pic->pics[1], &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- spin_unlock(&pic->lock);
- break;
- case KVM_IRQCHIP_IOAPIC:
- kvm_set_ioapic(kvm, &chip->chip.ioapic);
- break;
- default:
- r = -EINVAL;
- break;
- }
- kvm_pic_update_irq(pic);
- return r;
-}
-
-static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
- struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
-
- BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
-
- mutex_lock(&kps->lock);
- memcpy(ps, &kps->channels, sizeof(*ps));
- mutex_unlock(&kps->lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
- int i;
- struct kvm_pit *pit = kvm->arch.vpit;
-
- mutex_lock(&pit->pit_state.lock);
- memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
- for (i = 0; i < 3; i++)
- kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
- mutex_unlock(&pit->pit_state.lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
- sizeof(ps->channels));
- ps->flags = kvm->arch.vpit->pit_state.flags;
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- memset(&ps->reserved, 0, sizeof(ps->reserved));
- return 0;
-}
-
-static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
- int start = 0;
- int i;
- u32 prev_legacy, cur_legacy;
- struct kvm_pit *pit = kvm->arch.vpit;
-
- mutex_lock(&pit->pit_state.lock);
- prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
- cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
- if (!prev_legacy && cur_legacy)
- start = 1;
- memcpy(&pit->pit_state.channels, &ps->channels,
- sizeof(pit->pit_state.channels));
- pit->pit_state.flags = ps->flags;
- for (i = 0; i < 3; i++)
- kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
- start && i == 0);
- mutex_unlock(&pit->pit_state.lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_reinject(struct kvm *kvm,
- struct kvm_reinject_control *control)
-{
- struct kvm_pit *pit = kvm->arch.vpit;
-
- /* pit->pit_state.lock was overloaded to prevent userspace from getting
- * an inconsistent state after running multiple KVM_REINJECT_CONTROL
- * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
- */
- mutex_lock(&pit->pit_state.lock);
- kvm_pit_set_reinject(pit, control->pit_reinject);
- mutex_unlock(&pit->pit_state.lock);
-
- return 0;
-}
-
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
@@ -6549,18 +6426,6 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
kvm_vcpu_kick(vcpu);
}
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
- bool line_status)
-{
- if (!irqchip_in_kernel(kvm))
- return -ENXIO;
-
- irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event->irq, irq_event->level,
- line_status);
- return 0;
-}
-
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
@@ -7072,9 +6937,11 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r = -ENOTTY;
+
+#ifdef CONFIG_KVM_IOAPIC
/*
* This union makes it completely explicit to gcc-3.x
- * that these two variables' stack usage should be
+ * that these three variables' stack usage should be
* combined, not added together.
*/
union {
@@ -7082,6 +6949,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
struct kvm_pit_state2 ps2;
struct kvm_pit_config pit_config;
} u;
+#endif
switch (ioctl) {
case KVM_SET_TSS_ADDR:
@@ -7105,6 +6973,7 @@ set_identity_unlock:
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
break;
+#ifdef CONFIG_KVM_IOAPIC
case KVM_CREATE_IRQCHIP: {
mutex_lock(&kvm->lock);
@@ -7126,7 +6995,7 @@ set_identity_unlock:
goto create_irqchip_unlock;
}
- r = kvm_setup_default_irq_routing(kvm);
+ r = kvm_setup_default_ioapic_and_pic_routing(kvm);
if (r) {
kvm_ioapic_destroy(kvm);
kvm_pic_destroy(kvm);
@@ -7174,7 +7043,7 @@ set_identity_unlock:
}
r = -ENXIO;
- if (!irqchip_kernel(kvm))
+ if (!irqchip_full(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
@@ -7198,7 +7067,7 @@ set_identity_unlock:
}
r = -ENXIO;
- if (!irqchip_kernel(kvm))
+ if (!irqchip_full(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
set_irqchip_out:
@@ -7271,6 +7140,7 @@ set_pit2_out:
r = kvm_vm_ioctl_reinject(kvm, &control);
break;
}
+#endif
case KVM_SET_BOOT_CPU_ID:
r = 0;
mutex_lock(&kvm->lock);
@@ -10730,8 +10600,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
+#ifdef CONFIG_KVM_IOAPIC
else if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+#endif
if (is_guest_mode(vcpu))
vcpu->arch.load_eoi_exitmap_pending = true;
@@ -12801,15 +12673,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
goto out_uninit_mmu;
- INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
- /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
- set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
- /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
- set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
- &kvm->arch.irq_sources_bitmap);
-
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
@@ -12940,7 +12805,9 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
+#ifdef CONFIG_KVM_IOAPIC
kvm_free_pit(kvm);
+#endif
kvm_mmu_pre_destroy_vm(kvm);
static_call_cond(kvm_x86_vm_pre_destroy)(kvm);
@@ -12964,8 +12831,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
}
kvm_destroy_vcpus(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
+#ifdef CONFIG_KVM_IOAPIC
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
+#endif
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
@@ -13577,8 +13446,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
void kvm_arch_start_assignment(struct kvm *kvm)
{
- if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
- kvm_x86_call(pi_start_assignment)(kvm);
+ atomic_inc(&kvm->arch.assigned_device_count);
}
EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
@@ -13629,77 +13497,6 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
-int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
- struct irq_bypass_producer *prod)
-{
- struct kvm_kernel_irqfd *irqfd =
- container_of(cons, struct kvm_kernel_irqfd, consumer);
- struct kvm *kvm = irqfd->kvm;
- int ret;
-
- kvm_arch_start_assignment(irqfd->kvm);
-
- spin_lock_irq(&kvm->irqfds.lock);
- irqfd->producer = prod;
-
- ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
- prod->irq, irqfd->gsi, 1);
- if (ret)
- kvm_arch_end_assignment(irqfd->kvm);
-
- spin_unlock_irq(&kvm->irqfds.lock);
-
-
- return ret;
-}
-
-void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
- struct irq_bypass_producer *prod)
-{
- int ret;
- struct kvm_kernel_irqfd *irqfd =
- container_of(cons, struct kvm_kernel_irqfd, consumer);
- struct kvm *kvm = irqfd->kvm;
-
- WARN_ON(irqfd->producer != prod);
-
- /*
- * When producer of consumer is unregistered, we change back to
- * remapped mode, so we can re-use the current implementation
- * when the irq is masked/disabled or the consumer side (KVM
- * int this case doesn't want to receive the interrupts.
- */
- spin_lock_irq(&kvm->irqfds.lock);
- irqfd->producer = NULL;
-
- ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
- prod->irq, irqfd->gsi, 0);
- if (ret)
- printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
- " fails: %d\n", irqfd->consumer.token, ret);
-
- spin_unlock_irq(&kvm->irqfds.lock);
-
-
- kvm_arch_end_assignment(irqfd->kvm);
-}
-
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set);
-}
-
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
- struct kvm_kernel_irq_routing_entry *new)
-{
- if (old->type != KVM_IRQ_ROUTING_MSI ||
- new->type != KVM_IRQ_ROUTING_MSI)
- return true;
-
- return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
-}
-
bool kvm_vector_hashing_enabled(void)
{
return vector_hashing;
@@ -14099,7 +13896,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 8dd22be2ca0b..b348928871c2 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -368,6 +368,14 @@ static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh,
container_of(polltbl, struct mshv_irqfd, irqfd_polltbl);
irqfd->irqfd_wqh = wqh;
+
+ /*
+ * TODO: Ensure there isn't already an exclusive, priority waiter, e.g.
+ * that the irqfd isn't already bound to another partition. Only the
+ * first exclusive waiter encountered will be notified, and
+ * add_wait_queue_priority() doesn't enforce exclusivity.
+ */
+ irqfd->irqfd_wait.flags |= WQ_FLAG_EXCLUSIVE;
add_wait_queue_priority(wqh, &irqfd->irqfd_wait);
}
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index ccbab3a4811a..053a0f05768c 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -1054,7 +1054,6 @@ struct irq_2_irte {
};
struct amd_ir_data {
- u32 cached_ga_tag;
struct amd_iommu *iommu;
struct irq_2_irte irq_2_irte;
struct msi_msg msi_entry;
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 3117d99cf83d..c50d4a8a51be 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -3804,13 +3804,70 @@ static const struct irq_domain_ops amd_ir_domain_ops = {
.deactivate = irq_remapping_deactivate,
};
-int amd_iommu_activate_guest_mode(void *data)
+static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu,
+ bool ga_log_intr)
+{
+ if (cpu >= 0) {
+ entry->lo.fields_vapic.destination =
+ APICID_TO_IRTE_DEST_LO(cpu);
+ entry->hi.fields.destination =
+ APICID_TO_IRTE_DEST_HI(cpu);
+ entry->lo.fields_vapic.is_run = true;
+ entry->lo.fields_vapic.ga_log_intr = false;
+ } else {
+ entry->lo.fields_vapic.is_run = false;
+ entry->lo.fields_vapic.ga_log_intr = ga_log_intr;
+ }
+}
+
+/*
+ * Update the pCPU information for an IRTE that is configured to post IRQs to
+ * a vCPU, without issuing an IOMMU invalidation for the IRTE.
+ *
+ * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination
+ * with the pCPU's APIC ID, set IsRun, and clear GALogIntr. If the vCPU isn't
+ * associated with a pCPU (@cpu < 0), clear IsRun and set/clear GALogIntr based
+ * on input from the caller (e.g. KVM only requests GALogIntr when the vCPU is
+ * blocking and requires a notification wake event). I.e. treat vCPUs that are
+ * associated with a pCPU as running. This API is intended to be used when a
+ * vCPU is scheduled in/out (or stops running for any reason), to do a fast
+ * update of IsRun, GALogIntr, and (conditionally) Destination.
+ *
+ * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached
+ * and thus don't require an invalidation to ensure the IOMMU consumes fresh
+ * information.
+ */
+int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr)
+{
+ struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
+ struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
+
+ if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
+ return -EINVAL;
+
+ if (!entry || !entry->lo.fields_vapic.guest_mode)
+ return 0;
+
+ if (!ir_data->iommu)
+ return -ENODEV;
+
+ __amd_iommu_update_ga(entry, cpu, ga_log_intr);
+
+ return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
+ ir_data->irq_2_irte.index, entry);
+}
+EXPORT_SYMBOL(amd_iommu_update_ga);
+
+int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr)
{
struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
u64 valid;
- if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry)
+ if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
+ return -EINVAL;
+
+ if (!entry)
return 0;
valid = entry->lo.fields_vapic.valid;
@@ -3820,11 +3877,12 @@ int amd_iommu_activate_guest_mode(void *data)
entry->lo.fields_vapic.valid = valid;
entry->lo.fields_vapic.guest_mode = 1;
- entry->lo.fields_vapic.ga_log_intr = 1;
entry->hi.fields.ga_root_ptr = ir_data->ga_root_ptr;
entry->hi.fields.vector = ir_data->ga_vector;
entry->lo.fields_vapic.ga_tag = ir_data->ga_tag;
+ __amd_iommu_update_ga(entry, cpu, ga_log_intr);
+
return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
ir_data->irq_2_irte.index, entry);
}
@@ -3837,8 +3895,10 @@ int amd_iommu_deactivate_guest_mode(void *data)
struct irq_cfg *cfg = ir_data->cfg;
u64 valid;
- if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
- !entry || !entry->lo.fields_vapic.guest_mode)
+ if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
+ return -EINVAL;
+
+ if (!entry || !entry->lo.fields_vapic.guest_mode)
return 0;
valid = entry->lo.fields_remap.valid;
@@ -3860,11 +3920,10 @@ int amd_iommu_deactivate_guest_mode(void *data)
}
EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
-static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
+static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info)
{
int ret;
- struct amd_iommu_pi_data *pi_data = vcpu_info;
- struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
+ struct amd_iommu_pi_data *pi_data = info;
struct amd_ir_data *ir_data = data->chip_data;
struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
struct iommu_dev_data *dev_data;
@@ -3885,25 +3944,20 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
return -EINVAL;
ir_data->cfg = irqd_cfg(data);
- pi_data->ir_data = ir_data;
- pi_data->prev_ga_tag = ir_data->cached_ga_tag;
- if (pi_data->is_guest_mode) {
- ir_data->ga_root_ptr = (pi_data->base >> 12);
- ir_data->ga_vector = vcpu_pi_info->vector;
+ if (pi_data) {
+ pi_data->ir_data = ir_data;
+
+ ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12);
+ ir_data->ga_vector = pi_data->vector;
ir_data->ga_tag = pi_data->ga_tag;
- ret = amd_iommu_activate_guest_mode(ir_data);
- if (!ret)
- ir_data->cached_ga_tag = pi_data->ga_tag;
+ if (pi_data->is_guest_mode)
+ ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu,
+ pi_data->ga_log_intr);
+ else
+ ret = amd_iommu_deactivate_guest_mode(ir_data);
} else {
ret = amd_iommu_deactivate_guest_mode(ir_data);
-
- /*
- * This communicates the ga_tag back to the caller
- * so that it can do all the necessary clean up.
- */
- if (!ret)
- ir_data->cached_ga_tag = 0;
}
return ret;
@@ -3995,29 +4049,4 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
return 0;
}
-
-int amd_iommu_update_ga(int cpu, bool is_run, void *data)
-{
- struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
- struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
-
- if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
- !entry || !entry->lo.fields_vapic.guest_mode)
- return 0;
-
- if (!ir_data->iommu)
- return -ENODEV;
-
- if (cpu >= 0) {
- entry->lo.fields_vapic.destination =
- APICID_TO_IRTE_DEST_LO(cpu);
- entry->hi.fields.destination =
- APICID_TO_IRTE_DEST_HI(cpu);
- }
- entry->lo.fields_vapic.is_run = is_run;
-
- return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
- ir_data->irq_2_irte.index, entry);
-}
-EXPORT_SYMBOL(amd_iommu_update_ga);
#endif
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index cf7b6882ec75..2fc451253dc3 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -1244,10 +1244,10 @@ static void intel_ir_compose_msi_msg(struct irq_data *irq_data,
static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
{
struct intel_ir_data *ir_data = data->chip_data;
- struct vcpu_data *vcpu_pi_info = info;
+ struct intel_iommu_pi_data *pi_data = info;
/* stop posting interrupts, back to the default mode */
- if (!vcpu_pi_info) {
+ if (!pi_data) {
__intel_ir_reconfigure_irte(data, true);
} else {
struct irte irte_pi;
@@ -1265,10 +1265,10 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
/* Update the posted mode fields */
irte_pi.p_pst = 1;
irte_pi.p_urgent = 0;
- irte_pi.p_vector = vcpu_pi_info->vector;
- irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >>
+ irte_pi.p_vector = pi_data->vector;
+ irte_pi.pda_l = (pi_data->pi_desc_addr >>
(32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT);
- irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) &
+ irte_pi.pda_h = (pi_data->pi_desc_addr >> 32) &
~(-1UL << PDA_HIGH_BIT);
ir_data->irq_2_iommu.posted_vcpu = true;
diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c
index 58c28895f8c4..8455b4a5fbb0 100644
--- a/drivers/irqchip/irq-gic-v4.c
+++ b/drivers/irqchip/irq-gic-v4.c
@@ -342,10 +342,10 @@ int its_get_vlpi(int irq, struct its_vlpi_map *map)
return irq_set_vcpu_affinity(irq, &info);
}
-int its_unmap_vlpi(int irq)
+void its_unmap_vlpi(int irq)
{
irq_clear_status_flags(irq, IRQ_DISABLE_UNLAZY);
- return irq_set_vcpu_affinity(irq, NULL);
+ WARN_ON_ONCE(irq_set_vcpu_affinity(irq, NULL));
}
int its_prop_update_vlpi(int irq, u8 config, bool inv)
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 565966351dfa..123298a4dc8f 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -505,15 +505,11 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev,
if (ret)
goto out_put_eventfd_ctx;
- ctx->producer.token = trigger;
- ctx->producer.irq = irq;
- ret = irq_bypass_register_producer(&ctx->producer);
+ ret = irq_bypass_register_producer(&ctx->producer, trigger, irq);
if (unlikely(ret)) {
dev_info(&pdev->dev,
- "irq bypass producer (token %p) registration fails: %d\n",
- ctx->producer.token, ret);
-
- ctx->producer.token = NULL;
+ "irq bypass producer (eventfd %p) registration fails: %d\n",
+ trigger, ret);
}
ctx->trigger = trigger;
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 5a49b5a6d496..af1e1fdfd9ed 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -212,11 +212,11 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid)
if (!vq->call_ctx.ctx)
return;
- vq->call_ctx.producer.irq = irq;
- ret = irq_bypass_register_producer(&vq->call_ctx.producer);
+ ret = irq_bypass_register_producer(&vq->call_ctx.producer,
+ vq->call_ctx.ctx, irq);
if (unlikely(ret))
- dev_info(&v->dev, "vq %u, irq bypass producer (token %p) registration fails, ret = %d\n",
- qid, vq->call_ctx.producer.token, ret);
+ dev_info(&v->dev, "vq %u, irq bypass producer (eventfd %p) registration fails, ret = %d\n",
+ qid, vq->call_ctx.ctx, ret);
}
static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, u16 qid)
@@ -712,7 +712,6 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
if (ops->get_status(vdpa) &
VIRTIO_CONFIG_S_DRIVER_OK)
vhost_vdpa_unsetup_vq_irq(v, idx);
- vq->call_ctx.producer.token = NULL;
}
break;
}
@@ -753,7 +752,6 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
cb.callback = vhost_vdpa_virtqueue_cb;
cb.private = vq;
cb.trigger = vq->call_ctx.ctx;
- vq->call_ctx.producer.token = vq->call_ctx.ctx;
if (ops->get_status(vdpa) &
VIRTIO_CONFIG_S_DRIVER_OK)
vhost_vdpa_setup_vq_irq(v, idx);
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 4a34f7f0a864..b2a04481de1a 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -434,7 +434,7 @@ struct kvm_kernel_irq_routing_entry;
int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int irq,
struct kvm_kernel_irq_routing_entry *irq_entry);
-int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq);
+void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq);
int vgic_v4_load(struct kvm_vcpu *vcpu);
void vgic_v4_commit(struct kvm_vcpu *vcpu);
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 062fbd4c9b77..8cced632ecd0 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -12,20 +12,6 @@
struct amd_iommu;
-/*
- * This is mainly used to communicate information back-and-forth
- * between SVM and IOMMU for setting up and tearing down posted
- * interrupt
- */
-struct amd_iommu_pi_data {
- u32 ga_tag;
- u32 prev_ga_tag;
- u64 base;
- bool is_guest_mode;
- struct vcpu_data *vcpu_data;
- void *ir_data;
-};
-
#ifdef CONFIG_AMD_IOMMU
struct task_struct;
@@ -44,10 +30,8 @@ static inline void amd_iommu_detect(void) { }
/* IOMMU AVIC Function */
extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32));
-extern int
-amd_iommu_update_ga(int cpu, bool is_run, void *data);
-
-extern int amd_iommu_activate_guest_mode(void *data);
+extern int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr);
+extern int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr);
extern int amd_iommu_deactivate_guest_mode(void *data);
#else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */
@@ -58,13 +42,12 @@ amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
return 0;
}
-static inline int
-amd_iommu_update_ga(int cpu, bool is_run, void *data)
+static inline int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr)
{
return 0;
}
-static inline int amd_iommu_activate_guest_mode(void *data)
+static inline int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr)
{
return 0;
}
diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h
index 9bdb2a781841..ede1fa938152 100644
--- a/include/linux/irqbypass.h
+++ b/include/linux/irqbypass.h
@@ -10,6 +10,7 @@
#include <linux/list.h>
+struct eventfd_ctx;
struct irq_bypass_consumer;
/*
@@ -18,20 +19,22 @@ struct irq_bypass_consumer;
* The IRQ bypass manager is a simple set of lists and callbacks that allows
* IRQ producers (ex. physical interrupt sources) to be matched to IRQ
* consumers (ex. virtualization hardware that allows IRQ bypass or offload)
- * via a shared token (ex. eventfd_ctx). Producers and consumers register
- * independently. When a token match is found, the optional @stop callback
- * will be called for each participant. The pair will then be connected via
- * the @add_* callbacks, and finally the optional @start callback will allow
- * any final coordination. When either participant is unregistered, the
- * process is repeated using the @del_* callbacks in place of the @add_*
- * callbacks. Match tokens must be unique per producer/consumer, 1:N pairings
- * are not supported.
+ * via a shared eventfd_ctx. Producers and consumers register independently.
+ * When a producer and consumer are paired, i.e. an eventfd match is found, the
+ * optional @stop callback will be called for each participant. The pair will
+ * then be connected via the @add_* callbacks, and finally the optional @start
+ * callback will allow any final coordination. When either participant is
+ * unregistered, the process is repeated using the @del_* callbacks in place of
+ * the @add_* callbacks. eventfds must be unique per producer/consumer, 1:N
+ * pairings are not supported.
*/
+struct irq_bypass_consumer;
+
/**
* struct irq_bypass_producer - IRQ bypass producer definition
- * @node: IRQ bypass manager private list management
- * @token: opaque token to match between producer and consumer (non-NULL)
+ * @eventfd: eventfd context used to match producers and consumers
+ * @consumer: The connected consumer (NULL if no connection)
* @irq: Linux IRQ number for the producer device
* @add_consumer: Connect the IRQ producer to an IRQ consumer (optional)
* @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional)
@@ -43,8 +46,8 @@ struct irq_bypass_consumer;
* for a physical device assigned to a VM.
*/
struct irq_bypass_producer {
- struct list_head node;
- void *token;
+ struct eventfd_ctx *eventfd;
+ struct irq_bypass_consumer *consumer;
int irq;
int (*add_consumer)(struct irq_bypass_producer *,
struct irq_bypass_consumer *);
@@ -56,8 +59,8 @@ struct irq_bypass_producer {
/**
* struct irq_bypass_consumer - IRQ bypass consumer definition
- * @node: IRQ bypass manager private list management
- * @token: opaque token to match between producer and consumer (non-NULL)
+ * @eventfd: eventfd context used to match producers and consumers
+ * @producer: The connected producer (NULL if no connection)
* @add_producer: Connect the IRQ consumer to an IRQ producer
* @del_producer: Disconnect the IRQ consumer from an IRQ producer
* @stop: Perform any quiesce operations necessary prior to add/del (optional)
@@ -69,8 +72,9 @@ struct irq_bypass_producer {
* portions of the interrupt handling to the VM.
*/
struct irq_bypass_consumer {
- struct list_head node;
- void *token;
+ struct eventfd_ctx *eventfd;
+ struct irq_bypass_producer *producer;
+
int (*add_producer)(struct irq_bypass_consumer *,
struct irq_bypass_producer *);
void (*del_producer)(struct irq_bypass_consumer *,
@@ -79,9 +83,11 @@ struct irq_bypass_consumer {
void (*start)(struct irq_bypass_consumer *);
};
-int irq_bypass_register_producer(struct irq_bypass_producer *);
-void irq_bypass_unregister_producer(struct irq_bypass_producer *);
-int irq_bypass_register_consumer(struct irq_bypass_consumer *);
-void irq_bypass_unregister_consumer(struct irq_bypass_consumer *);
+int irq_bypass_register_producer(struct irq_bypass_producer *producer,
+ struct eventfd_ctx *eventfd, int irq);
+void irq_bypass_unregister_producer(struct irq_bypass_producer *producer);
+int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer,
+ struct eventfd_ctx *eventfd);
+void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer);
#endif /* IRQBYPASS_H */
diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h
index 7f1f11a5e4e4..0b0887099fd7 100644
--- a/include/linux/irqchip/arm-gic-v4.h
+++ b/include/linux/irqchip/arm-gic-v4.h
@@ -146,7 +146,7 @@ int its_commit_vpe(struct its_vpe *vpe);
int its_invall_vpe(struct its_vpe *vpe);
int its_map_vlpi(int irq, struct its_vlpi_map *map);
int its_get_vlpi(int irq, struct its_vlpi_map *map);
-int its_unmap_vlpi(int irq);
+void its_unmap_vlpi(int irq);
int its_prop_update_vlpi(int irq, u8 config, bool inv);
int its_prop_update_vsgi(int irq, u8 priority, bool group);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3bde4fb5c6aa..fb9ec06aa807 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -190,6 +190,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
#define KVM_USERSPACE_IRQ_SOURCE_ID 0
#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
+#define KVM_PIT_IRQ_SOURCE_ID 2
extern struct mutex kvm_lock;
extern struct list_head vm_list;
@@ -1022,16 +1023,12 @@ void kvm_unlock_all_vcpus(struct kvm *kvm);
void vcpu_load(struct kvm_vcpu *vcpu);
void vcpu_put(struct kvm_vcpu *vcpu);
-#ifdef __KVM_HAVE_IOAPIC
+#ifdef CONFIG_KVM_IOAPIC
void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm);
-void kvm_arch_post_irq_routing_update(struct kvm *kvm);
#else
static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
{
}
-static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm)
-{
-}
#endif
#ifdef CONFIG_HAVE_KVM_IRQCHIP
@@ -1788,8 +1785,6 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian);
void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian);
-int kvm_request_irq_source_id(struct kvm *kvm);
-void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
/*
@@ -2406,6 +2401,8 @@ struct kvm_vcpu *kvm_get_running_vcpu(void);
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
+struct kvm_kernel_irqfd;
+
bool kvm_arch_has_irq_bypass(void);
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *,
struct irq_bypass_producer *);
@@ -2413,10 +2410,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *,
struct irq_bypass_producer *);
void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *);
void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *);
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *,
- struct kvm_kernel_irq_routing_entry *);
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new);
#endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
#ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS
diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
index 8ad43692e3bb..ef8c134ded8a 100644
--- a/include/linux/kvm_irqfd.h
+++ b/include/linux/kvm_irqfd.h
@@ -55,10 +55,13 @@ struct kvm_kernel_irqfd {
/* Used for setup/shutdown */
struct eventfd_ctx *eventfd;
struct list_head list;
- poll_table pt;
struct work_struct shutdown;
struct irq_bypass_consumer consumer;
struct irq_bypass_producer *producer;
+
+ struct kvm_vcpu *irq_bypass_vcpu;
+ struct list_head vcpu_list;
+ void *irq_bypass_data;
};
#endif /* __LINUX_KVM_IRQFD_H */
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 965a19809c7e..09855d819418 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -164,6 +164,8 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+extern int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head,
+ struct wait_queue_entry *wq_entry);
extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index fc7d0f8ff078..0b6b79b1a1bc 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -82,95 +82,15 @@ TRACE_EVENT(kvm_set_irq,
TP_printk("gsi %u level %d source %d",
__entry->gsi, __entry->level, __entry->irq_source_id)
);
-#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */
-
-#if defined(__KVM_HAVE_IOAPIC)
-#define kvm_deliver_mode \
- {0x0, "Fixed"}, \
- {0x1, "LowPrio"}, \
- {0x2, "SMI"}, \
- {0x3, "Res3"}, \
- {0x4, "NMI"}, \
- {0x5, "INIT"}, \
- {0x6, "SIPI"}, \
- {0x7, "ExtINT"}
-
-TRACE_EVENT(kvm_ioapic_set_irq,
- TP_PROTO(__u64 e, int pin, bool coalesced),
- TP_ARGS(e, pin, coalesced),
-
- TP_STRUCT__entry(
- __field( __u64, e )
- __field( int, pin )
- __field( bool, coalesced )
- ),
-
- TP_fast_assign(
- __entry->e = e;
- __entry->pin = pin;
- __entry->coalesced = coalesced;
- ),
-
- TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
- __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
- __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
- (__entry->e & (1<<11)) ? "logical" : "physical",
- (__entry->e & (1<<15)) ? "level" : "edge",
- (__entry->e & (1<<16)) ? "|masked" : "",
- __entry->coalesced ? " (coalesced)" : "")
-);
-
-TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
- TP_PROTO(__u64 e),
- TP_ARGS(e),
-
- TP_STRUCT__entry(
- __field( __u64, e )
- ),
- TP_fast_assign(
- __entry->e = e;
- ),
-
- TP_printk("dst %x vec %u (%s|%s|%s%s)",
- (u8)(__entry->e >> 56), (u8)__entry->e,
- __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
- (__entry->e & (1<<11)) ? "logical" : "physical",
- (__entry->e & (1<<15)) ? "level" : "edge",
- (__entry->e & (1<<16)) ? "|masked" : "")
-);
-
-TRACE_EVENT(kvm_msi_set_irq,
- TP_PROTO(__u64 address, __u64 data),
- TP_ARGS(address, data),
-
- TP_STRUCT__entry(
- __field( __u64, address )
- __field( __u64, data )
- ),
-
- TP_fast_assign(
- __entry->address = address;
- __entry->data = data;
- ),
-
- TP_printk("dst %llx vec %u (%s|%s|%s%s)",
- (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
- (u8)__entry->data,
- __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
- (__entry->address & (1<<2)) ? "logical" : "physical",
- (__entry->data & (1<<15)) ? "level" : "edge",
- (__entry->address & (1<<3)) ? "|rh" : "")
-);
+#ifdef CONFIG_KVM_IOAPIC
#define kvm_irqchips \
{KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \
{KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \
{KVM_IRQCHIP_IOAPIC, "IOAPIC"}
-#endif /* defined(__KVM_HAVE_IOAPIC) */
-
-#if defined(CONFIG_HAVE_KVM_IRQCHIP)
+#endif /* CONFIG_KVM_IOAPIC */
#ifdef kvm_irqchips
#define kvm_ack_irq_string "irqchip %s pin %u"
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 51e38f5f4701..15632c89c4f2 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -40,13 +40,31 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_
{
unsigned long flags;
- wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+ wq_entry->flags |= WQ_FLAG_PRIORITY;
spin_lock_irqsave(&wq_head->lock, flags);
__add_wait_queue(wq_head, wq_entry);
spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL_GPL(add_wait_queue_priority);
+int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head,
+ struct wait_queue_entry *wq_entry)
+{
+ struct list_head *head = &wq_head->head;
+
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+
+ guard(spinlock_irqsave)(&wq_head->lock);
+
+ if (!list_empty(head) &&
+ (list_first_entry(head, typeof(*wq_entry), entry)->flags & WQ_FLAG_PRIORITY))
+ return -EBUSY;
+
+ list_add(&wq_entry->entry, head);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(add_wait_queue_priority_exclusive);
+
void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
@@ -64,7 +82,7 @@ EXPORT_SYMBOL(remove_wait_queue);
* the non-exclusive tasks. Normally, exclusive tasks will be at the end of
* the list and any non-exclusive tasks will be woken first. A priority task
* may be at the head of the list, and can consume the event without any other
- * tasks being woken.
+ * tasks being woken if it's also an exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 38b95998e1e6..028456f1aae1 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -59,6 +59,7 @@ TEST_PROGS_x86 += x86/nx_huge_pages_test.sh
TEST_GEN_PROGS_COMMON = demand_paging_test
TEST_GEN_PROGS_COMMON += dirty_log_test
TEST_GEN_PROGS_COMMON += guest_print_test
+TEST_GEN_PROGS_COMMON += irqfd_test
TEST_GEN_PROGS_COMMON += kvm_binary_stats_test
TEST_GEN_PROGS_COMMON += kvm_create_max_vcpus
TEST_GEN_PROGS_COMMON += kvm_page_table_test
diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c
index f4ac28d53747..a09dd423c2d7 100644
--- a/tools/testing/selftests/kvm/arm64/vgic_irq.c
+++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c
@@ -620,18 +620,12 @@ static void kvm_routing_and_irqfd_check(struct kvm_vm *vm,
* that no actual interrupt was injected for those cases.
*/
- for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) {
- fd[f] = eventfd(0, 0);
- TEST_ASSERT(fd[f] != -1, __KVM_SYSCALL_ERROR("eventfd()", fd[f]));
- }
+ for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++)
+ fd[f] = kvm_new_eventfd();
for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) {
- struct kvm_irqfd irqfd = {
- .fd = fd[f],
- .gsi = i - MIN_SPI,
- };
assert(i <= (uint64_t)UINT_MAX);
- vm_ioctl(vm, KVM_IRQFD, &irqfd);
+ kvm_assign_irqfd(vm, i - MIN_SPI, fd[f]);
}
for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) {
diff --git a/tools/testing/selftests/kvm/config b/tools/testing/selftests/kvm/config
index 8835fed09e9f..96d874b239eb 100644
--- a/tools/testing/selftests/kvm/config
+++ b/tools/testing/selftests/kvm/config
@@ -1,5 +1,6 @@
CONFIG_KVM=y
CONFIG_KVM_INTEL=y
CONFIG_KVM_AMD=y
+CONFIG_EVENTFD=y
CONFIG_USERFAULTFD=y
CONFIG_IDLE_PAGE_TRACKING=y
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index bee65ca08721..1ddf85eebd1d 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -18,6 +18,7 @@
#include <asm/atomic.h>
#include <asm/kvm.h>
+#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include "kvm_util_arch.h"
@@ -502,6 +503,45 @@ static inline int vm_get_stats_fd(struct kvm_vm *vm)
return fd;
}
+static inline int __kvm_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd,
+ uint32_t flags)
+{
+ struct kvm_irqfd irqfd = {
+ .fd = eventfd,
+ .gsi = gsi,
+ .flags = flags,
+ .resamplefd = -1,
+ };
+
+ return __vm_ioctl(vm, KVM_IRQFD, &irqfd);
+}
+
+static inline void kvm_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd,
+ uint32_t flags)
+{
+ int ret = __kvm_irqfd(vm, gsi, eventfd, flags);
+
+ TEST_ASSERT_VM_VCPU_IOCTL(!ret, KVM_IRQFD, ret, vm);
+}
+
+static inline void kvm_assign_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd)
+{
+ kvm_irqfd(vm, gsi, eventfd, 0);
+}
+
+static inline void kvm_deassign_irqfd(struct kvm_vm *vm, uint32_t gsi, int eventfd)
+{
+ kvm_irqfd(vm, gsi, eventfd, KVM_IRQFD_FLAG_DEASSIGN);
+}
+
+static inline int kvm_new_eventfd(void)
+{
+ int fd = eventfd(0, 0);
+
+ TEST_ASSERT(fd >= 0, __KVM_SYSCALL_ERROR("eventfd()", fd));
+ return fd;
+}
+
static inline void read_stats_header(int stats_fd, struct kvm_stats_header *header)
{
ssize_t ret;
diff --git a/tools/testing/selftests/kvm/irqfd_test.c b/tools/testing/selftests/kvm/irqfd_test.c
new file mode 100644
index 000000000000..7c301b4c7005
--- /dev/null
+++ b/tools/testing/selftests/kvm/irqfd_test.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <errno.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <stdint.h>
+#include <sys/sysinfo.h>
+
+#include "kvm_util.h"
+
+static struct kvm_vm *vm1;
+static struct kvm_vm *vm2;
+static int __eventfd;
+static bool done;
+
+/*
+ * KVM de-assigns based on eventfd *and* GSI, but requires unique eventfds when
+ * assigning (the API isn't symmetrical). Abuse the oddity and use a per-task
+ * GSI base to avoid false failures due to cross-task de-assign, i.e. so that
+ * the secondary doesn't de-assign the primary's eventfd and cause assign to
+ * unexpectedly succeed on the primary.
+ */
+#define GSI_BASE_PRIMARY 0x20
+#define GSI_BASE_SECONDARY 0x30
+
+static void juggle_eventfd_secondary(struct kvm_vm *vm, int eventfd)
+{
+ int r, i;
+
+ /*
+ * The secondary task can encounter EBADF since the primary can close
+ * the eventfd at any time. And because the primary can recreate the
+ * eventfd, at the safe fd in the file table, the secondary can also
+ * encounter "unexpected" success, e.g. if the close+recreate happens
+ * between the first and second assignments. The secondary's role is
+ * mostly to antagonize KVM, not to detect bugs.
+ */
+ for (i = 0; i < 2; i++) {
+ r = __kvm_irqfd(vm, GSI_BASE_SECONDARY, eventfd, 0);
+ TEST_ASSERT(!r || errno == EBUSY || errno == EBADF,
+ "Wanted success, EBUSY, or EBADF, r = %d, errno = %d",
+ r, errno);
+
+ /* De-assign should succeed unless the eventfd was closed. */
+ r = __kvm_irqfd(vm, GSI_BASE_SECONDARY + i, eventfd, KVM_IRQFD_FLAG_DEASSIGN);
+ TEST_ASSERT(!r || errno == EBADF,
+ "De-assign should succeed unless the fd was closed");
+ }
+}
+
+static void *secondary_irqfd_juggler(void *ign)
+{
+ while (!READ_ONCE(done)) {
+ juggle_eventfd_secondary(vm1, READ_ONCE(__eventfd));
+ juggle_eventfd_secondary(vm2, READ_ONCE(__eventfd));
+ }
+
+ return NULL;
+}
+
+static void juggle_eventfd_primary(struct kvm_vm *vm, int eventfd)
+{
+ int r1, r2;
+
+ /*
+ * At least one of the assigns should fail. KVM disallows assigning a
+ * single eventfd to multiple GSIs (or VMs), so it's possible that both
+ * assignments can fail, too.
+ */
+ r1 = __kvm_irqfd(vm, GSI_BASE_PRIMARY, eventfd, 0);
+ TEST_ASSERT(!r1 || errno == EBUSY,
+ "Wanted success or EBUSY, r = %d, errno = %d", r1, errno);
+
+ r2 = __kvm_irqfd(vm, GSI_BASE_PRIMARY + 1, eventfd, 0);
+ TEST_ASSERT(r1 || (r2 && errno == EBUSY),
+ "Wanted failure (EBUSY), r1 = %d, r2 = %d, errno = %d",
+ r1, r2, errno);
+
+ /*
+ * De-assign should always succeed, even if the corresponding assign
+ * failed.
+ */
+ kvm_irqfd(vm, GSI_BASE_PRIMARY, eventfd, KVM_IRQFD_FLAG_DEASSIGN);
+ kvm_irqfd(vm, GSI_BASE_PRIMARY + 1, eventfd, KVM_IRQFD_FLAG_DEASSIGN);
+}
+
+int main(int argc, char *argv[])
+{
+ pthread_t racing_thread;
+ int r, i;
+
+ /* Create "full" VMs, as KVM_IRQFD requires an in-kernel IRQ chip. */
+ vm1 = vm_create(1);
+ vm2 = vm_create(1);
+
+ WRITE_ONCE(__eventfd, kvm_new_eventfd());
+
+ kvm_irqfd(vm1, 10, __eventfd, 0);
+
+ r = __kvm_irqfd(vm1, 11, __eventfd, 0);
+ TEST_ASSERT(r && errno == EBUSY,
+ "Wanted EBUSY, r = %d, errno = %d", r, errno);
+
+ r = __kvm_irqfd(vm2, 12, __eventfd, 0);
+ TEST_ASSERT(r && errno == EBUSY,
+ "Wanted EBUSY, r = %d, errno = %d", r, errno);
+
+ /*
+ * De-assign all eventfds, along with multiple eventfds that were never
+ * assigned. KVM's ABI is that de-assign is allowed so long as the
+ * eventfd itself is valid.
+ */
+ kvm_irqfd(vm1, 11, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
+ kvm_irqfd(vm1, 12, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
+ kvm_irqfd(vm1, 13, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
+ kvm_irqfd(vm1, 14, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
+ kvm_irqfd(vm1, 10, READ_ONCE(__eventfd), KVM_IRQFD_FLAG_DEASSIGN);
+
+ close(__eventfd);
+
+ pthread_create(&racing_thread, NULL, secondary_irqfd_juggler, vm2);
+
+ for (i = 0; i < 10000; i++) {
+ WRITE_ONCE(__eventfd, kvm_new_eventfd());
+
+ juggle_eventfd_primary(vm1, __eventfd);
+ juggle_eventfd_primary(vm2, __eventfd);
+ close(__eventfd);
+ }
+
+ WRITE_ONCE(done, true);
+ pthread_join(racing_thread, NULL);
+}
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index a055343a7bf7..93a921deeb99 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1716,7 +1716,18 @@ void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa)
/* Create an interrupt controller chip for the specified VM. */
void vm_create_irqchip(struct kvm_vm *vm)
{
- vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL);
+ int r;
+
+ /*
+ * Allocate a fully in-kernel IRQ chip by default, but fall back to a
+ * split model (x86 only) if that fails (KVM x86 allows compiling out
+ * support for KVM_CREATE_IRQCHIP).
+ */
+ r = __vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL);
+ if (r && errno == ENOTTY && kvm_has_cap(KVM_CAP_SPLIT_IRQCHIP))
+ vm_enable_cap(vm, KVM_CAP_SPLIT_IRQCHIP, 24);
+ else
+ TEST_ASSERT_VM_VCPU_IOCTL(!r, KVM_CREATE_IRQCHIP, r, vm);
vm->has_irqchip = true;
}
diff --git a/tools/testing/selftests/kvm/x86/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86/xen_shinfo_test.c
index 287829f850f7..23909b501ac2 100644
--- a/tools/testing/selftests/kvm/x86/xen_shinfo_test.c
+++ b/tools/testing/selftests/kvm/x86/xen_shinfo_test.c
@@ -547,15 +547,9 @@ int main(int argc, char *argv[])
int irq_fd[2] = { -1, -1 };
if (do_eventfd_tests) {
- irq_fd[0] = eventfd(0, 0);
- irq_fd[1] = eventfd(0, 0);
+ irq_fd[0] = kvm_new_eventfd();
+ irq_fd[1] = kvm_new_eventfd();
- /* Unexpected, but not a KVM failure */
- if (irq_fd[0] == -1 || irq_fd[1] == -1)
- do_evtchn_tests = do_eventfd_tests = false;
- }
-
- if (do_eventfd_tests) {
irq_routes.info.nr = 2;
irq_routes.entries[0].gsi = 32;
@@ -572,15 +566,8 @@ int main(int argc, char *argv[])
vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes.info);
- struct kvm_irqfd ifd = { };
-
- ifd.fd = irq_fd[0];
- ifd.gsi = 32;
- vm_ioctl(vm, KVM_IRQFD, &ifd);
-
- ifd.fd = irq_fd[1];
- ifd.gsi = 33;
- vm_ioctl(vm, KVM_IRQFD, &ifd);
+ kvm_assign_irqfd(vm, 32, irq_fd[0]);
+ kvm_assign_irqfd(vm, 33, irq_fd[1]);
struct sigaction sa = { };
sa.sa_handler = handle_alrm;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 11e5d1e3f12e..6b1133a6617f 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -204,6 +204,11 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
int ret = 0;
if (flags & EPOLLIN) {
+ /*
+ * WARNING: Do NOT take irqfds.lock in any path except EPOLLHUP,
+ * as KVM holds irqfds.lock when registering the irqfd with the
+ * eventfd.
+ */
u64 cnt;
eventfd_ctx_do_read(irqfd->eventfd, &cnt);
@@ -225,6 +230,11 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
/* The eventfd is closing, detach from KVM */
unsigned long iflags;
+ /*
+ * Taking irqfds.lock is safe here, as KVM holds a reference to
+ * the eventfd when registering the irqfd, i.e. this path can't
+ * be reached while kvm_irqfd_add() is running.
+ */
spin_lock_irqsave(&kvm->irqfds.lock, iflags);
/*
@@ -245,22 +255,14 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
return ret;
}
-static void
-irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
- poll_table *pt)
-{
- struct kvm_kernel_irqfd *irqfd =
- container_of(pt, struct kvm_kernel_irqfd, pt);
- add_wait_queue_priority(wqh, &irqfd->wait);
-}
-
-/* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
int n_entries;
+ lockdep_assert_held(&kvm->irqfds.lock);
+
n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);
write_seqcount_begin(&irqfd->irq_entry_sc);
@@ -274,6 +276,63 @@ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
write_seqcount_end(&irqfd->irq_entry_sc);
}
+struct kvm_irqfd_pt {
+ struct kvm_kernel_irqfd *irqfd;
+ struct kvm *kvm;
+ poll_table pt;
+ int ret;
+};
+
+static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh,
+ poll_table *pt)
+{
+ struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt);
+ struct kvm_kernel_irqfd *irqfd = p->irqfd;
+ struct kvm *kvm = p->kvm;
+
+ /*
+ * Note, irqfds.lock protects the irqfd's irq_entry, i.e. its routing,
+ * and irqfds.items. It does NOT protect registering with the eventfd.
+ */
+ spin_lock_irq(&kvm->irqfds.lock);
+
+ /*
+ * Initialize the routing information prior to adding the irqfd to the
+ * eventfd's waitqueue, as irqfd_wakeup() can be invoked as soon as the
+ * irqfd is registered.
+ */
+ irqfd_update(kvm, irqfd);
+
+ /*
+ * Add the irqfd as a priority waiter on the eventfd, with a custom
+ * wake-up handler, so that KVM *and only KVM* is notified whenever the
+ * underlying eventfd is signaled.
+ */
+ init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
+
+ /*
+ * Temporarily lie to lockdep about holding irqfds.lock to avoid a
+ * false positive regarding potential deadlock with irqfd_wakeup()
+ * (see irqfd_wakeup() for details).
+ *
+ * Adding to the wait queue will fail if there is already a priority
+ * waiter, i.e. if the eventfd is associated with another irqfd (in any
+ * VM). Note, kvm_irqfd_deassign() waits for all in-flight shutdown
+ * jobs to complete, i.e. ensures the irqfd has been removed from the
+ * eventfd's waitqueue before returning to userspace.
+ */
+ spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_);
+ p->ret = add_wait_queue_priority_exclusive(wqh, &irqfd->wait);
+ spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_);
+ if (p->ret)
+ goto out;
+
+ list_add_tail(&irqfd->list, &kvm->irqfds.items);
+
+out:
+ spin_unlock_irq(&kvm->irqfds.lock);
+}
+
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
void __attribute__((weak)) kvm_arch_irq_bypass_stop(
struct irq_bypass_consumer *cons)
@@ -285,26 +344,20 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start(
{
}
-int __attribute__((weak)) kvm_arch_update_irqfd_routing(
- struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
+void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
{
- return 0;
-}
-bool __attribute__((weak)) kvm_arch_irqfd_route_changed(
- struct kvm_kernel_irq_routing_entry *old,
- struct kvm_kernel_irq_routing_entry *new)
-{
- return true;
}
#endif
static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
- struct kvm_kernel_irqfd *irqfd, *tmp;
+ struct kvm_kernel_irqfd *irqfd;
struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
+ struct kvm_irqfd_pt irqfd_pt;
int ret;
__poll_t events;
int idx;
@@ -390,57 +443,54 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
}
/*
- * Install our own custom wake-up handling so we are notified via
- * a callback whenever someone signals the underlying eventfd
+ * Set the irqfd routing and add it to KVM's list before registering
+ * the irqfd with the eventfd, so that the routing information is valid
+ * and stays valid, e.g. if there are GSI routing changes, prior to
+ * making the irqfd visible, i.e. before it might be signaled.
+ *
+ * Note, holding SRCU ensures a stable read of routing information, and
+ * also prevents irqfd_shutdown() from freeing the irqfd before it's
+ * fully initialized.
*/
- init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
- init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
-
- spin_lock_irq(&kvm->irqfds.lock);
-
- ret = 0;
- list_for_each_entry(tmp, &kvm->irqfds.items, list) {
- if (irqfd->eventfd != tmp->eventfd)
- continue;
- /* This fd is used for another irq already. */
- ret = -EBUSY;
- spin_unlock_irq(&kvm->irqfds.lock);
- goto fail;
- }
-
idx = srcu_read_lock(&kvm->irq_srcu);
- irqfd_update(kvm, irqfd);
-
- list_add_tail(&irqfd->list, &kvm->irqfds.items);
-
- spin_unlock_irq(&kvm->irqfds.lock);
/*
- * Check if there was an event already pending on the eventfd
- * before we registered, and trigger it as if we didn't miss it.
+ * Register the irqfd with the eventfd by polling on the eventfd, and
+ * simultaneously and the irqfd to KVM's list. If there was en event
+ * pending on the eventfd prior to registering, manually trigger IRQ
+ * injection.
*/
- events = vfs_poll(fd_file(f), &irqfd->pt);
+ irqfd_pt.irqfd = irqfd;
+ irqfd_pt.kvm = kvm;
+ init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register);
+
+ events = vfs_poll(fd_file(f), &irqfd_pt.pt);
+
+ ret = irqfd_pt.ret;
+ if (ret)
+ goto fail_poll;
if (events & EPOLLIN)
schedule_work(&irqfd->inject);
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
if (kvm_arch_has_irq_bypass()) {
- irqfd->consumer.token = (void *)irqfd->eventfd;
irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
irqfd->consumer.start = kvm_arch_irq_bypass_start;
- ret = irq_bypass_register_consumer(&irqfd->consumer);
+ ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd);
if (ret)
- pr_info("irq bypass consumer (token %p) registration fails: %d\n",
- irqfd->consumer.token, ret);
+ pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n",
+ irqfd->eventfd, ret);
}
#endif
srcu_read_unlock(&kvm->irq_srcu, idx);
return 0;
+fail_poll:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
fail:
if (irqfd->resampler)
irqfd_resampler_shutdown(irqfd);
@@ -617,13 +667,8 @@ void kvm_irq_routing_update(struct kvm *kvm)
irqfd_update(kvm, irqfd);
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
- if (irqfd->producer &&
- kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
- int ret = kvm_arch_update_irqfd_routing(
- irqfd->kvm, irqfd->producer->irq,
- irqfd->gsi, 1);
- WARN_ON(ret);
- }
+ if (irqfd->producer)
+ kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry);
#endif
}
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 162d8ed889f2..6ccabfd32287 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -222,8 +222,6 @@ int kvm_set_irq_routing(struct kvm *kvm,
kvm_arch_irq_routing_update(kvm);
mutex_unlock(&kvm->irq_lock);
- kvm_arch_post_irq_routing_update(kvm);
-
synchronize_srcu_expedited(&kvm->irq_srcu);
new = old;
diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c
index 28fda42e471b..62c160200be9 100644
--- a/virt/lib/irqbypass.c
+++ b/virt/lib/irqbypass.c
@@ -22,8 +22,8 @@
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("IRQ bypass manager utility module");
-static LIST_HEAD(producers);
-static LIST_HEAD(consumers);
+static DEFINE_XARRAY(producers);
+static DEFINE_XARRAY(consumers);
static DEFINE_MUTEX(lock);
/* @lock must be held when calling connect */
@@ -51,6 +51,10 @@ static int __connect(struct irq_bypass_producer *prod,
if (prod->start)
prod->start(prod);
+ if (!ret) {
+ prod->consumer = cons;
+ cons->producer = prod;
+ }
return ret;
}
@@ -72,56 +76,49 @@ static void __disconnect(struct irq_bypass_producer *prod,
cons->start(cons);
if (prod->start)
prod->start(prod);
+
+ prod->consumer = NULL;
+ cons->producer = NULL;
}
/**
* irq_bypass_register_producer - register IRQ bypass producer
* @producer: pointer to producer structure
+ * @eventfd: pointer to the eventfd context associated with the producer
+ * @irq: Linux IRQ number of the underlying producer device
*
- * Add the provided IRQ producer to the list of producers and connect
- * with any matching token found on the IRQ consumers list.
+ * Add the provided IRQ producer to the set of producers and connect with the
+ * consumer with a matching eventfd, if one exists.
*/
-int irq_bypass_register_producer(struct irq_bypass_producer *producer)
+int irq_bypass_register_producer(struct irq_bypass_producer *producer,
+ struct eventfd_ctx *eventfd, int irq)
{
- struct irq_bypass_producer *tmp;
+ unsigned long index = (unsigned long)eventfd;
struct irq_bypass_consumer *consumer;
int ret;
- if (!producer->token)
+ if (WARN_ON_ONCE(producer->eventfd))
return -EINVAL;
- might_sleep();
-
- if (!try_module_get(THIS_MODULE))
- return -ENODEV;
+ producer->irq = irq;
- mutex_lock(&lock);
+ guard(mutex)(&lock);
- list_for_each_entry(tmp, &producers, node) {
- if (tmp->token == producer->token) {
- ret = -EBUSY;
- goto out_err;
- }
- }
+ ret = xa_insert(&producers, index, producer, GFP_KERNEL);
+ if (ret)
+ return ret;
- list_for_each_entry(consumer, &consumers, node) {
- if (consumer->token == producer->token) {
- ret = __connect(producer, consumer);
- if (ret)
- goto out_err;
- break;
+ consumer = xa_load(&consumers, index);
+ if (consumer) {
+ ret = __connect(producer, consumer);
+ if (ret) {
+ WARN_ON_ONCE(xa_erase(&producers, index) != producer);
+ return ret;
}
}
- list_add(&producer->node, &producers);
-
- mutex_unlock(&lock);
-
+ producer->eventfd = eventfd;
return 0;
-out_err:
- mutex_unlock(&lock);
- module_put(THIS_MODULE);
- return ret;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
@@ -129,95 +126,65 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
* irq_bypass_unregister_producer - unregister IRQ bypass producer
* @producer: pointer to producer structure
*
- * Remove a previously registered IRQ producer from the list of producers
- * and disconnect it from any connected IRQ consumer.
+ * Remove a previously registered IRQ producer (note, it's safe to call this
+ * even if registration was unsuccessful). Disconnect from the associated
+ * consumer, if one exists.
*/
void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
{
- struct irq_bypass_producer *tmp;
- struct irq_bypass_consumer *consumer;
+ unsigned long index = (unsigned long)producer->eventfd;
- if (!producer->token)
+ if (!producer->eventfd)
return;
- might_sleep();
-
- if (!try_module_get(THIS_MODULE))
- return; /* nothing in the list anyway */
-
- mutex_lock(&lock);
-
- list_for_each_entry(tmp, &producers, node) {
- if (tmp->token != producer->token)
- continue;
-
- list_for_each_entry(consumer, &consumers, node) {
- if (consumer->token == producer->token) {
- __disconnect(producer, consumer);
- break;
- }
- }
-
- list_del(&producer->node);
- module_put(THIS_MODULE);
- break;
- }
+ guard(mutex)(&lock);
- mutex_unlock(&lock);
+ if (producer->consumer)
+ __disconnect(producer, producer->consumer);
- module_put(THIS_MODULE);
+ WARN_ON_ONCE(xa_erase(&producers, index) != producer);
+ producer->eventfd = NULL;
}
EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);
/**
* irq_bypass_register_consumer - register IRQ bypass consumer
* @consumer: pointer to consumer structure
+ * @eventfd: pointer to the eventfd context associated with the consumer
*
- * Add the provided IRQ consumer to the list of consumers and connect
- * with any matching token found on the IRQ producer list.
+ * Add the provided IRQ consumer to the set of consumers and connect with the
+ * producer with a matching eventfd, if one exists.
*/
-int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
+int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer,
+ struct eventfd_ctx *eventfd)
{
- struct irq_bypass_consumer *tmp;
+ unsigned long index = (unsigned long)eventfd;
struct irq_bypass_producer *producer;
int ret;
- if (!consumer->token ||
- !consumer->add_producer || !consumer->del_producer)
+ if (WARN_ON_ONCE(consumer->eventfd))
return -EINVAL;
- might_sleep();
-
- if (!try_module_get(THIS_MODULE))
- return -ENODEV;
+ if (!consumer->add_producer || !consumer->del_producer)
+ return -EINVAL;
- mutex_lock(&lock);
+ guard(mutex)(&lock);
- list_for_each_entry(tmp, &consumers, node) {
- if (tmp->token == consumer->token || tmp == consumer) {
- ret = -EBUSY;
- goto out_err;
- }
- }
+ ret = xa_insert(&consumers, index, consumer, GFP_KERNEL);
+ if (ret)
+ return ret;
- list_for_each_entry(producer, &producers, node) {
- if (producer->token == consumer->token) {
- ret = __connect(producer, consumer);
- if (ret)
- goto out_err;
- break;
+ producer = xa_load(&producers, index);
+ if (producer) {
+ ret = __connect(producer, consumer);
+ if (ret) {
+ WARN_ON_ONCE(xa_erase(&consumers, index) != consumer);
+ return ret;
}
}
- list_add(&consumer->node, &consumers);
-
- mutex_unlock(&lock);
-
+ consumer->eventfd = eventfd;
return 0;
-out_err:
- mutex_unlock(&lock);
- module_put(THIS_MODULE);
- return ret;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
@@ -225,42 +192,23 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
* irq_bypass_unregister_consumer - unregister IRQ bypass consumer
* @consumer: pointer to consumer structure
*
- * Remove a previously registered IRQ consumer from the list of consumers
- * and disconnect it from any connected IRQ producer.
+ * Remove a previously registered IRQ consumer (note, it's safe to call this
+ * even if registration was unsuccessful). Disconnect from the associated
+ * producer, if one exists.
*/
void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
{
- struct irq_bypass_consumer *tmp;
- struct irq_bypass_producer *producer;
+ unsigned long index = (unsigned long)consumer->eventfd;
- if (!consumer->token)
+ if (!consumer->eventfd)
return;
- might_sleep();
-
- if (!try_module_get(THIS_MODULE))
- return; /* nothing in the list anyway */
-
- mutex_lock(&lock);
-
- list_for_each_entry(tmp, &consumers, node) {
- if (tmp != consumer)
- continue;
-
- list_for_each_entry(producer, &producers, node) {
- if (producer->token == consumer->token) {
- __disconnect(producer, consumer);
- break;
- }
- }
-
- list_del(&consumer->node);
- module_put(THIS_MODULE);
- break;
- }
+ guard(mutex)(&lock);
- mutex_unlock(&lock);
+ if (consumer->producer)
+ __disconnect(consumer->producer, consumer);
- module_put(THIS_MODULE);
+ WARN_ON_ONCE(xa_erase(&consumers, index) != consumer);
+ consumer->eventfd = NULL;
}
EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);