summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/Kconfig10
-rw-r--r--arch/x86/kvm/Makefile7
-rw-r--r--arch/x86/kvm/cpuid.c19
-rw-r--r--arch/x86/kvm/hyperv.c15
-rw-r--r--arch/x86/kvm/hyperv.h3
-rw-r--r--arch/x86/kvm/i8254.c94
-rw-r--r--arch/x86/kvm/i8254.h17
-rw-r--r--arch/x86/kvm/i8259.c17
-rw-r--r--arch/x86/kvm/ioapic.c62
-rw-r--r--arch/x86/kvm/ioapic.h26
-rw-r--r--arch/x86/kvm/irq.c560
-rw-r--r--arch/x86/kvm/irq.h35
-rw-r--r--arch/x86/kvm/irq_comm.c442
-rw-r--r--arch/x86/kvm/lapic.c132
-rw-r--r--arch/x86/kvm/lapic.h30
-rw-r--r--arch/x86/kvm/mmu/mmu.c89
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h3
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h8
-rw-r--r--arch/x86/kvm/mmu/spte.c43
-rw-r--r--arch/x86/kvm/mmu/spte.h10
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c19
-rw-r--r--arch/x86/kvm/reverse_cpuid.h7
-rw-r--r--arch/x86/kvm/svm/avic.c688
-rw-r--r--arch/x86/kvm/svm/nested.c164
-rw-r--r--arch/x86/kvm/svm/sev.c388
-rw-r--r--arch/x86/kvm/svm/svm.c666
-rw-r--r--arch/x86/kvm/svm/svm.h151
-rw-r--r--arch/x86/kvm/svm/vmenter.S6
-rw-r--r--arch/x86/kvm/trace.h99
-rw-r--r--arch/x86/kvm/vmx/capabilities.h1
-rw-r--r--arch/x86/kvm/vmx/common.h6
-rw-r--r--arch/x86/kvm/vmx/main.c265
-rw-r--r--arch/x86/kvm/vmx/nested.c75
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c8
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c153
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h15
-rw-r--r--arch/x86/kvm/vmx/run_flags.h10
-rw-r--r--arch/x86/kvm/vmx/tdx.c189
-rw-r--r--arch/x86/kvm/vmx/tdx.h1
-rw-r--r--arch/x86/kvm/vmx/vmenter.S3
-rw-r--r--arch/x86/kvm/vmx/vmx.c321
-rw-r--r--arch/x86/kvm/vmx/vmx.h60
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h82
-rw-r--r--arch/x86/kvm/x86.c448
-rw-r--r--arch/x86/kvm/x86.h58
-rw-r--r--arch/x86/kvm/xen.c20
46 files changed, 2934 insertions, 2591 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2eeffcec5382..2c86673155c9 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -166,6 +166,16 @@ config KVM_AMD_SEV
Encrypted State (SEV-ES), and Secure Encrypted Virtualization with
Secure Nested Paging (SEV-SNP) technologies on AMD processors.
+config KVM_IOAPIC
+ bool "I/O APIC, PIC, and PIT emulation"
+ default y
+ depends on KVM
+ help
+ Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e.
+ for full in-kernel APIC emulation.
+
+ If unsure, say Y.
+
config KVM_SMM
bool "System Management Mode emulation"
default y
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index a5d362c7b504..c4b8950c7abe 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,12 +5,11 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror
include $(srctree)/virt/kvm/Makefile.kvm
-kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
- i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
- debugfs.o mmu/mmu.o mmu/page_track.o \
- mmu/spte.o
+kvm-y += x86.o emulate.o irq.o lapic.o cpuid.o pmu.o mtrr.o \
+ debugfs.o mmu/mmu.o mmu/page_track.o mmu/spte.o
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
+kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o
kvm-$(CONFIG_KVM_HYPERV) += hyperv.o
kvm-$(CONFIG_KVM_XEN) += xen.o
kvm-$(CONFIG_KVM_SMM) += smm.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 6569b453546b..e2836a255b16 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -978,6 +978,8 @@ void kvm_set_cpu_caps(void)
F(FZRM),
F(FSRS),
F(FSRC),
+ F(WRMSRNS),
+ X86_64_F(LKGS),
F(AMX_FP16),
F(AVX_IFMA),
F(LAM),
@@ -1093,6 +1095,7 @@ void kvm_set_cpu_caps(void)
F(AMD_SSB_NO),
F(AMD_STIBP),
F(AMD_STIBP_ALWAYS_ON),
+ F(AMD_IBRS_SAME_MODE),
F(AMD_PSFD),
F(AMD_IBPB_RET),
);
@@ -1150,6 +1153,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_init(CPUID_8000_0021_EAX,
F(NO_NESTED_DATA_BP),
+ F(WRMSR_XX_BASE_NS),
/*
* Synthesize "LFENCE is serializing" into the AMD-defined entry
* in KVM's supported CPUID, i.e. if the feature is reported as
@@ -1162,17 +1166,27 @@ void kvm_set_cpu_caps(void)
*/
SYNTHESIZED_F(LFENCE_RDTSC),
/* SmmPgCfgLock */
+ /* 4: Resv */
+ SYNTHESIZED_F(VERW_CLEAR),
F(NULL_SEL_CLR_BASE),
+ /* UpperAddressIgnore */
F(AUTOIBRS),
+ F(PREFETCHI),
EMULATED_F(NO_SMM_CTL_MSR),
/* PrefetchCtlMsr */
- F(WRMSR_XX_BASE_NS),
+ /* GpOnUserCpuid */
+ /* EPSF */
SYNTHESIZED_F(SBPB),
SYNTHESIZED_F(IBPB_BRTYPE),
SYNTHESIZED_F(SRSO_NO),
F(SRSO_USER_KERNEL_NO),
);
+ kvm_cpu_cap_init(CPUID_8000_0021_ECX,
+ SYNTHESIZED_F(TSA_SQ_NO),
+ SYNTHESIZED_F(TSA_L1_NO),
+ );
+
kvm_cpu_cap_init(CPUID_8000_0022_EAX,
F(PERFMON_V2),
);
@@ -1742,8 +1756,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
break;
case 0x80000021:
- entry->ebx = entry->ecx = entry->edx = 0;
+ entry->ebx = entry->edx = 0;
cpuid_entry_override(entry, CPUID_8000_0021_EAX);
+ cpuid_entry_override(entry, CPUID_8000_0021_ECX);
break;
/* AMD Extended Performance Monitoring and Debug */
case 0x80000022: {
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 24f0318c50d7..72b19a88a776 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -497,15 +497,19 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
return ret;
}
-int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
struct kvm_vcpu_hv_synic *synic;
- synic = synic_get(kvm, vpidx);
+ if (!level)
+ return -1;
+
+ synic = synic_get(kvm, e->hv_sint.vcpu);
if (!synic)
return -EINVAL;
- return synic_set_irq(synic, sint);
+ return synic_set_irq(synic, e->hv_sint.sint);
}
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
@@ -1979,6 +1983,9 @@ int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY)
goto out_flush_all;
+ if (is_noncanonical_invlpg_address(entries[i], vcpu))
+ continue;
+
/*
* Lower 12 bits of 'address' encode the number of additional
* pages to flush.
@@ -2001,11 +2008,11 @@ out_flush_all:
static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ unsigned long *vcpu_mask = hv_vcpu->vcpu_mask;
u64 *sparse_banks = hv_vcpu->sparse_banks;
struct kvm *kvm = vcpu->kvm;
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
- DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
/*
* Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE'
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 913bfc96959c..6ce160ffa678 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -103,7 +103,8 @@ static inline bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
void kvm_hv_irq_routing_update(struct kvm *kvm);
-int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
+int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 739aa6c0d0c3..850972deac8e 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -248,8 +248,8 @@ static void pit_do_work(struct kthread_work *work)
if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
return;
- kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false);
- kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false);
+ kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 1, false);
+ kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 0, false);
/*
* Provides NMI watchdog support via Virtual Wire mode.
@@ -288,7 +288,7 @@ static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
atomic_set(&pit->pit_state.irq_ack, 1);
}
-void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
+static void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
{
struct kvm_kpit_state *ps = &pit->pit_state;
struct kvm *kvm = pit->kvm;
@@ -400,8 +400,8 @@ static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
}
}
-void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
- int hpet_legacy_start)
+static void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start)
{
u8 saved_mode;
@@ -641,7 +641,7 @@ static void kvm_pit_reset(struct kvm_pit *pit)
kvm_pit_reset_reinject(pit);
}
-static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
+static void pit_mask_notifier(struct kvm_irq_mask_notifier *kimn, bool mask)
{
struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
@@ -649,6 +649,79 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
kvm_pit_reset_reinject(pit);
}
+int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+
+ BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+
+ mutex_lock(&kps->lock);
+ memcpy(ps, &kps->channels, sizeof(*ps));
+ mutex_unlock(&kps->lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int i;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+ sizeof(ps->channels));
+ ps->flags = kvm->arch.vpit->pit_state.flags;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ memset(&ps->reserved, 0, sizeof(ps->reserved));
+ return 0;
+}
+
+int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int start = 0;
+ int i;
+ u32 prev_legacy, cur_legacy;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ if (!prev_legacy && cur_legacy)
+ start = 1;
+ memcpy(&pit->pit_state.channels, &ps->channels,
+ sizeof(pit->pit_state.channels));
+ pit->pit_state.flags = ps->flags;
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
+ start && i == 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control)
+{
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ /* pit->pit_state.lock was overloaded to prevent userspace from getting
+ * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+ * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
+ */
+ mutex_lock(&pit->pit_state.lock);
+ kvm_pit_set_reinject(pit, control->pit_reinject);
+ mutex_unlock(&pit->pit_state.lock);
+
+ return 0;
+}
+
static const struct kvm_io_device_ops pit_dev_ops = {
.read = pit_ioport_read,
.write = pit_ioport_write,
@@ -671,10 +744,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
if (!pit)
return NULL;
- pit->irq_source_id = kvm_request_irq_source_id(kvm);
- if (pit->irq_source_id < 0)
- goto fail_request;
-
mutex_init(&pit->pit_state.lock);
pid = get_pid(task_tgid(current));
@@ -694,7 +763,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
pit_state->irq_ack_notifier.gsi = 0;
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
- pit->mask_notifier.func = pit_mask_notifer;
+ pit->mask_notifier.func = pit_mask_notifier;
kvm_pit_reset(pit);
@@ -726,8 +795,6 @@ fail_register_pit:
kvm_pit_set_reinject(pit, false);
kthread_destroy_worker(pit->worker);
fail_kthread:
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
-fail_request:
kfree(pit);
return NULL;
}
@@ -744,7 +811,6 @@ void kvm_free_pit(struct kvm *kvm)
kvm_pit_set_reinject(pit, false);
hrtimer_cancel(&pit->pit_state.timer);
kthread_destroy_worker(pit->worker);
- kvm_free_irq_source_id(kvm, pit->irq_source_id);
kfree(pit);
}
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index a768212ba821..60fa499d2f8a 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -6,6 +6,11 @@
#include <kvm/iodev.h>
+#include <uapi/asm/kvm.h>
+
+#include "ioapic.h"
+
+#ifdef CONFIG_KVM_IOAPIC
struct kvm_kpit_channel_state {
u32 count; /* can be 65536 */
u16 latched_count;
@@ -42,7 +47,6 @@ struct kvm_pit {
struct kvm_io_device speaker_dev;
struct kvm *kvm;
struct kvm_kpit_state pit_state;
- int irq_source_id;
struct kvm_irq_mask_notifier mask_notifier;
struct kthread_worker *worker;
struct kthread_work expired;
@@ -55,11 +59,14 @@ struct kvm_pit {
#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
#define KVM_PIT_CHANNEL_MASK 0x3
+int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps);
+int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps);
+int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
+int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps);
+int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control);
+
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
void kvm_free_pit(struct kvm *kvm);
-
-void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
- int hpet_legacy_start);
-void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject);
+#endif /* CONFIG_KVM_IOAPIC */
#endif
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a8fb19940975..2ac7f1678c46 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -31,6 +31,8 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/bitops.h>
+
+#include "ioapic.h"
#include "irq.h"
#include <linux/kvm_host.h>
@@ -185,8 +187,11 @@ void kvm_pic_update_irq(struct kvm_pic *s)
pic_unlock(s);
}
-int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
+int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
+ struct kvm_pic *s = kvm->arch.vpic;
+ int irq = e->irqchip.pin;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
@@ -203,16 +208,6 @@ int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
return ret;
}
-void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
-{
- int i;
-
- pic_lock(s);
- for (i = 0; i < PIC_NUM_PINS; i++)
- __clear_bit(irq_source_id, &s->irq_states[i]);
- pic_unlock(s);
-}
-
/*
* acknowledge interrupt 'irq'
*/
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 995eb5054360..2b5d389bca5f 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -41,11 +41,11 @@
#include <asm/processor.h>
#include <asm/page.h>
#include <asm/current.h>
-#include <trace/events/kvm.h>
#include "ioapic.h"
#include "lapic.h"
#include "irq.h"
+#include "trace.h"
static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
bool line_status);
@@ -296,11 +296,8 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
index == RTC_GSI) {
u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode);
- if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
- e->fields.dest_id, dm) ||
- kvm_apic_pending_eoi(vcpu, e->fields.vector))
- __set_bit(e->fields.vector,
- ioapic_handled_vectors);
+ kvm_scan_ioapic_irq(vcpu, e->fields.dest_id, dm,
+ e->fields.vector, ioapic_handled_vectors);
}
}
spin_unlock(&ioapic->lock);
@@ -313,6 +310,42 @@ void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
kvm_make_scan_ioapic_request(kvm);
}
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ mutex_lock(&kvm->irq_lock);
+ kimn->irq = irq;
+ hlist_add_head_rcu(&kimn->link, &ioapic->mask_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_rcu(&kimn->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_srcu(&kvm->irq_srcu);
+}
+
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ struct kvm_irq_mask_notifier *kimn;
+ int idx, gsi;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kimn, &ioapic->mask_notifier_list, link)
+ if (kimn->irq == gsi)
+ kimn->func(kimn, mask);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
{
unsigned index;
@@ -482,9 +515,11 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
return ret;
}
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
- int level, bool line_status)
+int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ int irq = e->irqchip.pin;
int ret, irq_level;
BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
@@ -499,16 +534,6 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
return ret;
}
-void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
-{
- int i;
-
- spin_lock(&ioapic->lock);
- for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
- __clear_bit(irq_source_id, &ioapic->irq_states[i]);
- spin_unlock(&ioapic->lock);
-}
-
static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
{
int i;
@@ -721,6 +746,7 @@ int kvm_ioapic_init(struct kvm *kvm)
return -ENOMEM;
spin_lock_init(&ioapic->lock);
INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
+ INIT_HLIST_HEAD(&ioapic->mask_notifier_list);
kvm->arch.vioapic = ioapic;
kvm_ioapic_reset(ioapic);
kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 539333ac4b38..bf28dbc11ff6 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -86,8 +86,24 @@ struct kvm_ioapic {
struct delayed_work eoi_inject;
u32 irq_eoi[IOAPIC_NUM_PINS];
u32 irr_delivered;
+
+ /* reads protected by irq_srcu, writes by irq_lock */
+ struct hlist_head mask_notifier_list;
+};
+
+struct kvm_irq_mask_notifier {
+ void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
+ int irq;
+ struct hlist_node link;
};
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask);
+
#ifdef DEBUG
#define ASSERT(x) \
do { \
@@ -103,7 +119,7 @@ do { \
static inline int ioapic_in_kernel(struct kvm *kvm)
{
- return irqchip_kernel(kvm);
+ return irqchip_full(kvm);
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@@ -111,13 +127,15 @@ void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
int trigger_mode);
int kvm_ioapic_init(struct kvm *kvm);
void kvm_ioapic_destroy(struct kvm *kvm);
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
- int level, bool line_status);
-void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
+int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
+
void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
ulong *ioapic_handled_vectors);
void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
ulong *ioapic_handled_vectors);
+void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
+ u8 vector, unsigned long *ioapic_handled_vectors);
#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 97d68d837929..16da89259011 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -11,9 +11,12 @@
#include <linux/export.h>
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
+#include "hyperv.h"
+#include "ioapic.h"
#include "irq.h"
-#include "i8254.h"
+#include "trace.h"
#include "x86.h"
#include "xen.h"
@@ -41,6 +44,14 @@ static int pending_userspace_extint(struct kvm_vcpu *v)
return v->arch.pending_external_vector != -1;
}
+static int get_userspace_extint(struct kvm_vcpu *vcpu)
+{
+ int vector = vcpu->arch.pending_external_vector;
+
+ vcpu->arch.pending_external_vector = -1;
+ return vector;
+}
+
/*
* check if there is pending interrupt from
* non-APIC source without intack.
@@ -67,10 +78,13 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v)
if (!kvm_apic_accept_pic_intr(v))
return 0;
- if (irqchip_split(v->kvm))
- return pending_userspace_extint(v);
- else
+#ifdef CONFIG_KVM_IOAPIC
+ if (pic_in_kernel(v->kvm))
return v->kvm->arch.vpic->output;
+#endif
+
+ WARN_ON_ONCE(!irqchip_split(v->kvm));
+ return pending_userspace_extint(v);
}
/*
@@ -126,13 +140,13 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v)
return v->kvm->arch.xen.upcall_vector;
#endif
- if (irqchip_split(v->kvm)) {
- int vector = v->arch.pending_external_vector;
-
- v->arch.pending_external_vector = -1;
- return vector;
- } else
+#ifdef CONFIG_KVM_IOAPIC
+ if (pic_in_kernel(v->kvm))
return kvm_pic_read_irq(v->kvm); /* PIC */
+#endif
+
+ WARN_ON_ONCE(!irqchip_split(v->kvm));
+ return get_userspace_extint(v);
}
EXPORT_SYMBOL_GPL(kvm_cpu_get_extint);
@@ -163,7 +177,9 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
__kvm_migrate_apic_timer(vcpu);
+#ifdef CONFIG_KVM_IOAPIC
__kvm_migrate_pit_timer(vcpu);
+#endif
kvm_x86_call(migrate_timers)(vcpu);
}
@@ -171,10 +187,532 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
{
bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE;
- return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm);
+ return resample ? irqchip_full(kvm) : irqchip_in_kernel(kvm);
}
bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
{
return irqchip_in_kernel(kvm);
}
+
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, struct dest_map *dest_map)
+{
+ int r = -1;
+ struct kvm_vcpu *vcpu, *lowest = NULL;
+ unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned int dest_vcpus = 0;
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
+ return r;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL &&
+ irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
+ pr_info("apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (!kvm_lowest_prio_delivery(irq)) {
+ if (r < 0)
+ r = 0;
+ r += kvm_apic_set_irq(vcpu, irq, dest_map);
+ } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
+ if (!kvm_vector_hashing_enabled()) {
+ if (!lowest)
+ lowest = vcpu;
+ else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+ lowest = vcpu;
+ } else {
+ __set_bit(i, dest_vcpu_bitmap);
+ dest_vcpus++;
+ }
+ }
+ }
+
+ if (dest_vcpus != 0) {
+ int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+ dest_vcpu_bitmap, KVM_MAX_VCPUS);
+
+ lowest = kvm_get_vcpu(kvm, idx);
+ }
+
+ if (lowest)
+ r = kvm_apic_set_irq(lowest, irq, dest_map);
+
+ return r;
+}
+
+static void kvm_msi_to_lapic_irq(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
+{
+ struct msi_msg msg = { .address_lo = e->msi.address_lo,
+ .address_hi = e->msi.address_hi,
+ .data = e->msi.data };
+
+ trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
+ (u64)msg.address_hi << 32 : 0), msg.data);
+
+ irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
+ irq->vector = msg.arch_data.vector;
+ irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
+ irq->trig_mode = msg.arch_data.is_level;
+ irq->delivery_mode = msg.arch_data.delivery_mode << 8;
+ irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
+ irq->level = 1;
+ irq->shorthand = APIC_DEST_NOSHORT;
+}
+
+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e)
+{
+ return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+ struct kvm_lapic_irq irq;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ if (!level)
+ return -1;
+
+ kvm_msi_to_lapic_irq(kvm, e, &irq);
+
+ return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
+}
+
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_lapic_irq irq;
+ int r;
+
+ switch (e->type) {
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_IRQ_ROUTING_HV_SINT:
+ return kvm_hv_synic_set_irq(e, kvm, irq_source_id, level,
+ line_status);
+#endif
+
+ case KVM_IRQ_ROUTING_MSI:
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ kvm_msi_to_lapic_irq(kvm, e, &irq);
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
+ return r;
+ break;
+
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ if (!level)
+ return -1;
+
+ return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
+#endif
+ default:
+ break;
+ }
+
+ return -EWOULDBLOCK;
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+ bool line_status)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level,
+ line_status);
+ return 0;
+}
+
+bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ /* We can't check irqchip_in_kernel() here as some callers are
+ * currently initializing the irqchip. Other callers should therefore
+ * check kvm_arch_can_set_irq_routing() before calling this function.
+ */
+ switch (ue->type) {
+#ifdef CONFIG_KVM_IOAPIC
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ if (irqchip_split(kvm))
+ return -EINVAL;
+ e->irqchip.pin = ue->u.irqchip.pin;
+ switch (ue->u.irqchip.irqchip) {
+ case KVM_IRQCHIP_PIC_SLAVE:
+ e->irqchip.pin += PIC_NUM_PINS / 2;
+ fallthrough;
+ case KVM_IRQCHIP_PIC_MASTER:
+ if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
+ return -EINVAL;
+ e->set = kvm_pic_set_irq;
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+ return -EINVAL;
+ e->set = kvm_ioapic_set_irq;
+ break;
+ default:
+ return -EINVAL;
+ }
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ break;
+#endif
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+ break;
+#ifdef CONFIG_KVM_HYPERV
+ case KVM_IRQ_ROUTING_HV_SINT:
+ e->set = kvm_hv_synic_set_irq;
+ e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
+ e->hv_sint.sint = ue->u.hv_sint.sint;
+ break;
+#endif
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ int r = 0;
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+
+ if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
+ return true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (++r == 2)
+ return false;
+
+ *dest_vcpu = vcpu;
+ }
+
+ return r == 1;
+}
+EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
+
+void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
+ u8 vector, unsigned long *ioapic_handled_vectors)
+{
+ /*
+ * Intercept EOI if the vCPU is the target of the new IRQ routing, or
+ * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU
+ * may receive a level-triggered IRQ in the future, or already received
+ * level-triggered IRQ. The EOI needs to be intercepted and forwarded
+ * to I/O APIC emulation so that the IRQ can be de-asserted.
+ */
+ if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) {
+ __set_bit(vector, ioapic_handled_vectors);
+ } else if (kvm_apic_pending_eoi(vcpu, vector)) {
+ __set_bit(vector, ioapic_handled_vectors);
+
+ /*
+ * Track the highest pending EOI for which the vCPU is NOT the
+ * target in the new routing. Only the EOI for the IRQ that is
+ * in-flight (for the old routing) needs to be intercepted, any
+ * future IRQs that arrive on this vCPU will be coincidental to
+ * the level-triggered routing and don't need to be intercepted.
+ */
+ if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi)
+ vcpu->arch.highest_stale_pending_ioapic_eoi = vector;
+ }
+}
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kernel_irq_routing_entry *entry;
+ struct kvm_irq_routing_table *table;
+ u32 i, nr_ioapic_pins;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
+ kvm->arch.nr_reserved_ioapic_pins);
+ for (i = 0; i < nr_ioapic_pins; ++i) {
+ hlist_for_each_entry(entry, &table->map[i], link) {
+ struct kvm_lapic_irq irq;
+
+ if (entry->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ kvm_msi_to_lapic_irq(vcpu->kvm, entry, &irq);
+
+ if (!irq.trig_mode)
+ continue;
+
+ kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode,
+ irq.vector, ioapic_handled_vectors);
+ }
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_HYPERV
+ kvm_hv_irq_routing_update(kvm);
+#endif
+
+ if (irqchip_split(kvm))
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static int kvm_pi_update_irte(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *entry)
+{
+ unsigned int host_irq = irqfd->producer->irq;
+ struct kvm *kvm = irqfd->kvm;
+ struct kvm_vcpu *vcpu = NULL;
+ struct kvm_lapic_irq irq;
+ int r;
+
+ if (WARN_ON_ONCE(!irqchip_in_kernel(kvm) || !kvm_arch_has_irq_bypass()))
+ return -EINVAL;
+
+ if (entry && entry->type == KVM_IRQ_ROUTING_MSI) {
+ kvm_msi_to_lapic_irq(kvm, entry, &irq);
+
+ /*
+ * Force remapped mode if hardware doesn't support posting the
+ * virtual interrupt to a vCPU. Only IRQs are postable (NMIs,
+ * SMIs, etc. are not), and neither AMD nor Intel IOMMUs support
+ * posting multicast/broadcast IRQs. If the interrupt can't be
+ * posted, the device MSI needs to be routed to the host so that
+ * the guest's desired interrupt can be synthesized by KVM.
+ *
+ * This means that KVM can only post lowest-priority interrupts
+ * if they have a single CPU as the destination, e.g. only if
+ * the guest has affined the interrupt to a single vCPU.
+ */
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq))
+ vcpu = NULL;
+ }
+
+ if (!irqfd->irq_bypass_vcpu && !vcpu)
+ return 0;
+
+ r = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, host_irq, irqfd->gsi,
+ vcpu, irq.vector);
+ if (r) {
+ WARN_ON_ONCE(irqfd->irq_bypass_vcpu && !vcpu);
+ irqfd->irq_bypass_vcpu = NULL;
+ return r;
+ }
+
+ irqfd->irq_bypass_vcpu = vcpu;
+
+ trace_kvm_pi_irte_update(host_irq, vcpu, irqfd->gsi, irq.vector, !!vcpu);
+ return 0;
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+ int ret = 0;
+
+ spin_lock_irq(&kvm->irqfds.lock);
+ irqfd->producer = prod;
+
+ if (!kvm->arch.nr_possible_bypass_irqs++)
+ kvm_x86_call(pi_start_bypass)(kvm);
+
+ if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+ ret = kvm_pi_update_irte(irqfd, &irqfd->irq_entry);
+ if (ret)
+ kvm->arch.nr_possible_bypass_irqs--;
+ }
+ spin_unlock_irq(&kvm->irqfds.lock);
+
+ return ret;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+ struct kvm *kvm = irqfd->kvm;
+ int ret;
+
+ WARN_ON(irqfd->producer != prod);
+
+ /*
+ * If the producer of an IRQ that is currently being posted to a vCPU
+ * is unregistered, change the associated IRTE back to remapped mode as
+ * the IRQ has been released (or repurposed) by the device driver, i.e.
+ * KVM must relinquish control of the IRTE.
+ */
+ spin_lock_irq(&kvm->irqfds.lock);
+
+ if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+ ret = kvm_pi_update_irte(irqfd, NULL);
+ if (ret)
+ pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n",
+ irqfd->consumer.eventfd, ret);
+ }
+ irqfd->producer = NULL;
+
+ kvm->arch.nr_possible_bypass_irqs--;
+
+ spin_unlock_irq(&kvm->irqfds.lock);
+}
+
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+ struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
+{
+ if (new->type != KVM_IRQ_ROUTING_MSI &&
+ old->type != KVM_IRQ_ROUTING_MSI)
+ return;
+
+ if (old->type == KVM_IRQ_ROUTING_MSI &&
+ new->type == KVM_IRQ_ROUTING_MSI &&
+ !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
+ return;
+
+ kvm_pi_update_irte(irqfd, new);
+}
+
+#ifdef CONFIG_KVM_IOAPIC
+#define IOAPIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
+
+#define PIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
+#define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
+
+static const struct kvm_irq_routing_entry default_routing[] = {
+ ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
+ ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
+ ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
+ ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
+ ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
+ ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
+ ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
+ ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
+ ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
+ ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
+ ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
+ ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
+};
+
+int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, default_routing,
+ ARRAY_SIZE(default_routing), 0);
+}
+
+int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ memcpy(&chip->chip.pic, &pic->pics[0],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ memcpy(&chip->chip.pic, &pic->pics[1],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_get_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
+int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[0], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[1], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_set_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ kvm_pic_update_irq(pic);
+ return r;
+}
+#endif
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 76d46b2f41dd..5e62c1f79ce6 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -18,6 +18,8 @@
#include <kvm/iodev.h>
#include "lapic.h"
+#ifdef CONFIG_KVM_IOAPIC
+
#define PIC_NUM_PINS 16
#define SELECT_PIC(irq) \
((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
@@ -63,17 +65,15 @@ int kvm_pic_init(struct kvm *kvm);
void kvm_pic_destroy(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
+int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status);
-static inline int irqchip_split(struct kvm *kvm)
-{
- int mode = kvm->arch.irqchip_mode;
+int kvm_setup_default_ioapic_and_pic_routing(struct kvm *kvm);
- /* Matches smp_wmb() when setting irqchip_mode */
- smp_rmb();
- return mode == KVM_IRQCHIP_SPLIT;
-}
+int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
+int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip);
-static inline int irqchip_kernel(struct kvm *kvm)
+static inline int irqchip_full(struct kvm *kvm)
{
int mode = kvm->arch.irqchip_mode;
@@ -81,10 +81,26 @@ static inline int irqchip_kernel(struct kvm *kvm)
smp_rmb();
return mode == KVM_IRQCHIP_KERNEL;
}
+#else /* CONFIG_KVM_IOAPIC */
+static __always_inline int irqchip_full(struct kvm *kvm)
+{
+ return false;
+}
+#endif
static inline int pic_in_kernel(struct kvm *kvm)
{
- return irqchip_kernel(kvm);
+ return irqchip_full(kvm);
+}
+
+
+static inline int irqchip_split(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_SPLIT;
}
static inline int irqchip_in_kernel(struct kvm *kvm)
@@ -105,7 +121,6 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
int apic_has_pending_timer(struct kvm_vcpu *vcpu);
-int kvm_setup_default_irq_routing(struct kvm *kvm);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
deleted file mode 100644
index 8136695f7b96..000000000000
--- a/arch/x86/kvm/irq_comm.c
+++ /dev/null
@@ -1,442 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * irq_comm.c: Common API for in kernel interrupt controller
- * Copyright (c) 2007, Intel Corporation.
- *
- * Authors:
- * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
- *
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kvm_host.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include <linux/rculist.h>
-
-#include <trace/events/kvm.h>
-
-#include "irq.h"
-
-#include "ioapic.h"
-
-#include "lapic.h"
-
-#include "hyperv.h"
-#include "x86.h"
-#include "xen.h"
-
-static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
-}
-
-static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- struct kvm_ioapic *ioapic = kvm->arch.vioapic;
- return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
- line_status);
-}
-
-int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq, struct dest_map *dest_map)
-{
- int r = -1;
- struct kvm_vcpu *vcpu, *lowest = NULL;
- unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
- unsigned int dest_vcpus = 0;
-
- if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
- return r;
-
- if (irq->dest_mode == APIC_DEST_PHYSICAL &&
- irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
- pr_info("apic: phys broadcast and lowest prio\n");
- irq->delivery_mode = APIC_DM_FIXED;
- }
-
- memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!kvm_apic_present(vcpu))
- continue;
-
- if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
- irq->dest_id, irq->dest_mode))
- continue;
-
- if (!kvm_lowest_prio_delivery(irq)) {
- if (r < 0)
- r = 0;
- r += kvm_apic_set_irq(vcpu, irq, dest_map);
- } else if (kvm_apic_sw_enabled(vcpu->arch.apic)) {
- if (!kvm_vector_hashing_enabled()) {
- if (!lowest)
- lowest = vcpu;
- else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
- lowest = vcpu;
- } else {
- __set_bit(i, dest_vcpu_bitmap);
- dest_vcpus++;
- }
- }
- }
-
- if (dest_vcpus != 0) {
- int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
- dest_vcpu_bitmap, KVM_MAX_VCPUS);
-
- lowest = kvm_get_vcpu(kvm, idx);
- }
-
- if (lowest)
- r = kvm_apic_set_irq(lowest, irq, dest_map);
-
- return r;
-}
-
-void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
- struct kvm_lapic_irq *irq)
-{
- struct msi_msg msg = { .address_lo = e->msi.address_lo,
- .address_hi = e->msi.address_hi,
- .data = e->msi.data };
-
- trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ?
- (u64)msg.address_hi << 32 : 0), msg.data);
-
- irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format);
- irq->vector = msg.arch_data.vector;
- irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical);
- irq->trig_mode = msg.arch_data.is_level;
- irq->delivery_mode = msg.arch_data.delivery_mode << 8;
- irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint;
- irq->level = 1;
- irq->shorthand = APIC_DEST_NOSHORT;
-}
-EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
-
-static inline bool kvm_msi_route_invalid(struct kvm *kvm,
- struct kvm_kernel_irq_routing_entry *e)
-{
- return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
-}
-
-int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level, bool line_status)
-{
- struct kvm_lapic_irq irq;
-
- if (kvm_msi_route_invalid(kvm, e))
- return -EINVAL;
-
- if (!level)
- return -1;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
-}
-
-#ifdef CONFIG_KVM_HYPERV
-static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- if (!level)
- return -1;
-
- return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
-}
-#endif
-
-int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id, int level,
- bool line_status)
-{
- struct kvm_lapic_irq irq;
- int r;
-
- switch (e->type) {
-#ifdef CONFIG_KVM_HYPERV
- case KVM_IRQ_ROUTING_HV_SINT:
- return kvm_hv_set_sint(e, kvm, irq_source_id, level,
- line_status);
-#endif
-
- case KVM_IRQ_ROUTING_MSI:
- if (kvm_msi_route_invalid(kvm, e))
- return -EINVAL;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
- return r;
- break;
-
-#ifdef CONFIG_KVM_XEN
- case KVM_IRQ_ROUTING_XEN_EVTCHN:
- if (!level)
- return -1;
-
- return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
-#endif
- default:
- break;
- }
-
- return -EWOULDBLOCK;
-}
-
-int kvm_request_irq_source_id(struct kvm *kvm)
-{
- unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
- int irq_source_id;
-
- mutex_lock(&kvm->irq_lock);
- irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
-
- if (irq_source_id >= BITS_PER_LONG) {
- pr_warn("exhausted allocatable IRQ sources!\n");
- irq_source_id = -EFAULT;
- goto unlock;
- }
-
- ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
- ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
- set_bit(irq_source_id, bitmap);
-unlock:
- mutex_unlock(&kvm->irq_lock);
-
- return irq_source_id;
-}
-
-void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
-{
- ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
- ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
-
- mutex_lock(&kvm->irq_lock);
- if (irq_source_id < 0 ||
- irq_source_id >= BITS_PER_LONG) {
- pr_err("IRQ source ID out of range!\n");
- goto unlock;
- }
- clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
- if (!irqchip_kernel(kvm))
- goto unlock;
-
- kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
- kvm_pic_clear_all(kvm->arch.vpic, irq_source_id);
-unlock:
- mutex_unlock(&kvm->irq_lock);
-}
-
-void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn)
-{
- mutex_lock(&kvm->irq_lock);
- kimn->irq = irq;
- hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
- mutex_unlock(&kvm->irq_lock);
-}
-
-void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
- struct kvm_irq_mask_notifier *kimn)
-{
- mutex_lock(&kvm->irq_lock);
- hlist_del_rcu(&kimn->link);
- mutex_unlock(&kvm->irq_lock);
- synchronize_srcu(&kvm->irq_srcu);
-}
-
-void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
- bool mask)
-{
- struct kvm_irq_mask_notifier *kimn;
- int idx, gsi;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
- if (gsi != -1)
- hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link)
- if (kimn->irq == gsi)
- kimn->func(kimn, mask);
- srcu_read_unlock(&kvm->irq_srcu, idx);
-}
-
-bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
-{
- return irqchip_in_kernel(kvm);
-}
-
-int kvm_set_routing_entry(struct kvm *kvm,
- struct kvm_kernel_irq_routing_entry *e,
- const struct kvm_irq_routing_entry *ue)
-{
- /* We can't check irqchip_in_kernel() here as some callers are
- * currently initializing the irqchip. Other callers should therefore
- * check kvm_arch_can_set_irq_routing() before calling this function.
- */
- switch (ue->type) {
- case KVM_IRQ_ROUTING_IRQCHIP:
- if (irqchip_split(kvm))
- return -EINVAL;
- e->irqchip.pin = ue->u.irqchip.pin;
- switch (ue->u.irqchip.irqchip) {
- case KVM_IRQCHIP_PIC_SLAVE:
- e->irqchip.pin += PIC_NUM_PINS / 2;
- fallthrough;
- case KVM_IRQCHIP_PIC_MASTER:
- if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
- return -EINVAL;
- e->set = kvm_set_pic_irq;
- break;
- case KVM_IRQCHIP_IOAPIC:
- if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
- return -EINVAL;
- e->set = kvm_set_ioapic_irq;
- break;
- default:
- return -EINVAL;
- }
- e->irqchip.irqchip = ue->u.irqchip.irqchip;
- break;
- case KVM_IRQ_ROUTING_MSI:
- e->set = kvm_set_msi;
- e->msi.address_lo = ue->u.msi.address_lo;
- e->msi.address_hi = ue->u.msi.address_hi;
- e->msi.data = ue->u.msi.data;
-
- if (kvm_msi_route_invalid(kvm, e))
- return -EINVAL;
- break;
-#ifdef CONFIG_KVM_HYPERV
- case KVM_IRQ_ROUTING_HV_SINT:
- e->set = kvm_hv_set_sint;
- e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
- e->hv_sint.sint = ue->u.hv_sint.sint;
- break;
-#endif
-#ifdef CONFIG_KVM_XEN
- case KVM_IRQ_ROUTING_XEN_EVTCHN:
- return kvm_xen_setup_evtchn(kvm, e, ue);
-#endif
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
- struct kvm_vcpu **dest_vcpu)
-{
- int r = 0;
- unsigned long i;
- struct kvm_vcpu *vcpu;
-
- if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
- return true;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!kvm_apic_present(vcpu))
- continue;
-
- if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
- irq->dest_id, irq->dest_mode))
- continue;
-
- if (++r == 2)
- return false;
-
- *dest_vcpu = vcpu;
- }
-
- return r == 1;
-}
-EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
-
-#define IOAPIC_ROUTING_ENTRY(irq) \
- { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
- .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
-#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
-
-#define PIC_ROUTING_ENTRY(irq) \
- { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
- .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
-#define ROUTING_ENTRY2(irq) \
- IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
-
-static const struct kvm_irq_routing_entry default_routing[] = {
- ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
- ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
- ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
- ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
- ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
- ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
- ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
- ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
- ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
- ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
- ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
- ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
-};
-
-int kvm_setup_default_irq_routing(struct kvm *kvm)
-{
- return kvm_set_irq_routing(kvm, default_routing,
- ARRAY_SIZE(default_routing), 0);
-}
-
-void kvm_arch_post_irq_routing_update(struct kvm *kvm)
-{
- if (!irqchip_split(kvm))
- return;
- kvm_make_scan_ioapic_request(kvm);
-}
-
-void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
- ulong *ioapic_handled_vectors)
-{
- struct kvm *kvm = vcpu->kvm;
- struct kvm_kernel_irq_routing_entry *entry;
- struct kvm_irq_routing_table *table;
- u32 i, nr_ioapic_pins;
- int idx;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
- kvm->arch.nr_reserved_ioapic_pins);
- for (i = 0; i < nr_ioapic_pins; ++i) {
- hlist_for_each_entry(entry, &table->map[i], link) {
- struct kvm_lapic_irq irq;
-
- if (entry->type != KVM_IRQ_ROUTING_MSI)
- continue;
-
- kvm_set_msi_irq(vcpu->kvm, entry, &irq);
-
- if (irq.trig_mode &&
- (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
- irq.dest_id, irq.dest_mode) ||
- kvm_apic_pending_eoi(vcpu, irq.vector)))
- __set_bit(irq.vector, ioapic_handled_vectors);
- }
- }
- srcu_read_unlock(&kvm->irq_srcu, idx);
-}
-
-void kvm_arch_irq_routing_update(struct kvm *kvm)
-{
-#ifdef CONFIG_KVM_HYPERV
- kvm_hv_irq_routing_update(kvm);
-#endif
-}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c9de81cc27e1..8172c2042dd6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -27,6 +27,7 @@
#include <linux/export.h>
#include <linux/math64.h>
#include <linux/slab.h>
+#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/msr.h>
@@ -55,9 +56,6 @@
/* 14 is the version for Xeon and Pentium 8.4.8*/
#define APIC_VERSION 0x14UL
#define LAPIC_MMIO_LENGTH (1 << 12)
-/* followed define is not in apicdef.h */
-#define MAX_APIC_VECTOR 256
-#define APIC_VECTORS_PER_REG 32
/*
* Enable local APIC timer advancement (tscdeadline mode only) with adaptive
@@ -79,42 +77,20 @@ module_param(lapic_timer_advance, bool, 0444);
static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data);
static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data);
-static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val)
-{
- *((u32 *) (regs + reg_off)) = val;
-}
-
static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
{
- __kvm_lapic_set_reg(apic->regs, reg_off, val);
-}
-
-static __always_inline u64 __kvm_lapic_get_reg64(char *regs, int reg)
-{
- BUILD_BUG_ON(reg != APIC_ICR);
- return *((u64 *) (regs + reg));
+ apic_set_reg(apic->regs, reg_off, val);
}
static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg)
{
- return __kvm_lapic_get_reg64(apic->regs, reg);
-}
-
-static __always_inline void __kvm_lapic_set_reg64(char *regs, int reg, u64 val)
-{
- BUILD_BUG_ON(reg != APIC_ICR);
- *((u64 *) (regs + reg)) = val;
+ return apic_get_reg64(apic->regs, reg);
}
static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic,
int reg, u64 val)
{
- __kvm_lapic_set_reg64(apic->regs, reg, val);
-}
-
-static inline int apic_test_vector(int vec, void *bitmap)
-{
- return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+ apic_set_reg64(apic->regs, reg, val);
}
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
@@ -125,16 +101,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
apic_test_vector(vector, apic->regs + APIC_IRR);
}
-static inline int __apic_test_and_set_vector(int vec, void *bitmap)
-{
- return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
-{
- return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
@@ -626,21 +592,6 @@ static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = {
[LVT_CMCI] = LVT_MASK | APIC_MODE_MASK
};
-static int find_highest_vector(void *bitmap)
-{
- int vec;
- u32 *reg;
-
- for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG;
- vec >= 0; vec -= APIC_VECTORS_PER_REG) {
- reg = bitmap + REG_POS(vec);
- if (*reg)
- return __fls(*reg) + vec;
- }
-
- return -1;
-}
-
static u8 count_vectors(void *bitmap)
{
int vec;
@@ -648,34 +599,36 @@ static u8 count_vectors(void *bitmap)
u8 count = 0;
for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
- reg = bitmap + REG_POS(vec);
+ reg = bitmap + APIC_VECTOR_TO_REG_OFFSET(vec);
count += hweight32(*reg);
}
return count;
}
-bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
+bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr)
{
+ unsigned long pir_vals[NR_PIR_WORDS];
+ u32 *__pir = (void *)pir_vals;
u32 i, vec;
- u32 pir_val, irr_val, prev_irr_val;
+ u32 irr_val, prev_irr_val;
int max_updated_irr;
max_updated_irr = -1;
*max_irr = -1;
+ if (!pi_harvest_pir(pir, pir_vals))
+ return false;
+
for (i = vec = 0; i <= 7; i++, vec += 32) {
u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
- irr_val = *p_irr;
- pir_val = READ_ONCE(pir[i]);
-
- if (pir_val) {
- pir_val = xchg(&pir[i], 0);
+ irr_val = READ_ONCE(*p_irr);
+ if (__pir[i]) {
prev_irr_val = irr_val;
do {
- irr_val = prev_irr_val | pir_val;
+ irr_val = prev_irr_val | __pir[i];
} while (prev_irr_val != irr_val &&
!try_cmpxchg(p_irr, &prev_irr_val, irr_val));
@@ -691,7 +644,7 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
}
EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
-bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr)
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr)
{
struct kvm_lapic *apic = vcpu->arch.apic;
bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);
@@ -704,7 +657,7 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
static inline int apic_search_irr(struct kvm_lapic *apic)
{
- return find_highest_vector(apic->regs + APIC_IRR);
+ return apic_find_highest_vector(apic->regs + APIC_IRR);
}
static inline int apic_find_highest_irr(struct kvm_lapic *apic)
@@ -727,10 +680,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
{
if (unlikely(apic->apicv_active)) {
- kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
} else {
apic->irr_pending = false;
- kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
if (apic_search_irr(apic) != -1)
apic->irr_pending = true;
}
@@ -742,9 +695,15 @@ void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec)
}
EXPORT_SYMBOL_GPL(kvm_apic_clear_irr);
+static void *apic_vector_to_isr(int vec, struct kvm_lapic *apic)
+{
+ return apic->regs + APIC_ISR + APIC_VECTOR_TO_REG_OFFSET(vec);
+}
+
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
{
- if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+ if (__test_and_set_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
+ apic_vector_to_isr(vec, apic)))
return;
/*
@@ -779,7 +738,7 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
if (likely(apic->highest_isr_cache != -1))
return apic->highest_isr_cache;
- result = find_highest_vector(apic->regs + APIC_ISR);
+ result = apic_find_highest_vector(apic->regs + APIC_ISR);
ASSERT(result == -1 || result >= 16);
return result;
@@ -787,7 +746,8 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
{
- if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+ if (!__test_and_clear_bit(APIC_VECTOR_TO_BIT_NUMBER(vec),
+ apic_vector_to_isr(vec, apic)))
return;
/*
@@ -1330,11 +1290,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
if (trig_mode)
- kvm_lapic_set_vector(vector,
- apic->regs + APIC_TMR);
+ apic_set_vector(vector, apic->regs + APIC_TMR);
else
- kvm_lapic_clear_vector(vector,
- apic->regs + APIC_TMR);
+ apic_clear_vector(vector, apic->regs + APIC_TMR);
}
kvm_x86_call(deliver_interrupt)(apic, delivery_mode,
@@ -1453,12 +1411,20 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{
- int trigger_mode;
+ int __maybe_unused trigger_mode;
/* Eoi the ioapic only if the ioapic doesn't own the vector. */
if (!kvm_ioapic_handles_vector(apic, vector))
return;
+ /*
+ * If the intercepted EOI is for an IRQ that was pending from previous
+ * routing, then re-scan the I/O APIC routes as EOIs for the IRQ likely
+ * no longer need to be intercepted.
+ */
+ if (apic->vcpu->arch.highest_stale_pending_ioapic_eoi == vector)
+ kvm_make_request(KVM_REQ_SCAN_IOAPIC, apic->vcpu);
+
/* Request a KVM exit to inform the userspace IOAPIC. */
if (irqchip_split(apic->vcpu->kvm)) {
apic->vcpu->arch.pending_ioapic_eoi = vector;
@@ -1466,12 +1432,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
return;
}
+#ifdef CONFIG_KVM_IOAPIC
if (apic_test_vector(vector, apic->regs + APIC_TMR))
trigger_mode = IOAPIC_LEVEL_TRIG;
else
trigger_mode = IOAPIC_EDGE_TRIG;
kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+#endif
}
static int apic_set_eoi(struct kvm_lapic *apic)
@@ -3074,12 +3042,12 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
if (!kvm_x86_ops.x2apic_icr_is_split) {
if (set) {
- icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
- (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
- __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+ icr = apic_get_reg(s->regs, APIC_ICR) |
+ (u64)apic_get_reg(s->regs, APIC_ICR2) << 32;
+ apic_set_reg64(s->regs, APIC_ICR, icr);
} else {
- icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
- __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ icr = apic_get_reg64(s->regs, APIC_ICR);
+ apic_set_reg(s->regs, APIC_ICR2, icr >> 32);
}
}
}
@@ -3095,8 +3063,7 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
* Get calculated timer current count for remaining timer period (if
* any) and store it in the returned register set.
*/
- __kvm_lapic_set_reg(s->regs, APIC_TMCCT,
- __apic_read(vcpu->arch.apic, APIC_TMCCT));
+ apic_set_reg(s->regs, APIC_TMCCT, __apic_read(vcpu->arch.apic, APIC_TMCCT));
return kvm_apic_state_fixup(vcpu, s, false);
}
@@ -3136,8 +3103,11 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+#ifdef CONFIG_KVM_IOAPIC
if (ioapic_in_kernel(vcpu->kvm))
kvm_rtc_eoi_tracking_restore_one(vcpu);
+#endif
vcpu->arch.apic_arb_prio = 0;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index e33c969439f7..72de14527698 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -4,6 +4,8 @@
#include <kvm/iodev.h>
+#include <asm/apic.h>
+
#include <linux/kvm_host.h>
#include "hyperv.h"
@@ -21,6 +23,8 @@
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
enum lapic_mode {
LAPIC_MODE_DISABLED = 0,
LAPIC_MODE_INVALID = X2APIC_ENABLE,
@@ -103,8 +107,8 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int shorthand, unsigned int dest, int dest_mode);
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec);
-bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
-bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
+bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr);
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr);
void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
@@ -145,22 +149,9 @@ void kvm_lapic_exit(void);
u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic);
-#define VEC_POS(v) ((v) & (32 - 1))
-#define REG_POS(v) (((v) >> 5) << 4)
-
-static inline void kvm_lapic_clear_vector(int vec, void *bitmap)
-{
- clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
-static inline void kvm_lapic_set_vector(int vec, void *bitmap)
-{
- set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
-}
-
static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
{
- kvm_lapic_set_vector(vec, apic->regs + APIC_IRR);
+ apic_set_vector(vec, apic->regs + APIC_IRR);
/*
* irr_pending must be true if any interrupt is pending; set it after
* APIC_IRR to avoid race with apic_clear_irr
@@ -168,14 +159,9 @@ static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
apic->irr_pending = true;
}
-static inline u32 __kvm_lapic_get_reg(char *regs, int reg_off)
-{
- return *((u32 *) (regs + reg_off));
-}
-
static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
{
- return __kvm_lapic_get_reg(apic->regs, reg_off);
+ return apic_get_reg(apic->regs, reg_off);
}
DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7b3f1783ab3c..6e838cb6c9e1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1983,14 +1983,35 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp)
return true;
}
+static __ro_after_init HLIST_HEAD(empty_page_hash);
+
+static struct hlist_head *kvm_get_mmu_page_hash(struct kvm *kvm, gfn_t gfn)
+{
+ /*
+ * Ensure the load of the hash table pointer itself is ordered before
+ * loads to walk the table. The pointer is set at runtime outside of
+ * mmu_lock when the TDP MMU is enabled, i.e. when the hash table of
+ * shadow pages becomes necessary only when KVM needs to shadow L1's
+ * TDP for an L2 guest. Pairs with the smp_store_release() in
+ * kvm_mmu_alloc_page_hash().
+ */
+ struct hlist_head *page_hash = smp_load_acquire(&kvm->arch.mmu_page_hash);
+
+ lockdep_assert_held(&kvm->mmu_lock);
+
+ if (!page_hash)
+ return &empty_page_hash;
+
+ return &page_hash[kvm_page_table_hashfn(gfn)];
+}
+
#define for_each_valid_sp(_kvm, _sp, _list) \
hlist_for_each_entry(_sp, _list, hash_link) \
if (is_obsolete_sp((_kvm), (_sp))) { \
} else
#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \
- for_each_valid_sp(_kvm, _sp, \
- &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
+ for_each_valid_sp(_kvm, _sp, kvm_get_mmu_page_hash(_kvm, _gfn)) \
if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
@@ -2358,6 +2379,12 @@ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
struct kvm_mmu_page *sp;
bool created = false;
+ /*
+ * No need for memory barriers, unlike in kvm_get_mmu_page_hash(), as
+ * mmu_page_hash must be set prior to creating the first shadow root,
+ * i.e. reaching this point is fully serialized by slots_arch_lock.
+ */
+ BUG_ON(!kvm->arch.mmu_page_hash);
sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
@@ -3020,7 +3047,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
}
if (is_shadow_present_pte(*sptep)) {
- if (prefetch)
+ if (prefetch && is_last_spte(*sptep, level) &&
+ pfn == spte_to_pfn(*sptep))
return RET_PF_SPURIOUS;
/*
@@ -3034,7 +3062,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
child = spte_to_child_sp(pte);
drop_parent_pte(vcpu->kvm, child, sptep);
flush = true;
- } else if (pfn != spte_to_pfn(*sptep)) {
+ } else if (WARN_ON_ONCE(pfn != spte_to_pfn(*sptep))) {
drop_spte(vcpu->kvm, sptep);
flush = true;
} else
@@ -3881,6 +3909,28 @@ out_unlock:
return r;
}
+static int kvm_mmu_alloc_page_hash(struct kvm *kvm)
+{
+ struct hlist_head *h;
+
+ if (kvm->arch.mmu_page_hash)
+ return 0;
+
+ h = kvcalloc(KVM_NUM_MMU_PAGES, sizeof(*h), GFP_KERNEL_ACCOUNT);
+ if (!h)
+ return -ENOMEM;
+
+ /*
+ * Ensure the hash table pointer is set only after all stores to zero
+ * the memory are retired. Pairs with the smp_load_acquire() in
+ * kvm_get_mmu_page_hash(). Note, mmu_lock must be held for write to
+ * add (or remove) shadow pages, and so readers are guaranteed to see
+ * an empty list for their current mmu_lock critical section.
+ */
+ smp_store_release(&kvm->arch.mmu_page_hash, h);
+ return 0;
+}
+
static int mmu_first_shadow_root_alloc(struct kvm *kvm)
{
struct kvm_memslots *slots;
@@ -3900,9 +3950,13 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
if (kvm_shadow_root_allocated(kvm))
goto out_unlock;
+ r = kvm_mmu_alloc_page_hash(kvm);
+ if (r)
+ goto out_unlock;
+
/*
- * Check if anything actually needs to be allocated, e.g. all metadata
- * will be allocated upfront if TDP is disabled.
+ * Check if memslot metadata actually needs to be allocated, e.g. all
+ * metadata will be allocated upfront if TDP is disabled.
*/
if (kvm_memslots_have_rmaps(kvm) &&
kvm_page_track_write_tracking_enabled(kvm))
@@ -4895,12 +4949,16 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
{
u64 error_code = PFERR_GUEST_FINAL_MASK;
u8 level = PG_LEVEL_4K;
+ u64 direct_bits;
u64 end;
int r;
if (!vcpu->kvm->arch.pre_fault_allowed)
return -EOPNOTSUPP;
+ if (kvm_is_gfn_alias(vcpu->kvm, gpa_to_gfn(range->gpa)))
+ return -EINVAL;
+
/*
* reload is efficient when called repeatedly, so we can do it on
* every iteration.
@@ -4909,15 +4967,18 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
if (r)
return r;
+ direct_bits = 0;
if (kvm_arch_has_private_mem(vcpu->kvm) &&
kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
error_code |= PFERR_PRIVATE_ACCESS;
+ else
+ direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
/*
* Shadow paging uses GVA for kvm page fault, so restrict to
* two-dimensional paging.
*/
- r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
+ r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level);
if (r < 0)
return r;
@@ -6674,15 +6735,22 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
}
-void kvm_mmu_init_vm(struct kvm *kvm)
+int kvm_mmu_init_vm(struct kvm *kvm)
{
+ int r;
+
kvm->arch.shadow_mmio_value = shadow_mmio_value;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
- if (tdp_mmu_enabled)
+ if (tdp_mmu_enabled) {
kvm_mmu_init_tdp_mmu(kvm);
+ } else {
+ r = kvm_mmu_alloc_page_hash(kvm);
+ if (r)
+ return r;
+ }
kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
@@ -6691,6 +6759,7 @@ void kvm_mmu_init_vm(struct kvm *kvm)
kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
+ return 0;
}
static void mmu_free_vm_memory_caches(struct kvm *kvm)
@@ -6702,6 +6771,8 @@ static void mmu_free_vm_memory_caches(struct kvm *kvm)
void kvm_mmu_uninit_vm(struct kvm *kvm)
{
+ kvfree(kvm->arch.mmu_page_hash);
+
if (tdp_mmu_enabled)
kvm_mmu_uninit_tdp_mmu(kvm);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index db8f33e4de62..65f3c89d7c5d 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -103,6 +103,9 @@ struct kvm_mmu_page {
int root_count;
refcount_t tdp_mmu_root_count;
};
+
+ bool has_mapped_host_mmio;
+
union {
/* These two members aren't used for TDP MMU */
struct {
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 68e323568e95..ed762bb4b007 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -804,9 +804,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (r != RET_PF_CONTINUE)
return r;
+#if PTTYPE != PTTYPE_EPT
/*
- * Do not change pte_access if the pfn is a mmio page, otherwise
- * we will cache the incorrect access into mmio spte.
+ * Treat the guest PTE protections as writable, supervisor-only if this
+ * is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore
+ * PTE.W if CR0.WP=0). Don't change the access type for emulated MMIO,
+ * otherwise KVM will cache incorrect access information in the SPTE.
*/
if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
!is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
@@ -822,6 +825,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (is_cr4_smep(vcpu->arch.mmu))
walker.pte_access &= ~ACC_EXEC_MASK;
}
+#endif
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index cfce03d8f123..df31039b5d63 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -104,7 +104,7 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
return spte;
}
-static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
+static bool __kvm_is_mmio_pfn(kvm_pfn_t pfn)
{
if (pfn_valid(pfn))
return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
@@ -125,6 +125,35 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
E820_TYPE_RAM);
}
+static bool kvm_is_mmio_pfn(kvm_pfn_t pfn, int *is_host_mmio)
+{
+ /*
+ * Determining if a PFN is host MMIO is relative expensive. Cache the
+ * result locally (in the sole caller) to avoid doing the full query
+ * multiple times when creating a single SPTE.
+ */
+ if (*is_host_mmio < 0)
+ *is_host_mmio = __kvm_is_mmio_pfn(pfn);
+
+ return *is_host_mmio;
+}
+
+static void kvm_track_host_mmio_mapping(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (root)
+ WRITE_ONCE(root->has_mapped_host_mmio, true);
+ else
+ WRITE_ONCE(vcpu->kvm->arch.has_mapped_host_mmio, true);
+
+ /*
+ * Force vCPUs to exit and flush CPU buffers if the vCPU is using the
+ * affected root(s).
+ */
+ kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+}
+
/*
* Returns true if the SPTE needs to be updated atomically due to having bits
* that may be changed without holding mmu_lock, and for which KVM must not
@@ -162,6 +191,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
{
int level = sp->role.level;
u64 spte = SPTE_MMU_PRESENT_MASK;
+ int is_host_mmio = -1;
bool wrprot = false;
/*
@@ -209,13 +239,15 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
- spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, kvm_is_mmio_pfn(pfn));
+ if (kvm_x86_ops.get_mt_mask)
+ spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn, &is_host_mmio));
if (host_writable)
spte |= shadow_host_writable_mask;
else
pte_access &= ~ACC_WRITE_MASK;
- if (shadow_me_value && !kvm_is_mmio_pfn(pfn))
+ if (shadow_me_value && !kvm_is_mmio_pfn(pfn, &is_host_mmio))
spte |= shadow_me_value;
spte |= (u64)pfn << PAGE_SHIFT;
@@ -260,6 +292,11 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
}
+ if (static_branch_unlikely(&cpu_buf_vm_clear) &&
+ !kvm_vcpu_can_access_host_mmio(vcpu) &&
+ kvm_is_mmio_pfn(pfn, &is_host_mmio))
+ kvm_track_host_mmio_mapping(vcpu);
+
*new_spte = spte;
return wrprot;
}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 1e94f081bdaf..3133f066927e 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -280,6 +280,16 @@ static inline bool is_mirror_sptep(tdp_ptep_t sptep)
return is_mirror_sp(sptep_to_sp(rcu_dereference(sptep)));
}
+static inline bool kvm_vcpu_can_access_host_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (root)
+ return READ_ONCE(root->has_mapped_host_mmio);
+
+ return READ_ONCE(vcpu->kvm->arch.has_mapped_host_mmio);
+}
+
static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
{
return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 405874f4d088..7f3d7229b2c1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -378,7 +378,7 @@ static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
/* Zapping leaf spte is allowed only when write lock is held. */
lockdep_assert_held_write(&kvm->mmu_lock);
/* Because write lock is held, operation should success. */
- ret = static_call(kvm_x86_remove_external_spte)(kvm, gfn, level, old_pfn);
+ ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn);
KVM_BUG_ON(ret, kvm);
}
@@ -485,8 +485,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
}
if (is_mirror_sp(sp) &&
- WARN_ON(static_call(kvm_x86_free_external_spt)(kvm, base_gfn, sp->role.level,
- sp->external_spt))) {
+ WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level,
+ sp->external_spt))) {
/*
* Failed to free page table page in mirror page table and
* there is nothing to do further.
@@ -538,12 +538,12 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
* external page table, or leaf.
*/
if (is_leaf) {
- ret = static_call(kvm_x86_set_external_spte)(kvm, gfn, level, new_pfn);
+ ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn);
} else {
void *external_spt = get_external_spt(gfn, new_spte, level);
KVM_BUG_ON(!external_spt, kvm);
- ret = static_call(kvm_x86_link_external_spt)(kvm, gfn, level, external_spt);
+ ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
}
if (ret)
__kvm_tdp_mmu_write_spte(sptep, old_spte);
@@ -1153,13 +1153,12 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
return RET_PF_RETRY;
- if (fault->prefetch && is_shadow_present_pte(iter->old_spte))
- return RET_PF_SPURIOUS;
-
if (is_shadow_present_pte(iter->old_spte) &&
- is_access_allowed(fault, iter->old_spte) &&
- is_last_spte(iter->old_spte, iter->level))
+ (fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
+ is_last_spte(iter->old_spte, iter->level)) {
+ WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
return RET_PF_SPURIOUS;
+ }
if (unlikely(!fault->slot))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index fde0ae986003..c53b92379e6e 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -52,6 +52,10 @@
/* CPUID level 0x80000022 (EAX) */
#define KVM_X86_FEATURE_PERFMON_V2 KVM_X86_FEATURE(CPUID_8000_0022_EAX, 0)
+/* CPUID level 0x80000021 (ECX) */
+#define KVM_X86_FEATURE_TSA_SQ_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 1)
+#define KVM_X86_FEATURE_TSA_L1_NO KVM_X86_FEATURE(CPUID_8000_0021_ECX, 2)
+
struct cpuid_reg {
u32 function;
u32 index;
@@ -82,6 +86,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX},
[CPUID_7_2_EDX] = { 7, 2, CPUID_EDX},
[CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX},
+ [CPUID_8000_0021_ECX] = {0x80000021, 0, CPUID_ECX},
};
/*
@@ -121,6 +126,8 @@ static __always_inline u32 __feature_translate(int x86_feature)
KVM_X86_TRANSLATE_FEATURE(PERFMON_V2);
KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL);
KVM_X86_TRANSLATE_FEATURE(BHI_CTRL);
+ KVM_X86_TRANSLATE_FEATURE(TSA_SQ_NO);
+ KVM_X86_TRANSLATE_FEATURE(TSA_L1_NO);
default:
return x86_feature;
}
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 067f8e3f5a0d..a34c5c3b164e 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -18,6 +18,7 @@
#include <linux/hashtable.h>
#include <linux/amd-iommu.h>
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
#include <asm/irq_remapping.h>
#include <asm/msr.h>
@@ -29,36 +30,39 @@
#include "svm.h"
/*
- * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e the vCPU ID,
- * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry
- * if an interrupt can't be delivered, e.g. because the vCPU isn't running.
+ * Encode the arbitrary VM ID and the vCPU's _index_ into the GATag so that
+ * KVM can retrieve the correct vCPU from a GALog entry if an interrupt can't
+ * be delivered, e.g. because the vCPU isn't running. Use the vCPU's index
+ * instead of its ID (a.k.a. its default APIC ID), as KVM is guaranteed a fast
+ * lookup on the index, where as vCPUs whose index doesn't match their ID need
+ * to walk the entire xarray of vCPUs in the worst case scenario.
*
- * For the vCPU ID, use however many bits are currently allowed for the max
+ * For the vCPU index, use however many bits are currently allowed for the max
* guest physical APIC ID (limited by the size of the physical ID table), and
* use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the
* size of the GATag is defined by hardware (32 bits), but is an opaque value
* as far as hardware is concerned.
*/
-#define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
+#define AVIC_VCPU_IDX_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
#define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
#define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)
#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
-#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
+#define AVIC_GATAG_TO_VCPUIDX(x) (x & AVIC_VCPU_IDX_MASK)
-#define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
- ((vcpu_id) & AVIC_VCPU_ID_MASK))
-#define AVIC_GATAG(vm_id, vcpu_id) \
+#define __AVIC_GATAG(vm_id, vcpu_idx) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
+ ((vcpu_idx) & AVIC_VCPU_IDX_MASK))
+#define AVIC_GATAG(vm_id, vcpu_idx) \
({ \
- u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \
+ u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_idx); \
\
- WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \
+ WARN_ON_ONCE(AVIC_GATAG_TO_VCPUIDX(ga_tag) != (vcpu_idx)); \
WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \
ga_tag; \
})
-static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
+static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_IDX_MASK) == -1u);
static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);
@@ -75,14 +79,6 @@ static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
bool x2avic_enabled;
-/*
- * This is a wrapper of struct amd_iommu_ir_data.
- */
-struct amd_svm_iommu_ir {
- struct list_head node; /* Used by SVM for per-vcpu ir_list */
- void *data; /* Storing pointer to struct amd_ir_data */
-};
-
static void avic_activate_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
@@ -147,16 +143,16 @@ int avic_ga_log_notifier(u32 ga_tag)
struct kvm_svm *kvm_svm;
struct kvm_vcpu *vcpu = NULL;
u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
- u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+ u32 vcpu_idx = AVIC_GATAG_TO_VCPUIDX(ga_tag);
- pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
- trace_kvm_avic_ga_log(vm_id, vcpu_id);
+ pr_debug("SVM: %s: vm_id=%#x, vcpu_idx=%#x\n", __func__, vm_id, vcpu_idx);
+ trace_kvm_avic_ga_log(vm_id, vcpu_idx);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
if (kvm_svm->avic_vm_id != vm_id)
continue;
- vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
+ vcpu = kvm_get_vcpu(&kvm_svm->kvm, vcpu_idx);
break;
}
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
@@ -180,10 +176,8 @@ void avic_vm_destroy(struct kvm *kvm)
if (!enable_apicv)
return;
- if (kvm_svm->avic_logical_id_table_page)
- __free_page(kvm_svm->avic_logical_id_table_page);
- if (kvm_svm->avic_physical_id_table_page)
- __free_page(kvm_svm->avic_physical_id_table_page);
+ free_page((unsigned long)kvm_svm->avic_logical_id_table);
+ free_page((unsigned long)kvm_svm->avic_physical_id_table);
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
hash_del(&kvm_svm->hnode);
@@ -196,27 +190,19 @@ int avic_vm_init(struct kvm *kvm)
int err = -ENOMEM;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
struct kvm_svm *k2;
- struct page *p_page;
- struct page *l_page;
u32 vm_id;
if (!enable_apicv)
return 0;
- /* Allocating physical APIC ID table (4KB) */
- p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!p_page)
+ kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!kvm_svm->avic_physical_id_table)
goto free_avic;
- kvm_svm->avic_physical_id_table_page = p_page;
-
- /* Allocating logical APIC ID table (4KB) */
- l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!l_page)
+ kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!kvm_svm->avic_logical_id_table)
goto free_avic;
- kvm_svm->avic_logical_id_table_page = l_page;
-
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
again:
vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
@@ -242,17 +228,19 @@ free_avic:
return err;
}
+static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
+{
+ return __sme_set(__pa(svm->vcpu.arch.apic->regs));
+}
+
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
- phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
- phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
- phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
- vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
- vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
- vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
- vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
+ vmcb->control.avic_backing_page = avic_get_backing_page_address(svm);
+ vmcb->control.avic_logical_id = __sme_set(__pa(kvm_svm->avic_logical_id_table));
+ vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
+ vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;
if (kvm_apicv_activated(svm->vcpu.kvm))
avic_activate_vmcb(svm);
@@ -260,32 +248,31 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
avic_deactivate_vmcb(svm);
}
-static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
- unsigned int index)
-{
- u64 *avic_physical_id_table;
- struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
-
- if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
- (index > X2AVIC_MAX_PHYSICAL_ID))
- return NULL;
-
- avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
-
- return &avic_physical_id_table[index];
-}
-
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
- u64 *entry, new_entry;
- int id = vcpu->vcpu_id;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
struct vcpu_svm *svm = to_svm(vcpu);
+ u32 id = vcpu->vcpu_id;
+ u64 new_entry;
+ /*
+ * Inhibit AVIC if the vCPU ID is bigger than what is supported by AVIC
+ * hardware. Immediately clear apicv_active, i.e. don't wait until the
+ * KVM_REQ_APICV_UPDATE request is processed on the first KVM_RUN, as
+ * avic_vcpu_load() expects to be called if and only if the vCPU has
+ * fully initialized AVIC.
+ */
if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
- (id > X2AVIC_MAX_PHYSICAL_ID))
- return -EINVAL;
+ (id > X2AVIC_MAX_PHYSICAL_ID)) {
+ kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
+ vcpu->arch.apic->apicv_active = false;
+ return 0;
+ }
+
+ BUILD_BUG_ON((AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE ||
+ (X2AVIC_MAX_PHYSICAL_ID + 1) * sizeof(new_entry) > PAGE_SIZE);
- if (!vcpu->arch.apic->regs)
+ if (WARN_ON_ONCE(!vcpu->arch.apic->regs))
return -EINVAL;
if (kvm_apicv_activated(vcpu->kvm)) {
@@ -302,19 +289,21 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return ret;
}
- svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs);
+ /* Note, fls64() returns the bit position, +1. */
+ BUILD_BUG_ON(__PHYSICAL_MASK_SHIFT >
+ fls64(AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK));
/* Setting AVIC backing page address in the phy APIC ID table */
- entry = avic_get_physical_id_entry(vcpu, id);
- if (!entry)
- return -EINVAL;
+ new_entry = avic_get_backing_page_address(svm) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+ svm->avic_physical_id_entry = new_entry;
- new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
- AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
- AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
- WRITE_ONCE(*entry, new_entry);
-
- svm->avic_physical_id_cache = entry;
+ /*
+ * Initialize the real table, as vCPUs must have a valid entry in order
+ * for broadcast IPIs to function correctly (broadcast IPIs ignore
+ * invalid entries, i.e. aren't guaranteed to generate a VM-Exit).
+ */
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[id], new_entry);
return 0;
}
@@ -448,7 +437,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
if (apic_x2apic_mode(source))
avic_logical_id_table = NULL;
else
- avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
+ avic_logical_id_table = kvm_svm->avic_logical_id_table;
/*
* AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
@@ -550,7 +539,6 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
- u32 *logical_apic_id_table;
u32 cluster, index;
ldr = GET_APIC_LOGICAL_ID(ldr);
@@ -571,9 +559,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
return NULL;
index += (cluster << 2);
- logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
-
- return &logical_apic_id_table[index];
+ return &kvm_svm->avic_logical_id_table[index];
}
static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
@@ -722,6 +708,9 @@ int avic_init_vcpu(struct vcpu_svm *svm)
int ret;
struct kvm_vcpu *vcpu = &svm->vcpu;
+ INIT_LIST_HEAD(&svm->ir_list);
+ spin_lock_init(&svm->ir_list_lock);
+
if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
return 0;
@@ -729,8 +718,6 @@ int avic_init_vcpu(struct vcpu_svm *svm)
if (ret)
return ret;
- INIT_LIST_HEAD(&svm->ir_list);
- spin_lock_init(&svm->ir_list_lock);
svm->dfr_reg = APIC_DFR_FLAT;
return ret;
@@ -742,316 +729,161 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
-static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
+static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
{
- int ret = 0;
+ struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
unsigned long flags;
- struct amd_svm_iommu_ir *ir;
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm))
- return 0;
- /*
- * Here, we go through the per-vcpu ir_list to update all existing
- * interrupt remapping table entry targeting this vcpu.
- */
- spin_lock_irqsave(&svm->ir_list_lock, flags);
-
- if (list_empty(&svm->ir_list))
- goto out;
+ if (!vcpu)
+ return;
- list_for_each_entry(ir, &svm->ir_list, node) {
- if (activate)
- ret = amd_iommu_activate_guest_mode(ir->data);
- else
- ret = amd_iommu_deactivate_guest_mode(ir->data);
- if (ret)
- break;
- }
-out:
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
- return ret;
+ spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
+ list_del(&irqfd->vcpu_list);
+ spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
}
-static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector)
{
- unsigned long flags;
- struct amd_svm_iommu_ir *cur;
-
- spin_lock_irqsave(&svm->ir_list_lock, flags);
- list_for_each_entry(cur, &svm->ir_list, node) {
- if (cur->data != pi->ir_data)
- continue;
- list_del(&cur->node);
- kfree(cur);
- break;
- }
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-}
-
-static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
-{
- int ret = 0;
- unsigned long flags;
- struct amd_svm_iommu_ir *ir;
- u64 entry;
-
- if (WARN_ON_ONCE(!pi->ir_data))
- return -EINVAL;
-
- /**
- * In some cases, the existing irte is updated and re-set,
- * so we need to check here if it's already been * added
- * to the ir_list.
- */
- if (pi->prev_ga_tag) {
- struct kvm *kvm = svm->vcpu.kvm;
- u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
- struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
- struct vcpu_svm *prev_svm;
-
- if (!prev_vcpu) {
- ret = -EINVAL;
- goto out;
- }
-
- prev_svm = to_svm(prev_vcpu);
- svm_ir_list_del(prev_svm, pi);
- }
-
- /**
- * Allocating new amd_iommu_pi_data, which will get
- * add to the per-vcpu ir_list.
- */
- ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT);
- if (!ir) {
- ret = -ENOMEM;
- goto out;
- }
- ir->data = pi->ir_data;
-
- spin_lock_irqsave(&svm->ir_list_lock, flags);
-
/*
- * Update the target pCPU for IOMMU doorbells if the vCPU is running.
- * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM
- * will update the pCPU info when the vCPU awkened and/or scheduled in.
- * See also avic_vcpu_load().
+ * If the IRQ was affined to a different vCPU, remove the IRTE metadata
+ * from the *previous* vCPU's list.
*/
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
- if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
- amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
- true, pi->ir_data);
-
- list_add(&ir->node, &svm->ir_list);
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-out:
- return ret;
-}
+ svm_ir_list_del(irqfd);
-/*
- * Note:
- * The HW cannot support posting multicast/broadcast
- * interrupts to a vCPU. So, we still use legacy interrupt
- * remapping for these kind of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with single CPU as the destination, e.g. user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- */
-static int
-get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
- struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
-{
- struct kvm_lapic_irq irq;
- struct kvm_vcpu *vcpu = NULL;
-
- kvm_set_msi_irq(kvm, e, &irq);
-
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq)) {
- pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
- __func__, irq.vector);
- return -1;
- }
-
- pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
- irq.vector);
- *svm = to_svm(vcpu);
- vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
- vcpu_info->vector = irq.vector;
-
- return 0;
-}
-
-/*
- * avic_pi_update_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- struct kvm_kernel_irq_routing_entry *e;
- struct kvm_irq_routing_table *irq_rt;
- bool enable_remapped_mode = true;
- int idx, ret = 0;
-
- if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
- return 0;
-
- pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
- __func__, host_irq, guest_irq, set);
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-
- if (guest_irq >= irq_rt->nr_rt_entries ||
- hlist_empty(&irq_rt->map[guest_irq])) {
- pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
- guest_irq, irq_rt->nr_rt_entries);
- goto out;
- }
-
- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
- struct vcpu_data vcpu_info;
- struct vcpu_svm *svm = NULL;
+ if (vcpu) {
+ /*
+ * Try to enable guest_mode in IRTE, unless AVIC is inhibited,
+ * in which case configure the IRTE for legacy mode, but track
+ * the IRTE metadata so that it can be converted to guest mode
+ * if AVIC is enabled/uninhibited in the future.
+ */
+ struct amd_iommu_pi_data pi_data = {
+ .ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
+ vcpu->vcpu_idx),
+ .is_guest_mode = kvm_vcpu_apicv_active(vcpu),
+ .vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
+ .vector = vector,
+ };
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 entry;
+ int ret;
- if (e->type != KVM_IRQ_ROUTING_MSI)
- continue;
+ /*
+ * Prevent the vCPU from being scheduled out or migrated until
+ * the IRTE is updated and its metadata has been added to the
+ * list of IRQs being posted to the vCPU, to ensure the IRTE
+ * isn't programmed with stale pCPU/IsRunning information.
+ */
+ guard(spinlock_irqsave)(&svm->ir_list_lock);
- /**
- * Here, we setup with legacy mode in the following cases:
- * 1. When cannot target interrupt to a specific vcpu.
- * 2. Unsetting posted interrupt.
- * 3. APIC virtualization is disabled for the vcpu.
- * 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
+ /*
+ * Update the target pCPU for IOMMU doorbells if the vCPU is
+ * running. If the vCPU is NOT running, i.e. is blocking or
+ * scheduled out, KVM will update the pCPU info when the vCPU
+ * is awakened and/or scheduled in. See also avic_vcpu_load().
*/
- if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
- kvm_vcpu_apicv_active(&svm->vcpu)) {
- struct amd_iommu_pi_data pi;
-
- enable_remapped_mode = false;
-
- /* Try to enable guest_mode in IRTE */
- pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
- AVIC_HPA_MASK);
- pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
- svm->vcpu.vcpu_id);
- pi.is_guest_mode = true;
- pi.vcpu_data = &vcpu_info;
- ret = irq_set_vcpu_affinity(host_irq, &pi);
-
- /**
- * Here, we successfully setting up vcpu affinity in
- * IOMMU guest mode. Now, we need to store the posted
- * interrupt information in a per-vcpu ir_list so that
- * we can reference to them directly when we update vcpu
- * scheduling information in IOMMU irte.
- */
- if (!ret && pi.is_guest_mode)
- svm_ir_list_add(svm, &pi);
+ entry = svm->avic_physical_id_entry;
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
+ pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ } else {
+ pi_data.cpu = -1;
+ pi_data.ga_log_intr = entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
}
- if (!ret && svm) {
- trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
- e->gsi, vcpu_info.vector,
- vcpu_info.pi_desc_addr, set);
- }
+ ret = irq_set_vcpu_affinity(host_irq, &pi_data);
+ if (ret)
+ return ret;
- if (ret < 0) {
- pr_err("%s: failed to update PI IRTE\n", __func__);
- goto out;
+ /*
+ * Revert to legacy mode if the IOMMU didn't provide metadata
+ * for the IRTE, which KVM needs to keep the IRTE up-to-date,
+ * e.g. if the vCPU is migrated or AVIC is disabled.
+ */
+ if (WARN_ON_ONCE(!pi_data.ir_data)) {
+ irq_set_vcpu_affinity(host_irq, NULL);
+ return -EIO;
}
- }
- ret = 0;
- if (enable_remapped_mode) {
- /* Use legacy mode in IRTE */
- struct amd_iommu_pi_data pi;
+ irqfd->irq_bypass_data = pi_data.ir_data;
+ list_add(&irqfd->vcpu_list, &svm->ir_list);
+ return 0;
+ }
+ return irq_set_vcpu_affinity(host_irq, NULL);
+}
- /**
- * Here, pi is used to:
- * - Tell IOMMU to use legacy mode for this interrupt.
- * - Retrieve ga_tag of prior interrupt remapping data.
- */
- pi.prev_ga_tag = 0;
- pi.is_guest_mode = false;
- ret = irq_set_vcpu_affinity(host_irq, &pi);
+enum avic_vcpu_action {
+ /*
+ * There is no need to differentiate between activate and deactivate,
+ * as KVM only refreshes AVIC state when the vCPU is scheduled in and
+ * isn't blocking, i.e. the pCPU must always be (in)valid when AVIC is
+ * being (de)activated.
+ */
+ AVIC_TOGGLE_ON_OFF = BIT(0),
+ AVIC_ACTIVATE = AVIC_TOGGLE_ON_OFF,
+ AVIC_DEACTIVATE = AVIC_TOGGLE_ON_OFF,
- /**
- * Check if the posted interrupt was previously
- * setup with the guest_mode by checking if the ga_tag
- * was cached. If so, we need to clean up the per-vcpu
- * ir_list.
- */
- if (!ret && pi.prev_ga_tag) {
- int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
- struct kvm_vcpu *vcpu;
+ /*
+ * No unique action is required to deal with a vCPU that stops/starts
+ * running. A vCPU that starts running by definition stops blocking as
+ * well, and a vCPU that stops running can't have been blocking, i.e.
+ * doesn't need to toggle GALogIntr.
+ */
+ AVIC_START_RUNNING = 0,
+ AVIC_STOP_RUNNING = 0,
- vcpu = kvm_get_vcpu_by_id(kvm, id);
- if (vcpu)
- svm_ir_list_del(to_svm(vcpu), &pi);
- }
- }
-out:
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
-}
+ /*
+ * When a vCPU starts blocking, KVM needs to set the GALogIntr flag
+ * int all associated IRTEs so that KVM can wake the vCPU if an IRQ is
+ * sent to the vCPU.
+ */
+ AVIC_START_BLOCKING = BIT(1),
+};
-static inline int
-avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
+static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
+ enum avic_vcpu_action action)
{
- int ret = 0;
- struct amd_svm_iommu_ir *ir;
+ bool ga_log_intr = (action & AVIC_START_BLOCKING);
struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_kernel_irqfd *irqfd;
lockdep_assert_held(&svm->ir_list_lock);
- if (!kvm_arch_has_assigned_device(vcpu->kvm))
- return 0;
-
/*
* Here, we go through the per-vcpu ir_list to update all existing
* interrupt remapping table entry targeting this vcpu.
*/
if (list_empty(&svm->ir_list))
- return 0;
+ return;
- list_for_each_entry(ir, &svm->ir_list, node) {
- ret = amd_iommu_update_ga(cpu, r, ir->data);
- if (ret)
- return ret;
+ list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
+ void *data = irqfd->irq_bypass_data;
+
+ if (!(action & AVIC_TOGGLE_ON_OFF))
+ WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, ga_log_intr));
+ else if (cpu >= 0)
+ WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, ga_log_intr));
+ else
+ WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
}
- return 0;
}
-void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
+ enum avic_vcpu_action action)
{
- u64 entry;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;
+ u64 entry;
lockdep_assert_preemption_disabled();
if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
- /*
- * No need to update anything if the vCPU is blocking, i.e. if the vCPU
- * is being scheduled in after being preempted. The CPU entries in the
- * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
- * If the vCPU was migrated, its new CPU value will be stuffed when the
- * vCPU unblocks.
- */
- if (kvm_vcpu_is_blocking(vcpu))
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
return;
/*
@@ -1063,38 +895,57 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*/
spin_lock_irqsave(&svm->ir_list_lock, flags);
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ entry = svm->avic_physical_id_entry;
WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ entry &= ~(AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK |
+ AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
- avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
+ svm->avic_physical_id_entry = entry;
+
+ /*
+ * If IPI virtualization is disabled, clear IsRunning when updating the
+ * actual Physical ID table, so that the CPU never sees IsRunning=1.
+ * Keep the APIC ID up-to-date in the entry to minimize the chances of
+ * things going sideways if hardware peeks at the ID.
+ */
+ if (!enable_ipiv)
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
-void avic_vcpu_put(struct kvm_vcpu *vcpu)
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- u64 entry;
+ /*
+ * No need to update anything if the vCPU is blocking, i.e. if the vCPU
+ * is being scheduled in after being preempted. The CPU entries in the
+ * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
+ * If the vCPU was migrated, its new CPU value will be stuffed when the
+ * vCPU unblocks.
+ */
+ if (kvm_vcpu_is_blocking(vcpu))
+ return;
+
+ __avic_vcpu_load(vcpu, cpu, AVIC_START_RUNNING);
+}
+
+static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
struct vcpu_svm *svm = to_svm(vcpu);
unsigned long flags;
+ u64 entry = svm->avic_physical_id_entry;
lockdep_assert_preemption_disabled();
- /*
- * Note, reading the Physical ID entry outside of ir_list_lock is safe
- * as only the pCPU that has loaded (or is loading) the vCPU is allowed
- * to modify the entry, and preemption is disabled. I.e. the vCPU
- * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
- * recursively.
- */
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
-
- /* Nothing to do if IsRunning == '0' due to vCPU blocking. */
- if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
+ if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE))
return;
/*
@@ -1107,13 +958,62 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
*/
spin_lock_irqsave(&svm->ir_list_lock, flags);
- avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+ avic_update_iommu_vcpu_affinity(vcpu, -1, action);
+
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR);
+ /*
+ * Keep the previous APIC ID in the entry so that a rogue doorbell from
+ * hardware is at least restricted to a CPU associated with the vCPU.
+ */
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+
+ if (enable_ipiv)
+ WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
+
+ /*
+ * Note! Don't set AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR in the table as
+ * it's a synthetic flag that usurps an unused should-be-zero bit.
+ */
+ if (action & AVIC_START_BLOCKING)
+ entry |= AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR;
+
+ svm->avic_physical_id_entry = entry;
spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note, reading the Physical ID entry outside of ir_list_lock is safe
+ * as only the pCPU that has loaded (or is loading) the vCPU is allowed
+ * to modify the entry, and preemption is disabled. I.e. the vCPU
+ * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
+ * recursively.
+ */
+ u64 entry = to_svm(vcpu)->avic_physical_id_entry;
+
+ /*
+ * Nothing to do if IsRunning == '0' due to vCPU blocking, i.e. if the
+ * vCPU is preempted while its in the process of blocking. WARN if the
+ * vCPU wasn't running and isn't blocking, KVM shouldn't attempt to put
+ * the AVIC if it wasn't previously loaded.
+ */
+ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) {
+ if (WARN_ON_ONCE(!kvm_vcpu_is_blocking(vcpu)))
+ return;
+ /*
+ * The vCPU was preempted while blocking, ensure its IRTEs are
+ * configured to generate GA Log Interrupts.
+ */
+ if (!(WARN_ON_ONCE(!(entry & AVIC_PHYSICAL_ID_ENTRY_GA_LOG_INTR))))
+ return;
+ }
+
+ __avic_vcpu_put(vcpu, kvm_vcpu_is_blocking(vcpu) ? AVIC_START_BLOCKING :
+ AVIC_STOP_RUNNING);
}
void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
@@ -1142,19 +1042,18 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
- bool activated = kvm_vcpu_apicv_active(vcpu);
-
if (!enable_apicv)
return;
+ /* APICv should only be toggled on/off while the vCPU is running. */
+ WARN_ON_ONCE(kvm_vcpu_is_blocking(vcpu));
+
avic_refresh_virtual_apic_mode(vcpu);
- if (activated)
- avic_vcpu_load(vcpu, vcpu->cpu);
+ if (kvm_vcpu_apicv_active(vcpu))
+ __avic_vcpu_load(vcpu, vcpu->cpu, AVIC_ACTIVATE);
else
- avic_vcpu_put(vcpu);
-
- avic_set_pi_irte_mode(vcpu, activated);
+ __avic_vcpu_put(vcpu, AVIC_DEACTIVATE);
}
void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
@@ -1162,20 +1061,25 @@ void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
if (!kvm_vcpu_apicv_active(vcpu))
return;
- /*
- * Unload the AVIC when the vCPU is about to block, _before_
- * the vCPU actually blocks.
- *
- * Any IRQs that arrive before IsRunning=0 will not cause an
- * incomplete IPI vmexit on the source, therefore vIRR will also
- * be checked by kvm_vcpu_check_block() before blocking. The
- * memory barrier implicit in set_current_state orders writing
- * IsRunning=0 before reading the vIRR. The processor needs a
- * matching memory barrier on interrupt delivery between writing
- * IRR and reading IsRunning; the lack of this barrier might be
- * the cause of errata #1235).
- */
- avic_vcpu_put(vcpu);
+ /*
+ * Unload the AVIC when the vCPU is about to block, _before_ the vCPU
+ * actually blocks.
+ *
+ * Note, any IRQs that arrive before IsRunning=0 will not cause an
+ * incomplete IPI vmexit on the source; kvm_vcpu_check_block() handles
+ * this by checking vIRR one last time before blocking. The memory
+ * barrier implicit in set_current_state orders writing IsRunning=0
+ * before reading the vIRR. The processor needs a matching memory
+ * barrier on interrupt delivery between writing IRR and reading
+ * IsRunning; the lack of this barrier might be the cause of errata #1235).
+ *
+ * Clear IsRunning=0 even if guest IRQs are disabled, i.e. even if KVM
+ * doesn't need to detect events for scheduling purposes. The doorbell
+ * used to signal running vCPUs cannot be blocked, i.e. will perturb the
+ * CPU and cause noisy neighbor problems if the VM is sending interrupts
+ * to the vCPU while it's scheduled out.
+ */
+ __avic_vcpu_put(vcpu, AVIC_START_BLOCKING);
}
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
@@ -1228,6 +1132,14 @@ bool avic_hardware_setup(void)
if (x2avic_enabled)
pr_info("x2AVIC enabled\n");
+ /*
+ * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
+ * due to erratum 1235, which results in missed VM-Exits on the sender
+ * and thus missed wake events for blocking vCPUs due to the CPU
+ * failing to see a software update to clear IsRunning.
+ */
+ enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;
+
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
return true;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 834b67672d50..b7fd2e869998 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -185,12 +185,87 @@ void recalc_intercepts(struct vcpu_svm *svm)
}
/*
+ * This array (and its actual size) holds the set of offsets (indexing by chunk
+ * size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the
+ * set of MSRs for which interception is disabled in vmcb01 is per-vCPU, e.g.
+ * based on CPUID features. This array only tracks MSRs that *might* be passed
+ * through to the guest.
+ *
+ * Hardcode the capacity of the array based on the maximum number of _offsets_.
+ * MSRs are batched together, so there are fewer offsets than MSRs.
+ */
+static int nested_svm_msrpm_merge_offsets[7] __ro_after_init;
+static int nested_svm_nr_msrpm_merge_offsets __ro_after_init;
+typedef unsigned long nsvm_msrpm_merge_t;
+
+int __init nested_svm_init_msrpm_merge_offsets(void)
+{
+ static const u32 merge_msrs[] __initconst = {
+ MSR_STAR,
+ MSR_IA32_SYSENTER_CS,
+ MSR_IA32_SYSENTER_EIP,
+ MSR_IA32_SYSENTER_ESP,
+ #ifdef CONFIG_X86_64
+ MSR_GS_BASE,
+ MSR_FS_BASE,
+ MSR_KERNEL_GS_BASE,
+ MSR_LSTAR,
+ MSR_CSTAR,
+ MSR_SYSCALL_MASK,
+ #endif
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_PRED_CMD,
+ MSR_IA32_FLUSH_CMD,
+ MSR_IA32_APERF,
+ MSR_IA32_MPERF,
+ MSR_IA32_LASTBRANCHFROMIP,
+ MSR_IA32_LASTBRANCHTOIP,
+ MSR_IA32_LASTINTFROMIP,
+ MSR_IA32_LASTINTTOIP,
+ };
+ int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(merge_msrs); i++) {
+ int bit_nr = svm_msrpm_bit_nr(merge_msrs[i]);
+ u32 offset;
+
+ if (WARN_ON(bit_nr < 0))
+ return -EIO;
+
+ /*
+ * Merging is done in chunks to reduce the number of accesses
+ * to L1's bitmap.
+ */
+ offset = bit_nr / BITS_PER_BYTE / sizeof(nsvm_msrpm_merge_t);
+
+ for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) {
+ if (nested_svm_msrpm_merge_offsets[j] == offset)
+ break;
+ }
+
+ if (j < nested_svm_nr_msrpm_merge_offsets)
+ continue;
+
+ if (WARN_ON(j >= ARRAY_SIZE(nested_svm_msrpm_merge_offsets)))
+ return -EIO;
+
+ nested_svm_msrpm_merge_offsets[j] = offset;
+ nested_svm_nr_msrpm_merge_offsets++;
+ }
+
+ return 0;
+}
+
+/*
* Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
* is optimized in that it only merges the parts where KVM MSR permission bitmap
* may contain zero bits.
*/
-static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
+static bool nested_svm_merge_msrpm(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ nsvm_msrpm_merge_t *msrpm02 = svm->nested.msrpm;
+ nsvm_msrpm_merge_t *msrpm01 = svm->msrpm;
int i;
/*
@@ -205,7 +280,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
if (!svm->nested.force_msr_bitmap_recalc) {
struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
- if (kvm_hv_hypercall_enabled(&svm->vcpu) &&
+ if (kvm_hv_hypercall_enabled(vcpu) &&
hve->hv_enlightenments_control.msr_bitmap &&
(svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
goto set_msrpm_base_pa;
@@ -215,25 +290,17 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return true;
- for (i = 0; i < MSRPM_OFFSETS; i++) {
- u32 value, p;
- u64 offset;
-
- if (msrpm_offsets[i] == 0xffffffff)
- break;
-
- p = msrpm_offsets[i];
+ for (i = 0; i < nested_svm_nr_msrpm_merge_offsets; i++) {
+ const int p = nested_svm_msrpm_merge_offsets[i];
+ nsvm_msrpm_merge_t l1_val;
+ gpa_t gpa;
- /* x2apic msrs are intercepted always for the nested guest */
- if (is_x2apic_msrpm_offset(p))
- continue;
-
- offset = svm->nested.ctl.msrpm_base_pa + (p * 4);
+ gpa = svm->nested.ctl.msrpm_base_pa + (p * sizeof(l1_val));
- if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
+ if (kvm_vcpu_read_guest(vcpu, gpa, &l1_val, sizeof(l1_val)))
return false;
- svm->nested.msrpm[p] = svm->msrpm[p] | value;
+ msrpm02[p] = msrpm01[p] | l1_val;
}
svm->nested.force_msr_bitmap_recalc = false;
@@ -678,6 +745,33 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
+ /*
+ * Stash vmcb02's counter if the guest hasn't moved past the guilty
+ * instruction; otherwise, reset the counter to '0'.
+ *
+ * In order to detect if L2 has made forward progress or not, track the
+ * RIP at which a bus lock has occurred on a per-vmcb12 basis. If RIP
+ * is changed, guest has clearly made forward progress, bus_lock_counter
+ * still remained '1', so reset bus_lock_counter to '0'. Eg. In the
+ * scenario, where a buslock happened in L1 before VMRUN, the bus lock
+ * firmly happened on an instruction in the past. Even if vmcb01's
+ * counter is still '1', (because the guilty instruction got patched),
+ * the vCPU has clearly made forward progress and so KVM should reset
+ * vmcb02's counter to '0'.
+ *
+ * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN
+ * to prevent the same guilty instruction from triggering a VM-Exit. Eg.
+ * if userspace rate-limits the vCPU, then it's entirely possible that
+ * L1's tick interrupt is pending by the time userspace re-runs the
+ * vCPU. If KVM unconditionally clears the counter on VMRUN, then when
+ * L1 re-enters L2, the same instruction will trigger a VM-Exit and the
+ * entire cycle start over.
+ */
+ if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip))
+ vmcb02->control.bus_lock_counter = 1;
+ else
+ vmcb02->control.bus_lock_counter = 0;
+
/* Done at vmrun: asid. */
/* Also overwritten later if necessary. */
@@ -910,7 +1004,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
goto out_exit_err;
- if (nested_svm_vmrun_msrpm(svm))
+ if (nested_svm_merge_msrpm(vcpu))
goto out;
out_exit_err:
@@ -1039,8 +1133,17 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
}
+ /*
+ * Invalidate bus_lock_rip unless KVM is still waiting for the guest
+ * to make forward progress before re-enabling bus lock detection.
+ */
+ if (!vmcb02->control.bus_lock_counter)
+ svm->nested.ctl.bus_lock_rip = INVALID_GPA;
+
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
+ kvm_nested_vmexit_handle_ibrs(vcpu);
+
svm_switch_vmcb(svm, &svm->vmcb01);
/*
@@ -1194,7 +1297,6 @@ int svm_allocate_nested(struct vcpu_svm *svm)
svm->nested.msrpm = svm_vcpu_alloc_msrpm();
if (!svm->nested.msrpm)
goto err_free_vmcb02;
- svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
svm->nested.initialized = true;
return 0;
@@ -1254,26 +1356,26 @@ void svm_leave_nested(struct kvm_vcpu *vcpu)
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
- u32 offset, msr, value;
- int write, mask;
+ gpa_t base = svm->nested.ctl.msrpm_base_pa;
+ int write, bit_nr;
+ u8 value, mask;
+ u32 msr;
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return NESTED_EXIT_HOST;
msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
- offset = svm_msrpm_offset(msr);
+ bit_nr = svm_msrpm_bit_nr(msr);
write = svm->vmcb->control.exit_info_1 & 1;
- mask = 1 << ((2 * (msr & 0xf)) + write);
- if (offset == MSR_INVALID)
+ if (bit_nr < 0)
return NESTED_EXIT_DONE;
- /* Offset is in 32 bit units but need in 8 bit units */
- offset *= 4;
-
- if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
+ if (kvm_vcpu_read_guest(&svm->vcpu, base + bit_nr / BITS_PER_BYTE,
+ &value, sizeof(value)))
return NESTED_EXIT_DONE;
+ mask = BIT(write) << (bit_nr & (BITS_PER_BYTE - 1));
return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
@@ -1783,13 +1885,11 @@ out_free:
static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
-
if (WARN_ON(!is_guest_mode(vcpu)))
return true;
if (!vcpu->arch.pdptrs_from_userspace &&
- !nested_npt_enabled(svm) && is_pae_paging(vcpu))
+ !nested_npt_enabled(to_svm(vcpu)) && is_pae_paging(vcpu))
/*
* Reload the guest's PDPTRs since after a migration
* the guest CR3 might be restored prior to setting the nested
@@ -1798,7 +1898,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
return false;
- if (!nested_svm_vmrun_msrpm(svm)) {
+ if (!nested_svm_merge_msrpm(vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_EMULATION;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 1aa0f07d3a63..2fbdebf79fbb 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -117,6 +117,7 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
*/
down_write(&sev_deactivate_lock);
+ /* SNP firmware requires use of WBINVD for ASID recycling. */
wbinvd_on_all_cpus();
if (sev_snp_enabled)
@@ -446,7 +447,12 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
init_args.probe = false;
ret = sev_platform_init(&init_args);
if (ret)
- goto e_free;
+ goto e_free_asid;
+
+ if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto e_free_asid;
+ }
/* This needs to happen after SEV/SNP firmware initialization. */
if (vm_type == KVM_X86_SNP_VM) {
@@ -464,6 +470,8 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
return 0;
e_free:
+ free_cpumask_var(sev->have_run_cpus);
+e_free_asid:
argp->error = init_args.error;
sev_asid_free(sev);
sev->asid = 0;
@@ -561,6 +569,8 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
+ sev->policy = params.policy;
+
memset(&start, 0, sizeof(start));
dh_blob = NULL;
@@ -706,6 +716,33 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
}
}
+static void sev_writeback_caches(struct kvm *kvm)
+{
+ /*
+ * Note, the caller is responsible for ensuring correctness if the mask
+ * can be modified, e.g. if a CPU could be doing VMRUN.
+ */
+ if (cpumask_empty(to_kvm_sev_info(kvm)->have_run_cpus))
+ return;
+
+ /*
+ * Ensure that all dirty guest tagged cache entries are written back
+ * before releasing the pages back to the system for use. CLFLUSH will
+ * not do this without SME_COHERENT, and flushing many cache lines
+ * individually is slower than blasting WBINVD for large VMs, so issue
+ * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported)
+ * on CPUs that have done VMRUN, i.e. may have dirtied data using the
+ * VM's ASID.
+ *
+ * For simplicity, never remove CPUs from the bitmap. Ideally, KVM
+ * would clear the mask when flushing caches, but doing so requires
+ * serializing multiple calls and having responding CPUs (to the IPI)
+ * mark themselves as still running if they are running (or about to
+ * run) a vCPU for the VM.
+ */
+ wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
+}
+
static unsigned long get_num_contig_pages(unsigned long idx,
struct page **inpages, unsigned long npages)
{
@@ -1593,11 +1630,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* allocate memory for header and transport buffer */
ret = -ENOMEM;
- hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
+ hdr = kzalloc(params.hdr_len, GFP_KERNEL);
if (!hdr)
goto e_unpin;
- trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
+ trans_data = kzalloc(params.trans_len, GFP_KERNEL);
if (!trans_data)
goto e_free_hdr;
@@ -1883,70 +1920,6 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
atomic_set_release(&src_sev->migration_in_progress, 0);
}
-/* vCPU mutex subclasses. */
-enum sev_migration_role {
- SEV_MIGRATION_SOURCE = 0,
- SEV_MIGRATION_TARGET,
- SEV_NR_MIGRATION_ROLES,
-};
-
-static int sev_lock_vcpus_for_migration(struct kvm *kvm,
- enum sev_migration_role role)
-{
- struct kvm_vcpu *vcpu;
- unsigned long i, j;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (mutex_lock_killable_nested(&vcpu->mutex, role))
- goto out_unlock;
-
-#ifdef CONFIG_PROVE_LOCKING
- if (!i)
- /*
- * Reset the role to one that avoids colliding with
- * the role used for the first vcpu mutex.
- */
- role = SEV_NR_MIGRATION_ROLES;
- else
- mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
-#endif
- }
-
- return 0;
-
-out_unlock:
-
- kvm_for_each_vcpu(j, vcpu, kvm) {
- if (i == j)
- break;
-
-#ifdef CONFIG_PROVE_LOCKING
- if (j)
- mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
-#endif
-
- mutex_unlock(&vcpu->mutex);
- }
- return -EINTR;
-}
-
-static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
-{
- struct kvm_vcpu *vcpu;
- unsigned long i;
- bool first = true;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (first)
- first = false;
- else
- mutex_acquire(&vcpu->mutex.dep_map,
- SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_);
-
- mutex_unlock(&vcpu->mutex);
- }
-}
-
static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
{
struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm);
@@ -2033,6 +2006,10 @@ static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src)
struct kvm_vcpu *src_vcpu;
unsigned long i;
+ if (src->created_vcpus != atomic_read(&src->online_vcpus) ||
+ dst->created_vcpus != atomic_read(&dst->online_vcpus))
+ return -EBUSY;
+
if (!sev_es_guest(src))
return 0;
@@ -2084,10 +2061,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
charged = true;
}
- ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE);
+ ret = kvm_lock_all_vcpus(kvm);
if (ret)
goto out_dst_cgroup;
- ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET);
+ ret = kvm_lock_all_vcpus(source_kvm);
if (ret)
goto out_dst_vcpu;
@@ -2095,15 +2072,26 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
if (ret)
goto out_source_vcpu;
+ /*
+ * Allocate a new have_run_cpus for the destination, i.e. don't copy
+ * the set of CPUs from the source. If a CPU was used to run a vCPU in
+ * the source VM but is never used for the destination VM, then the CPU
+ * can only have cached memory that was accessible to the source VM.
+ */
+ if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto out_source_vcpu;
+ }
+
sev_migrate_from(kvm, source_kvm);
kvm_vm_dead(source_kvm);
cg_cleanup_sev = src_sev;
ret = 0;
out_source_vcpu:
- sev_unlock_vcpus_for_migration(source_kvm);
+ kvm_unlock_all_vcpus(source_kvm);
out_dst_vcpu:
- sev_unlock_vcpus_for_migration(kvm);
+ kvm_unlock_all_vcpus(kvm);
out_dst_cgroup:
/* Operates on the source on success, on the destination on failure. */
if (charged)
@@ -2193,12 +2181,10 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
return -EINVAL;
/* Check for policy bits that must be set */
- if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) ||
- !(params.policy & SNP_POLICY_MASK_SMT))
+ if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO))
return -EINVAL;
- if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET)
- return -EINVAL;
+ sev->policy = params.policy;
sev->snp_context = snp_context_create(kvm, argp);
if (!sev->snp_context)
@@ -2754,12 +2740,7 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
goto failed;
}
- /*
- * Ensure that all guest tagged cache entries are flushed before
- * releasing the pages back to the system for use. CLFLUSH will
- * not do this, so issue a WBINVD.
- */
- wbinvd_on_all_cpus();
+ sev_writeback_caches(kvm);
__unregister_enc_region_locked(kvm, region);
@@ -2801,13 +2782,18 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
goto e_unlock;
}
+ mirror_sev = to_kvm_sev_info(kvm);
+ if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
+ ret = -ENOMEM;
+ goto e_unlock;
+ }
+
/*
* The mirror kvm holds an enc_context_owner ref so its asid can't
* disappear until we're done with it
*/
source_sev = to_kvm_sev_info(source_kvm);
kvm_get_kvm(source_kvm);
- mirror_sev = to_kvm_sev_info(kvm);
list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
/* Set enc_context_owner and copy its encryption context over */
@@ -2869,7 +2855,13 @@ void sev_vm_destroy(struct kvm *kvm)
WARN_ON(!list_empty(&sev->mirror_vms));
- /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
+ free_cpumask_var(sev->have_run_cpus);
+
+ /*
+ * If this is a mirror VM, remove it from the owner's list of a mirrors
+ * and skip ASID cleanup (the ASID is tied to the lifetime of the owner).
+ * Note, mirror VMs don't support registering encrypted regions.
+ */
if (is_mirroring_enc_context(kvm)) {
struct kvm *owner_kvm = sev->enc_context_owner;
@@ -2880,12 +2872,6 @@ void sev_vm_destroy(struct kvm *kvm)
return;
}
- /*
- * Ensure that all guest tagged cache entries are flushed before
- * releasing the pages back to the system for use. CLFLUSH will
- * not do this, so issue a WBINVD.
- */
- wbinvd_on_all_cpus();
/*
* if userspace was terminated before unregistering the memory regions
@@ -2931,6 +2917,33 @@ void __init sev_set_cpu_caps(void)
}
}
+static bool is_sev_snp_initialized(void)
+{
+ struct sev_user_data_snp_status *status;
+ struct sev_data_snp_addr buf;
+ bool initialized = false;
+ int ret, error = 0;
+
+ status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO);
+ if (!status)
+ return false;
+
+ buf.address = __psp_pa(status);
+ ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error);
+ if (ret) {
+ pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n",
+ ret, error, error);
+ goto out;
+ }
+
+ initialized = !!status->state;
+
+out:
+ snp_free_firmware_page(status);
+
+ return initialized;
+}
+
void __init sev_hardware_setup(void)
{
unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
@@ -3035,6 +3048,14 @@ void __init sev_hardware_setup(void)
sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);
out:
+ if (sev_enabled) {
+ init_args.probe = true;
+ if (sev_platform_init(&init_args))
+ sev_supported = sev_es_supported = sev_snp_supported = false;
+ else if (sev_snp_supported)
+ sev_snp_supported = is_sev_snp_initialized();
+ }
+
if (boot_cpu_has(X86_FEATURE_SEV))
pr_info("SEV %s (ASIDs %u - %u)\n",
sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" :
@@ -3061,15 +3082,6 @@ out:
sev_supported_vmsa_features = 0;
if (sev_es_debug_swap_enabled)
sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
-
- if (!sev_enabled)
- return;
-
- /*
- * Do both SNP and SEV initialization at KVM module load.
- */
- init_args.probe = true;
- sev_platform_init(&init_args);
}
void sev_hardware_unsetup(void)
@@ -3129,30 +3141,29 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
/*
* VM Page Flush takes a host virtual address and a guest ASID. Fall
- * back to WBINVD if this faults so as not to make any problems worse
- * by leaving stale encrypted data in the cache.
+ * back to full writeback of caches if this faults so as not to make
+ * any problems worse by leaving stale encrypted data in the cache.
*/
if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
- goto do_wbinvd;
+ goto do_sev_writeback_caches;
return;
-do_wbinvd:
- wbinvd_on_all_cpus();
+do_sev_writeback_caches:
+ sev_writeback_caches(vcpu->kvm);
}
void sev_guest_memory_reclaimed(struct kvm *kvm)
{
/*
* With SNP+gmem, private/encrypted memory is unreachable via the
- * hva-based mmu notifiers, so these events are only actually
- * pertaining to shared pages where there is no need to perform
- * the WBINVD to flush associated caches.
+ * hva-based mmu notifiers, i.e. these events are explicitly scoped to
+ * shared pages, where there's no need to flush caches.
*/
if (!sev_guest(kvm) || sev_snp_guest(kvm))
return;
- wbinvd_on_all_cpus();
+ sev_writeback_caches(kvm);
}
void sev_free_vcpu(struct kvm_vcpu *vcpu)
@@ -3484,6 +3495,15 @@ int pre_sev_run(struct vcpu_svm *svm, int cpu)
if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa))
return -EINVAL;
+ /*
+ * To optimize cache flushes when memory is reclaimed from an SEV VM,
+ * track physical CPUs that enter the guest for SEV VMs and thus can
+ * have encrypted, dirty data in the cache, and flush caches only for
+ * CPUs that have entered the guest.
+ */
+ if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
+ cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);
+
/* Assign the asid allocated with this SEV guest */
svm->asid = asid;
@@ -3916,9 +3936,9 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
* From this point forward, the VMSA will always be a guest-mapped page
* rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
* theory, svm->sev_es.vmsa could be free'd and cleaned up here, but
- * that involves cleanups like wbinvd_on_all_cpus() which would ideally
- * be handled during teardown rather than guest boot. Deferring that
- * also allows the existing logic for SEV-ES VMSAs to be re-used with
+ * that involves cleanups like flushing caches, which would ideally be
+ * handled during teardown rather than guest boot. Deferring that also
+ * allows the existing logic for SEV-ES VMSAs to be re-used with
* minimal SNP-specific changes.
*/
svm->sev_es.snp_has_guest_vmsa = true;
@@ -4007,10 +4027,8 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
* Unless Creation is deferred until INIT, signal the vCPU to update
* its state.
*/
- if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) {
- kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
- kvm_vcpu_kick(target_vcpu);
- }
+ if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT)
+ kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
return 0;
}
@@ -4422,16 +4440,17 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
count, in);
}
-static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
+void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ /* Clear intercepts on MSRs that are context switched by hardware. */
+ svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW);
- if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
- bool v_tsc_aux = guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);
-
- set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux);
- }
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX))
+ svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID));
/*
* For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
@@ -4445,11 +4464,9 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
* XSAVES being exposed to the guest so that KVM can at least honor
* guest CPUID for RDMSR and WRMSR.
*/
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1);
- else
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) ||
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES));
}
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
@@ -4461,15 +4478,12 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
if (best)
vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
-
- if (sev_es_guest(svm->vcpu.kvm))
- sev_es_vcpu_after_set_cpuid(svm);
}
static void sev_es_init_vmcb(struct vcpu_svm *svm)
{
+ struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
struct vmcb *vmcb = svm->vmcb01.ptr;
- struct kvm_vcpu *vcpu = &svm->vcpu;
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
@@ -4480,8 +4494,16 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
* the VMSA will be NULL if this vCPU is the destination for intrahost
* migration, and will be copied later.
*/
- if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa)
- svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
+ if (!svm->sev_es.snp_has_guest_vmsa) {
+ if (svm->sev_es.vmsa)
+ svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
+ else
+ svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+ }
+
+ if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES))
+ svm->vmcb->control.allowed_sev_features = sev->vmsa_features |
+ VMCB_ALLOWED_SEV_FEATURES_VALID;
/* Can't intercept CR register access, HV can't modify CR registers */
svm_clr_intercept(svm, INTERCEPT_CR0_READ);
@@ -4519,10 +4541,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
/* Can't intercept XSETBV, HV can't modify XCR0 directly */
svm_clr_intercept(svm, INTERCEPT_XSETBV);
-
- /* Clear intercepts on selected MSRs */
- set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1);
}
void sev_init_vmcb(struct vcpu_svm *svm)
@@ -4911,7 +4929,7 @@ void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
/*
* SEV-ES avoids host/guest cache coherency issues through
- * WBINVD hooks issued via MMU notifiers during run-time, and
+ * WBNOINVD hooks issued via MMU notifiers during run-time, and
* KVM's VM destroy path at shutdown. Those MMU notifier events
* don't cover gmem since there is no requirement to map pages
* to a HVA in order to use them for a running guest. While the
@@ -4943,3 +4961,97 @@ int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
return level;
}
+
+struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_save_area *vmsa;
+ struct kvm_sev_info *sev;
+ int error = 0;
+ int ret;
+
+ if (!sev_es_guest(vcpu->kvm))
+ return NULL;
+
+ /*
+ * If the VMSA has not yet been encrypted, return a pointer to the
+ * current un-encrypted VMSA.
+ */
+ if (!vcpu->arch.guest_state_protected)
+ return (struct vmcb_save_area *)svm->sev_es.vmsa;
+
+ sev = to_kvm_sev_info(vcpu->kvm);
+
+ /* Check if the SEV policy allows debugging */
+ if (sev_snp_guest(vcpu->kvm)) {
+ if (!(sev->policy & SNP_POLICY_DEBUG))
+ return NULL;
+ } else {
+ if (sev->policy & SEV_POLICY_NODBG)
+ return NULL;
+ }
+
+ if (sev_snp_guest(vcpu->kvm)) {
+ struct sev_data_snp_dbg dbg = {0};
+
+ vmsa = snp_alloc_firmware_page(__GFP_ZERO);
+ if (!vmsa)
+ return NULL;
+
+ dbg.gctx_paddr = __psp_pa(sev->snp_context);
+ dbg.src_addr = svm->vmcb->control.vmsa_pa;
+ dbg.dst_addr = __psp_pa(vmsa);
+
+ ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error);
+
+ /*
+ * Return the target page to a hypervisor page no matter what.
+ * If this fails, the page can't be used, so leak it and don't
+ * try to use it.
+ */
+ if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa))))
+ return NULL;
+
+ if (ret) {
+ pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n",
+ ret, error, error);
+ free_page((unsigned long)vmsa);
+
+ return NULL;
+ }
+ } else {
+ struct sev_data_dbg dbg = {0};
+ struct page *vmsa_page;
+
+ vmsa_page = alloc_page(GFP_KERNEL);
+ if (!vmsa_page)
+ return NULL;
+
+ vmsa = page_address(vmsa_page);
+
+ dbg.handle = sev->handle;
+ dbg.src_addr = svm->vmcb->control.vmsa_pa;
+ dbg.dst_addr = __psp_pa(vmsa);
+ dbg.len = PAGE_SIZE;
+
+ ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error);
+ if (ret) {
+ pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n",
+ ret, error, error);
+ __free_page(vmsa_page);
+
+ return NULL;
+ }
+ }
+
+ return vmsa;
+}
+
+void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa)
+{
+ /* If the VMSA has not yet been encrypted, nothing was allocated */
+ if (!vcpu->arch.guest_state_protected || !vmsa)
+ return;
+
+ free_page((unsigned long)vmsa);
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index ffb34dadff1c..d9931c6c4bc6 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -29,6 +29,7 @@
#include <linux/cc_platform.h>
#include <linux/smp.h>
#include <linux/string_choices.h>
+#include <linux/mutex.h>
#include <asm/apic.h>
#include <asm/msr.h>
@@ -71,8 +72,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
static bool erratum_383_found __read_mostly;
-u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
-
/*
* Set osvw_len to higher value when updated Revision Guides
* are published and we know what the new status bits are
@@ -81,72 +80,6 @@ static uint64_t osvw_len = 4, osvw_status;
static DEFINE_PER_CPU(u64, current_tsc_ratio);
-#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4))
-
-static const struct svm_direct_access_msrs {
- u32 index; /* Index of the MSR */
- bool always; /* True if intercept is initially cleared */
-} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
- { .index = MSR_STAR, .always = true },
- { .index = MSR_IA32_SYSENTER_CS, .always = true },
- { .index = MSR_IA32_SYSENTER_EIP, .always = false },
- { .index = MSR_IA32_SYSENTER_ESP, .always = false },
-#ifdef CONFIG_X86_64
- { .index = MSR_GS_BASE, .always = true },
- { .index = MSR_FS_BASE, .always = true },
- { .index = MSR_KERNEL_GS_BASE, .always = true },
- { .index = MSR_LSTAR, .always = true },
- { .index = MSR_CSTAR, .always = true },
- { .index = MSR_SYSCALL_MASK, .always = true },
-#endif
- { .index = MSR_IA32_SPEC_CTRL, .always = false },
- { .index = MSR_IA32_PRED_CMD, .always = false },
- { .index = MSR_IA32_FLUSH_CMD, .always = false },
- { .index = MSR_IA32_DEBUGCTLMSR, .always = false },
- { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
- { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
- { .index = MSR_IA32_LASTINTFROMIP, .always = false },
- { .index = MSR_IA32_LASTINTTOIP, .always = false },
- { .index = MSR_IA32_XSS, .always = false },
- { .index = MSR_EFER, .always = false },
- { .index = MSR_IA32_CR_PAT, .always = false },
- { .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
- { .index = MSR_TSC_AUX, .always = false },
- { .index = X2APIC_MSR(APIC_ID), .always = false },
- { .index = X2APIC_MSR(APIC_LVR), .always = false },
- { .index = X2APIC_MSR(APIC_TASKPRI), .always = false },
- { .index = X2APIC_MSR(APIC_ARBPRI), .always = false },
- { .index = X2APIC_MSR(APIC_PROCPRI), .always = false },
- { .index = X2APIC_MSR(APIC_EOI), .always = false },
- { .index = X2APIC_MSR(APIC_RRR), .always = false },
- { .index = X2APIC_MSR(APIC_LDR), .always = false },
- { .index = X2APIC_MSR(APIC_DFR), .always = false },
- { .index = X2APIC_MSR(APIC_SPIV), .always = false },
- { .index = X2APIC_MSR(APIC_ISR), .always = false },
- { .index = X2APIC_MSR(APIC_TMR), .always = false },
- { .index = X2APIC_MSR(APIC_IRR), .always = false },
- { .index = X2APIC_MSR(APIC_ESR), .always = false },
- { .index = X2APIC_MSR(APIC_ICR), .always = false },
- { .index = X2APIC_MSR(APIC_ICR2), .always = false },
-
- /*
- * Note:
- * AMD does not virtualize APIC TSC-deadline timer mode, but it is
- * emulated by KVM. When setting APIC LVTT (0x832) register bit 18,
- * the AVIC hardware would generate GP fault. Therefore, always
- * intercept the MSR 0x832, and do not setup direct_access_msr.
- */
- { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false },
- { .index = X2APIC_MSR(APIC_LVTPC), .always = false },
- { .index = X2APIC_MSR(APIC_LVT0), .always = false },
- { .index = X2APIC_MSR(APIC_LVT1), .always = false },
- { .index = X2APIC_MSR(APIC_LVTERR), .always = false },
- { .index = X2APIC_MSR(APIC_TMICT), .always = false },
- { .index = X2APIC_MSR(APIC_TMCCT), .always = false },
- { .index = X2APIC_MSR(APIC_TDCR), .always = false },
- { .index = MSR_INVALID, .always = false },
-};
-
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* pause_filter_count: On processors that support Pause filtering(indicated
@@ -231,6 +164,9 @@ module_param(tsc_scaling, int, 0444);
*/
static bool avic;
module_param(avic, bool, 0444);
+module_param(enable_ipiv, bool, 0444);
+
+module_param(enable_device_posted_irqs, bool, 0444);
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);
@@ -250,6 +186,8 @@ static unsigned long iopm_base;
DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
+static DEFINE_MUTEX(vmcb_dump_mutex);
+
/*
* Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
* the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
@@ -259,33 +197,6 @@ DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
*/
static int tsc_aux_uret_slot __read_mostly = -1;
-static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
-
-#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
-#define MSRS_RANGE_SIZE 2048
-#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
-
-u32 svm_msrpm_offset(u32 msr)
-{
- u32 offset;
- int i;
-
- for (i = 0; i < NUM_MSR_MAPS; i++) {
- if (msr < msrpm_ranges[i] ||
- msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
- continue;
-
- offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
- offset += (i * MSRS_RANGE_SIZE); /* add range offset */
-
- /* Now we have the u8 offset - but need the u32 offset */
- return offset / 4;
- }
-
- /* MSR not in any range */
- return MSR_INVALID;
-}
-
static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
@@ -752,50 +663,8 @@ static void clr_dr_intercepts(struct vcpu_svm *svm)
recalc_intercepts(svm);
}
-static int direct_access_msr_slot(u32 msr)
-{
- u32 i;
-
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
- if (direct_access_msrs[i].index == msr)
- return i;
-
- return -ENOENT;
-}
-
-static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
- int write)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- int slot = direct_access_msr_slot(msr);
-
- if (slot == -ENOENT)
- return;
-
- /* Set the shadow bitmaps to the desired intercept states */
- if (read)
- set_bit(slot, svm->shadow_msr_intercept.read);
- else
- clear_bit(slot, svm->shadow_msr_intercept.read);
-
- if (write)
- set_bit(slot, svm->shadow_msr_intercept.write);
- else
- clear_bit(slot, svm->shadow_msr_intercept.write);
-}
-
-static bool valid_msr_intercept(u32 index)
-{
- return direct_access_msr_slot(index) != -ENOENT;
-}
-
static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
- u8 bit_write;
- unsigned long tmp;
- u32 offset;
- u32 *msrpm;
-
/*
* For non-nested case:
* If the L01 MSR bitmap does not intercept the MSR, then we need to
@@ -805,90 +674,102 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
* If the L02 MSR bitmap does not intercept the MSR, then we need to
* save it.
*/
- msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
- to_svm(vcpu)->msrpm;
-
- offset = svm_msrpm_offset(msr);
- bit_write = 2 * (msr & 0x0f) + 1;
- tmp = msrpm[offset];
+ void *msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
+ to_svm(vcpu)->msrpm;
- BUG_ON(offset == MSR_INVALID);
-
- return test_bit(bit_write, &tmp);
+ return svm_test_msr_bitmap_write(msrpm, msr);
}
-static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
- u32 msr, int read, int write)
+void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u8 bit_read, bit_write;
- unsigned long tmp;
- u32 offset;
-
- /*
- * If this warning triggers extend the direct_access_msrs list at the
- * beginning of the file
- */
- WARN_ON(!valid_msr_intercept(msr));
-
- /* Enforce non allowed MSRs to trap */
- if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
- read = 0;
+ void *msrpm = svm->msrpm;
- if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
- write = 0;
-
- offset = svm_msrpm_offset(msr);
- bit_read = 2 * (msr & 0x0f);
- bit_write = 2 * (msr & 0x0f) + 1;
- tmp = msrpm[offset];
-
- BUG_ON(offset == MSR_INVALID);
-
- read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
- write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+ /* Don't disable interception for MSRs userspace wants to handle. */
+ if (type & MSR_TYPE_R) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ svm_clear_msr_bitmap_read(msrpm, msr);
+ else
+ svm_set_msr_bitmap_read(msrpm, msr);
+ }
- msrpm[offset] = tmp;
+ if (type & MSR_TYPE_W) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ svm_clear_msr_bitmap_write(msrpm, msr);
+ else
+ svm_set_msr_bitmap_write(msrpm, msr);
+ }
svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
svm->nested.force_msr_bitmap_recalc = true;
}
-void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
- int read, int write)
-{
- set_shadow_msr_intercept(vcpu, msr, read, write);
- set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
-}
-
-u32 *svm_vcpu_alloc_msrpm(void)
+void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask)
{
- unsigned int order = get_order(MSRPM_SIZE);
- struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
- u32 *msrpm;
+ unsigned int order = get_order(size);
+ struct page *pages = alloc_pages(gfp_mask, order);
+ void *pm;
if (!pages)
return NULL;
- msrpm = page_address(pages);
- memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
+ /*
+ * Set all bits in the permissions map so that all MSR and I/O accesses
+ * are intercepted by default.
+ */
+ pm = page_address(pages);
+ memset(pm, 0xff, PAGE_SIZE * (1 << order));
- return msrpm;
+ return pm;
}
-void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
+static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
{
- int i;
+ bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- if (!direct_access_msrs[i].always)
- continue;
- set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
- }
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTFROMIP, MSR_TYPE_RW, intercept);
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTINTTOIP, MSR_TYPE_RW, intercept);
+
+ if (sev_es_guest(vcpu->kvm))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept);
}
void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
{
+ static const u32 x2avic_passthrough_msrs[] = {
+ X2APIC_MSR(APIC_ID),
+ X2APIC_MSR(APIC_LVR),
+ X2APIC_MSR(APIC_TASKPRI),
+ X2APIC_MSR(APIC_ARBPRI),
+ X2APIC_MSR(APIC_PROCPRI),
+ X2APIC_MSR(APIC_EOI),
+ X2APIC_MSR(APIC_RRR),
+ X2APIC_MSR(APIC_LDR),
+ X2APIC_MSR(APIC_DFR),
+ X2APIC_MSR(APIC_SPIV),
+ X2APIC_MSR(APIC_ISR),
+ X2APIC_MSR(APIC_TMR),
+ X2APIC_MSR(APIC_IRR),
+ X2APIC_MSR(APIC_ESR),
+ X2APIC_MSR(APIC_ICR),
+ X2APIC_MSR(APIC_ICR2),
+
+ /*
+ * Note! Always intercept LVTT, as TSC-deadline timer mode
+ * isn't virtualized by hardware, and the CPU will generate a
+ * #GP instead of a #VMEXIT.
+ */
+ X2APIC_MSR(APIC_LVTTHMR),
+ X2APIC_MSR(APIC_LVTPC),
+ X2APIC_MSR(APIC_LVT0),
+ X2APIC_MSR(APIC_LVT1),
+ X2APIC_MSR(APIC_LVTERR),
+ X2APIC_MSR(APIC_TMICT),
+ X2APIC_MSR(APIC_TMCCT),
+ X2APIC_MSR(APIC_TDCR),
+ };
int i;
if (intercept == svm->x2avic_msrs_intercepted)
@@ -897,84 +778,79 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
if (!x2avic_enabled)
return;
- for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
- int index = direct_access_msrs[i].index;
-
- if ((index < APIC_BASE_MSR) ||
- (index > APIC_BASE_MSR + 0xff))
- continue;
- set_msr_interception(&svm->vcpu, svm->msrpm, index,
- !intercept, !intercept);
- }
+ for (i = 0; i < ARRAY_SIZE(x2avic_passthrough_msrs); i++)
+ svm_set_intercept_for_msr(&svm->vcpu, x2avic_passthrough_msrs[i],
+ MSR_TYPE_RW, intercept);
svm->x2avic_msrs_intercepted = intercept;
}
-void svm_vcpu_free_msrpm(u32 *msrpm)
+void svm_vcpu_free_msrpm(void *msrpm)
{
__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}
-static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
+static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u32 i;
- /*
- * Set intercept permissions for all direct access MSRs again. They
- * will automatically get filtered through the MSR filter, so we are
- * back in sync after this.
- */
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- u32 msr = direct_access_msrs[i].index;
- u32 read = test_bit(i, svm->shadow_msr_intercept.read);
- u32 write = test_bit(i, svm->shadow_msr_intercept.write);
+ svm_disable_intercept_for_msr(vcpu, MSR_STAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
- set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
- }
-}
-
-static void add_msr_offset(u32 offset)
-{
- int i;
-
- for (i = 0; i < MSRPM_OFFSETS; ++i) {
-
- /* Offset already in list? */
- if (msrpm_offsets[i] == offset)
- return;
+#ifdef CONFIG_X86_64
+ svm_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_LSTAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_CSTAR, MSR_TYPE_RW);
+ svm_disable_intercept_for_msr(vcpu, MSR_SYSCALL_MASK, MSR_TYPE_RW);
+#endif
- /* Slot used by another offset? */
- if (msrpm_offsets[i] != MSR_INVALID)
- continue;
+ if (lbrv)
+ svm_recalc_lbr_msr_intercepts(vcpu);
- /* Add offset to list */
- msrpm_offsets[i] = offset;
+ if (cpu_feature_enabled(X86_FEATURE_IBPB))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
- return;
- }
+ if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
/*
- * If this BUG triggers the msrpm_offsets table has an overflow. Just
- * increase MSRPM_OFFSETS in this case.
+ * Disable interception of SPEC_CTRL if KVM doesn't need to manually
+ * context switch the MSR (SPEC_CTRL is virtualized by the CPU), or if
+ * the guest has a non-zero SPEC_CTRL value, i.e. is likely actively
+ * using SPEC_CTRL.
*/
- BUG();
-}
-
-static void init_msrpm_offsets(void)
-{
- int i;
-
- memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+ if (cpu_feature_enabled(X86_FEATURE_V_SPEC_CTRL))
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !guest_has_spec_ctrl_msr(vcpu));
+ else
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !svm->spec_ctrl);
- for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
- u32 offset;
+ /*
+ * Intercept SYSENTER_EIP and SYSENTER_ESP when emulating an Intel CPU,
+ * as AMD hardware only store 32 bits, whereas Intel CPUs track 64 bits.
+ */
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW,
+ guest_cpuid_is_intel_compatible(vcpu));
+ svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW,
+ guest_cpuid_is_intel_compatible(vcpu));
+
+ if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
+ }
- offset = svm_msrpm_offset(direct_access_msrs[i].index);
- BUG_ON(offset == MSR_INVALID);
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_recalc_msr_intercepts(vcpu);
- add_msr_offset(offset);
- }
+ /*
+ * x2APIC intercepts are modified on-demand and cannot be filtered by
+ * userspace.
+ */
}
void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
@@ -993,13 +869,7 @@ void svm_enable_lbrv(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
-
- if (sev_es_guest(vcpu->kvm))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1);
+ svm_recalc_lbr_msr_intercepts(vcpu);
/* Move the LBR msrs to the vmcb02 so that the guest can see them. */
if (is_guest_mode(vcpu))
@@ -1011,12 +881,8 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
-
svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+ svm_recalc_lbr_msr_intercepts(vcpu);
/*
* Move the LBR msrs back to the vmcb01 to avoid copying them
@@ -1171,9 +1037,10 @@ void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
}
/* Evaluate instruction intercepts that depend on guest CPUID features. */
-static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
- struct vcpu_svm *svm)
+static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
/*
* Intercept INVPCID if shadow paging is enabled to sync/free shadow
* roots, or if INVPCID is disabled in the guest to inject #UD.
@@ -1192,24 +1059,11 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
else
svm_set_intercept(svm, INTERCEPT_RDTSCP);
}
-}
-
-static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
if (guest_cpuid_is_intel_compatible(vcpu)) {
- /*
- * We must intercept SYSENTER_EIP and SYSENTER_ESP
- * accesses because the processor only stores 32 bits.
- * For the same reason we cannot use virtual VMLOAD/VMSAVE.
- */
svm_set_intercept(svm, INTERCEPT_VMLOAD);
svm_set_intercept(svm, INTERCEPT_VMSAVE);
svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
-
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
} else {
/*
* If hardware supports Virtual VMLOAD VMSAVE then enable it
@@ -1220,12 +1074,15 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
svm_clr_intercept(svm, INTERCEPT_VMSAVE);
svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
}
- /* No need to intercept these MSRs */
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
}
}
+static void svm_recalc_intercepts_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ svm_recalc_instruction_intercepts(vcpu);
+ svm_recalc_msr_intercepts(vcpu);
+}
+
static void init_vmcb(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1348,15 +1205,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm_clr_intercept(svm, INTERCEPT_PAUSE);
}
- svm_recalc_instruction_intercepts(vcpu, svm);
-
- /*
- * If the host supports V_SPEC_CTRL then disable the interception
- * of MSR_IA32_SPEC_CTRL.
- */
- if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
-
if (kvm_vcpu_apicv_active(vcpu))
avic_init_vmcb(svm, vmcb);
@@ -1369,11 +1217,15 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
}
+ if (vcpu->kvm->arch.bus_lock_detection_enabled)
+ svm_set_intercept(svm, INTERCEPT_BUSLOCK);
+
if (sev_guest(vcpu->kvm))
sev_init_vmcb(svm);
svm_hv_init_vmcb(vmcb);
- init_vmcb_after_set_cpuid(vcpu);
+
+ svm_recalc_intercepts_after_set_cpuid(vcpu);
vmcb_mark_all_dirty(vmcb);
@@ -1384,8 +1236,6 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm_vcpu_init_msrpm(vcpu, svm->msrpm);
-
svm_init_osvw(vcpu);
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
@@ -1478,24 +1328,11 @@ out:
return err;
}
-static void svm_clear_current_vmcb(struct vmcb *vmcb)
-{
- int i;
-
- for_each_online_cpu(i)
- cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
-}
-
static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- /*
- * The vmcb page can be recycled, causing a false negative in
- * svm_vcpu_load(). So, ensure that no logical CPU has this
- * vmcb page recorded as its current vmcb.
- */
- svm_clear_current_vmcb(svm->vmcb);
+ WARN_ON_ONCE(!list_empty(&svm->ir_list));
svm_leave_nested(vcpu);
svm_free_nested(svm);
@@ -1503,7 +1340,7 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
sev_free_vcpu(vcpu);
__free_page(__sme_pa_to_page(svm->vmcb01.pa));
- __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
+ svm_vcpu_free_msrpm(svm->msrpm);
}
#ifdef CONFIG_CPU_MITIGATIONS
@@ -1610,19 +1447,9 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
-
if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
- if (sd->current_vmcb != svm->vmcb) {
- sd->current_vmcb = svm->vmcb;
-
- if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT) &&
- static_branch_likely(&switch_vcpu_ibpb))
- indirect_branch_prediction_barrier();
- }
if (kvm_vcpu_apicv_active(vcpu))
avic_vcpu_load(vcpu, cpu);
}
@@ -2897,12 +2724,11 @@ static int svm_get_feature_msr(u32 msr, u64 *data)
return 0;
}
-static bool
-sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info)
{
return sev_es_guest(vcpu->kvm) &&
vcpu->arch.guest_state_protected &&
- svm_msrpm_offset(msr_info->index) != MSR_INVALID &&
!msr_write_intercepted(vcpu, msr_info->index);
}
@@ -3133,11 +2959,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
- * nested_svm_vmrun_msrpm.
+ * nested_svm_merge_msrpm().
* We update the L1 MSR bit as well since it will end up
* touching the MSR anyway now.
*/
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+ svm_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr->host_initiated &&
@@ -3203,8 +3029,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
/*
* TSC_AUX is usually changed only during boot and never read
- * directly. Intercept TSC_AUX instead of exposing it to the
- * guest via direct_access_msrs, and switch it via user return.
+ * directly. Intercept TSC_AUX and switch it via user return.
*/
preempt_disable();
ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
@@ -3221,17 +3046,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
}
/*
- * AMD changed the architectural behavior of bits 5:2. On CPUs
- * without BusLockTrap, bits 5:2 control "external pins", but
- * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap
- * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed
- * the guest to set bits 5:2 despite not actually virtualizing
- * Performance-Monitoring/Breakpoint external pins. Drop bits
- * 5:2 for backwards compatibility.
- */
- data &= ~GENMASK(5, 2);
-
- /*
* Suppress BTF as KVM doesn't virtualize BTF, but there's no
* way to communicate lack of support to the guest.
*/
@@ -3361,6 +3175,37 @@ static int invpcid_interception(struct kvm_vcpu *vcpu)
return kvm_handle_invpcid(vcpu, type, gva);
}
+static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If userspace has NOT changed RIP, then KVM's ABI is to let the guest
+ * execute the bus-locking instruction. Set the bus lock counter to '1'
+ * to effectively step past the bus lock.
+ */
+ if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))
+ svm->vmcb->control.bus_lock_counter = 1;
+
+ return 1;
+}
+
+static int bus_lock_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+ vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_userspace_buslock;
+
+ if (is_guest_mode(vcpu))
+ svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip;
+
+ return 0;
+}
+
static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
@@ -3430,6 +3275,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_INVPCID] = invpcid_interception,
[SVM_EXIT_IDLE_HLT] = kvm_emulate_halt,
[SVM_EXIT_NPF] = npf_interception,
+ [SVM_EXIT_BUS_LOCK] = bus_lock_exit,
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
@@ -3444,14 +3290,21 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
+ char *vm_type;
if (!dump_invalid_vmcb) {
pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
return;
}
- pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
- svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
+ guard(mutex)(&vmcb_dump_mutex);
+
+ vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" :
+ sev_es_guest(vcpu->kvm) ? "SEV-ES" :
+ sev_guest(vcpu->kvm) ? "SEV" : "SVM";
+
+ pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n",
+ vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
pr_err("VMCB Control Area:\n");
pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
@@ -3489,6 +3342,17 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
+ pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features);
+ pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features);
+
+ if (sev_es_guest(vcpu->kvm)) {
+ save = sev_decrypt_vmsa(vcpu);
+ if (!save)
+ goto no_vmsa;
+
+ save01 = save;
+ }
+
pr_err("VMCB State Save Area:\n");
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"es:",
@@ -3559,6 +3423,63 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-15s %016llx %-13s %016llx\n",
"excp_from:", save->last_excp_from,
"excp_to:", save->last_excp_to);
+
+ if (sev_es_guest(vcpu->kvm)) {
+ struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save;
+
+ pr_err("%-15s %016llx\n",
+ "sev_features", vmsa->sev_features);
+
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rax:", vmsa->rax, "rbx:", vmsa->rbx);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rcx:", vmsa->rcx, "rdx:", vmsa->rdx);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rsi:", vmsa->rsi, "rdi:", vmsa->rdi);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rbp:", vmsa->rbp, "rsp:", vmsa->rsp);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r8:", vmsa->r8, "r9:", vmsa->r9);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r10:", vmsa->r10, "r11:", vmsa->r11);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r12:", vmsa->r12, "r13:", vmsa->r13);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r14:", vmsa->r14, "r15:", vmsa->r15);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "xcr0:", vmsa->xcr0, "xss:", vmsa->xss);
+ } else {
+ pr_err("%-15s %016llx %-13s %016lx\n",
+ "rax:", save->rax, "rbx:",
+ vcpu->arch.regs[VCPU_REGS_RBX]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "rcx:", vcpu->arch.regs[VCPU_REGS_RCX],
+ "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "rsi:", vcpu->arch.regs[VCPU_REGS_RSI],
+ "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]);
+ pr_err("%-15s %016lx %-13s %016llx\n",
+ "rbp:", vcpu->arch.regs[VCPU_REGS_RBP],
+ "rsp:", save->rsp);
+#ifdef CONFIG_X86_64
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r8:", vcpu->arch.regs[VCPU_REGS_R8],
+ "r9:", vcpu->arch.regs[VCPU_REGS_R9]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r10:", vcpu->arch.regs[VCPU_REGS_R10],
+ "r11:", vcpu->arch.regs[VCPU_REGS_R11]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r12:", vcpu->arch.regs[VCPU_REGS_R12],
+ "r13:", vcpu->arch.regs[VCPU_REGS_R13]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r14:", vcpu->arch.regs[VCPU_REGS_R14],
+ "r15:", vcpu->arch.regs[VCPU_REGS_R15]);
+#endif
+ }
+
+no_vmsa:
+ if (sev_es_guest(vcpu->kvm))
+ sev_free_decrypted_vmsa(vcpu, save);
}
static bool svm_check_exit_valid(u64 exit_code)
@@ -3595,6 +3516,10 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
return kvm_emulate_halt(vcpu);
else if (exit_code == SVM_EXIT_NPF)
return npf_interception(vcpu);
+#ifdef CONFIG_KVM_AMD_SEV
+ else if (exit_code == SVM_EXIT_VMGEXIT)
+ return sev_handle_vmgexit(vcpu);
+#endif
#endif
return svm_exit_handlers[exit_code](vcpu);
}
@@ -4306,9 +4231,9 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
guest_state_exit_irqoff();
}
-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
- bool force_immediate_exit)
+static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
+ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
struct vcpu_svm *svm = to_svm(vcpu);
bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
@@ -4355,10 +4280,13 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
svm_hv_update_vp_id(svm->vmcb, vcpu);
/*
- * Run with all-zero DR6 unless needed, so that we can get the exact cause
- * of a #DB.
+ * Run with all-zero DR6 unless the guest can write DR6 freely, so that
+ * KVM can get the exact cause of a #DB. Note, loading guest DR6 from
+ * KVM's snapshot is only necessary when DR accesses won't exit.
*/
- if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ if (unlikely(run_flags & KVM_RUN_LOAD_GUEST_DR6))
+ svm_set_dr6(vcpu, vcpu->arch.dr6);
+ else if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
clgi();
@@ -4538,20 +4466,10 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (guest_cpuid_is_intel_compatible(vcpu))
guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
- svm_recalc_instruction_intercepts(vcpu, svm);
-
- if (boot_cpu_has(X86_FEATURE_IBPB))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
- !!guest_has_pred_cmd_msr(vcpu));
-
- if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
- !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
-
if (sev_guest(vcpu->kvm))
sev_vcpu_after_set_cpuid(svm);
- init_vmcb_after_set_cpuid(vcpu);
+ svm_recalc_intercepts_after_set_cpuid(vcpu);
}
static bool svm_has_wbinvd_exit(void)
@@ -5102,7 +5020,7 @@ static int svm_vm_init(struct kvm *kvm)
}
if (!pause_filter_count || !pause_filter_thresh)
- kvm->arch.pause_in_guest = true;
+ kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
if (enable_apicv) {
int ret = avic_vm_init(kvm);
@@ -5169,7 +5087,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
- .set_dr6 = svm_set_dr6,
.set_dr7 = svm_set_dr7,
.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
.cache_reg = svm_cache_reg,
@@ -5254,7 +5171,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
- .msr_filter_changed = svm_msr_filter_changed,
+ .recalc_msr_intercepts = svm_recalc_msr_intercepts,
.complete_emulated_msr = svm_complete_emulated_msr,
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
@@ -5356,6 +5273,9 @@ static __init void svm_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
+ if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD))
+ kvm_caps.has_bus_lock_exit = true;
+
/* CPUID 0x80000008 */
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
@@ -5387,11 +5307,8 @@ static __init void svm_set_cpu_caps(void)
static __init int svm_hardware_setup(void)
{
- int cpu;
- struct page *iopm_pages;
void *iopm_va;
- int r;
- unsigned int order = get_order(IOPM_SIZE);
+ int cpu, r;
/*
* NX is required for shadow paging and for NPT if the NX huge pages
@@ -5403,17 +5320,6 @@ static __init int svm_hardware_setup(void)
}
kvm_enable_efer_bits(EFER_NX);
- iopm_pages = alloc_pages(GFP_KERNEL, order);
-
- if (!iopm_pages)
- return -ENOMEM;
-
- iopm_va = page_address(iopm_pages);
- memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
- iopm_base = __sme_page_pa(iopm_pages);
-
- init_msrpm_offsets();
-
kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
XFEATURE_MASK_BNDCSR);
@@ -5447,6 +5353,10 @@ static __init int svm_hardware_setup(void)
if (nested) {
pr_info("Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+
+ r = nested_svm_init_msrpm_merge_offsets();
+ if (r)
+ return r;
}
/*
@@ -5478,6 +5388,13 @@ static __init int svm_hardware_setup(void)
else
pr_info("LBR virtualization supported\n");
}
+
+ iopm_va = svm_alloc_permissions_map(IOPM_SIZE, GFP_KERNEL);
+ if (!iopm_va)
+ return -ENOMEM;
+
+ iopm_base = __sme_set(__pa(iopm_va));
+
/*
* Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
* may be modified by svm_adjust_mmio_mask()), as well as nrips.
@@ -5495,6 +5412,7 @@ static __init int svm_hardware_setup(void)
enable_apicv = avic = avic && avic_hardware_setup();
if (!enable_apicv) {
+ enable_ipiv = false;
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
@@ -5576,6 +5494,8 @@ static int __init svm_init(void)
{
int r;
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm);
+
__unused_size_checks();
if (!kvm_is_svm_supported())
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index f16b068c4228..58b9d168e0c8 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -44,9 +44,6 @@ static inline struct page *__sme_pa_to_page(unsigned long pa)
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2
-#define MAX_DIRECT_ACCESS_MSRS 48
-#define MSRPM_OFFSETS 32
-extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
extern int nrips;
extern int vgif;
@@ -98,6 +95,7 @@ struct kvm_sev_info {
unsigned int asid; /* ASID used for this guest */
unsigned int handle; /* SEV firmware handle */
int fd; /* SEV device fd */
+ unsigned long policy;
unsigned long pages_locked; /* Number of pages locked */
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
@@ -112,15 +110,19 @@ struct kvm_sev_info {
void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */
void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */
struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */
+ cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */
};
+#define SEV_POLICY_NODBG BIT_ULL(0)
+#define SNP_POLICY_DEBUG BIT_ULL(19)
+
struct kvm_svm {
struct kvm kvm;
/* Struct members for AVIC */
u32 avic_vm_id;
- struct page *avic_logical_id_table_page;
- struct page *avic_physical_id_table_page;
+ u32 *avic_logical_id_table;
+ u64 *avic_physical_id_table;
struct hlist_node hnode;
struct kvm_sev_info sev_info;
@@ -169,6 +171,7 @@ struct vmcb_ctrl_area_cached {
u64 nested_cr3;
u64 virt_ext;
u32 clean;
+ u64 bus_lock_rip;
union {
#if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV)
struct hv_vmcb_enlightenments hv_enlightenments;
@@ -184,8 +187,11 @@ struct svm_nested_state {
u64 vmcb12_gpa;
u64 last_vmcb12_gpa;
- /* These are the merged vectors */
- u32 *msrpm;
+ /*
+ * The MSR permissions map used for vmcb02, which is the merge result
+ * of vmcb01 and vmcb12
+ */
+ void *msrpm;
/* A VMRUN has started but has not yet been performed, so
* we cannot inject a nested vmexit yet. */
@@ -266,7 +272,7 @@ struct vcpu_svm {
*/
u64 virt_spec_ctrl;
- u32 *msrpm;
+ void *msrpm;
ulong nmi_iret_rip;
@@ -301,24 +307,26 @@ struct vcpu_svm {
u32 ldr_reg;
u32 dfr_reg;
- struct page *avic_backing_page;
- u64 *avic_physical_id_cache;
+
+ /* This is essentially a shadow of the vCPU's actual entry in the
+ * Physical ID table that is programmed into the VMCB, i.e. that is
+ * seen by the CPU. If IPI virtualization is disabled, IsRunning is
+ * only ever set in the shadow, i.e. is never propagated to the "real"
+ * table, so that hardware never sees IsRunning=1.
+ */
+ u64 avic_physical_id_entry;
/*
- * Per-vcpu list of struct amd_svm_iommu_ir:
- * This is used mainly to store interrupt remapping information used
- * when update the vcpu affinity. This avoids the need to scan for
- * IRTE and try to match ga_tag in the IOMMU driver.
+ * Per-vCPU list of irqfds that are eligible to post IRQs directly to
+ * the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass). The list
+ * is used to reconfigure IRTEs when the vCPU is loaded/put (to set the
+ * target pCPU), when AVIC is toggled on/off (to (de)activate bypass),
+ * and if the irqfd becomes ineligible for posting (to put the IRTE
+ * back into remapped mode).
*/
struct list_head ir_list;
spinlock_t ir_list_lock;
- /* Save desired MSR intercept (read: pass-through) state */
- struct {
- DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS);
- DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS);
- } shadow_msr_intercept;
-
struct vcpu_sev_es_state sev_es;
bool guest_state_loaded;
@@ -340,8 +348,6 @@ struct svm_cpu_data {
struct vmcb *save_area;
unsigned long save_area_pa;
- struct vmcb *current_vmcb;
-
/* index = sev_asid, value = vmcb pointer */
struct vmcb **sev_vmcbs;
};
@@ -610,17 +616,74 @@ static inline void svm_vmgexit_no_action(struct vcpu_svm *svm, u64 data)
svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data);
}
-/* svm.c */
-#define MSR_INVALID 0xffffffffU
+/*
+ * The MSRPM is 8KiB in size, divided into four 2KiB ranges (the fourth range
+ * is reserved). Each MSR within a range is covered by two bits, one each for
+ * read (bit 0) and write (bit 1), where a bit value of '1' means intercepted.
+ */
+#define SVM_MSRPM_BYTES_PER_RANGE 2048
+#define SVM_BITS_PER_MSR 2
+#define SVM_MSRS_PER_BYTE (BITS_PER_BYTE / SVM_BITS_PER_MSR)
+#define SVM_MSRS_PER_RANGE (SVM_MSRPM_BYTES_PER_RANGE * SVM_MSRS_PER_BYTE)
+static_assert(SVM_MSRS_PER_RANGE == 8192);
+#define SVM_MSRPM_OFFSET_MASK (SVM_MSRS_PER_RANGE - 1)
+
+static __always_inline int svm_msrpm_bit_nr(u32 msr)
+{
+ int range_nr;
+
+ switch (msr & ~SVM_MSRPM_OFFSET_MASK) {
+ case 0:
+ range_nr = 0;
+ break;
+ case 0xc0000000:
+ range_nr = 1;
+ break;
+ case 0xc0010000:
+ range_nr = 2;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return range_nr * SVM_MSRPM_BYTES_PER_RANGE * BITS_PER_BYTE +
+ (msr & SVM_MSRPM_OFFSET_MASK) * SVM_BITS_PER_MSR;
+}
+
+#define __BUILD_SVM_MSR_BITMAP_HELPER(rtype, action, bitop, access, bit_rw) \
+static inline rtype svm_##action##_msr_bitmap_##access(unsigned long *bitmap, \
+ u32 msr) \
+{ \
+ int bit_nr; \
+ \
+ bit_nr = svm_msrpm_bit_nr(msr); \
+ if (bit_nr < 0) \
+ return (rtype)true; \
+ \
+ return bitop##_bit(bit_nr + bit_rw, bitmap); \
+}
+
+#define BUILD_SVM_MSR_BITMAP_HELPERS(ret_type, action, bitop) \
+ __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0) \
+ __BUILD_SVM_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 1)
+
+BUILD_SVM_MSR_BITMAP_HELPERS(bool, test, test)
+BUILD_SVM_MSR_BITMAP_HELPERS(void, clear, __clear)
+BUILD_SVM_MSR_BITMAP_HELPERS(void, set, __set)
#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR)
+/* svm.c */
extern bool dump_invalid_vmcb;
-u32 svm_msrpm_offset(u32 msr);
-u32 *svm_vcpu_alloc_msrpm(void);
-void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
-void svm_vcpu_free_msrpm(u32 *msrpm);
+void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask);
+
+static inline void *svm_vcpu_alloc_msrpm(void)
+{
+ return svm_alloc_permissions_map(MSRPM_SIZE, GFP_KERNEL_ACCOUNT);
+}
+
+void svm_vcpu_free_msrpm(void *msrpm);
void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
void svm_enable_lbrv(struct kvm_vcpu *vcpu);
void svm_update_lbrv(struct kvm_vcpu *vcpu);
@@ -640,6 +703,20 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable);
void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
int trig_mode, int vec);
+void svm_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set);
+
+static inline void svm_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ svm_set_intercept_for_msr(vcpu, msr, type, false);
+}
+
+static inline void svm_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ svm_set_intercept_for_msr(vcpu, msr, type, true);
+}
+
/* nested.c */
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
@@ -668,6 +745,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
}
+int __init nested_svm_init_msrpm_merge_offsets(void);
+
int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
void svm_leave_nested(struct kvm_vcpu *vcpu);
@@ -718,7 +797,8 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \
- BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \
+ BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG) \
)
bool avic_hardware_setup(void);
@@ -733,8 +813,9 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void avic_vcpu_put(struct kvm_vcpu *vcpu);
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
-int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_ring_doorbell(struct kvm_vcpu *vcpu);
@@ -749,6 +830,7 @@ void sev_init_vmcb(struct vcpu_svm *svm);
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_vcpu_reset(struct vcpu_svm *svm);
+void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
@@ -785,6 +867,8 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
+struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu);
+void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa);
#else
static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
{
@@ -816,6 +900,11 @@ static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
return 0;
}
+static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
+{
+ return NULL;
+}
+static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {}
#endif
/* vmenter.S */
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 0c61153b275f..235c4af6b692 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -169,6 +169,9 @@ SYM_FUNC_START(__svm_vcpu_run)
#endif
mov VCPU_RDI(%_ASM_DI), %_ASM_DI
+ /* Clobbers EFLAGS.ZF */
+ VM_CLEAR_CPU_BUFFERS
+
/* Enter guest mode */
3: vmrun %_ASM_AX
4:
@@ -335,6 +338,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
mov SVM_current_vmcb(%rdi), %rax
mov KVM_VMCB_pa(%rax), %rax
+ /* Clobbers EFLAGS.ZF */
+ VM_CLEAR_CPU_BUFFERS
+
/* Enter guest mode */
1: vmrun %rax
2:
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index ba736cbb0587..57d79fd31df0 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -260,6 +260,86 @@ TRACE_EVENT(kvm_cpuid,
__entry->used_max_basic ? ", used max basic" : "")
);
+#define kvm_deliver_mode \
+ {0x0, "Fixed"}, \
+ {0x1, "LowPrio"}, \
+ {0x2, "SMI"}, \
+ {0x3, "Res3"}, \
+ {0x4, "NMI"}, \
+ {0x5, "INIT"}, \
+ {0x6, "SIPI"}, \
+ {0x7, "ExtINT"}
+
+#ifdef CONFIG_KVM_IOAPIC
+TRACE_EVENT(kvm_ioapic_set_irq,
+ TP_PROTO(__u64 e, int pin, bool coalesced),
+ TP_ARGS(e, pin, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u64, e )
+ __field( int, pin )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->e = e;
+ __entry->pin = pin;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
+ __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
+ __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->e & (1<<11)) ? "logical" : "physical",
+ (__entry->e & (1<<15)) ? "level" : "edge",
+ (__entry->e & (1<<16)) ? "|masked" : "",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
+ TP_PROTO(__u64 e),
+ TP_ARGS(e),
+
+ TP_STRUCT__entry(
+ __field( __u64, e )
+ ),
+
+ TP_fast_assign(
+ __entry->e = e;
+ ),
+
+ TP_printk("dst %x vec %u (%s|%s|%s%s)",
+ (u8)(__entry->e >> 56), (u8)__entry->e,
+ __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->e & (1<<11)) ? "logical" : "physical",
+ (__entry->e & (1<<15)) ? "level" : "edge",
+ (__entry->e & (1<<16)) ? "|masked" : "")
+);
+#endif
+
+TRACE_EVENT(kvm_msi_set_irq,
+ TP_PROTO(__u64 address, __u64 data),
+ TP_ARGS(address, data),
+
+ TP_STRUCT__entry(
+ __field( __u64, address )
+ __field( __u64, data )
+ ),
+
+ TP_fast_assign(
+ __entry->address = address;
+ __entry->data = data;
+ ),
+
+ TP_printk("dst %llx vec %u (%s|%s|%s%s)",
+ (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
+ (u8)__entry->data,
+ __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
+ (__entry->address & (1<<2)) ? "logical" : "physical",
+ (__entry->data & (1<<15)) ? "level" : "edge",
+ (__entry->address & (1<<3)) ? "|rh" : "")
+);
+
#define AREG(x) { APIC_##x, "APIC_" #x }
#define kvm_trace_symbol_apic \
@@ -1096,37 +1176,32 @@ TRACE_EVENT(kvm_smm_transition,
* Tracepoint for VT-d posted-interrupts and AMD-Vi Guest Virtual APIC.
*/
TRACE_EVENT(kvm_pi_irte_update,
- TP_PROTO(unsigned int host_irq, unsigned int vcpu_id,
- unsigned int gsi, unsigned int gvec,
- u64 pi_desc_addr, bool set),
- TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set),
+ TP_PROTO(unsigned int host_irq, struct kvm_vcpu *vcpu,
+ unsigned int gsi, unsigned int gvec, bool set),
+ TP_ARGS(host_irq, vcpu, gsi, gvec, set),
TP_STRUCT__entry(
__field( unsigned int, host_irq )
- __field( unsigned int, vcpu_id )
+ __field( int, vcpu_id )
__field( unsigned int, gsi )
__field( unsigned int, gvec )
- __field( u64, pi_desc_addr )
__field( bool, set )
),
TP_fast_assign(
__entry->host_irq = host_irq;
- __entry->vcpu_id = vcpu_id;
+ __entry->vcpu_id = vcpu ? vcpu->vcpu_id : -1;
__entry->gsi = gsi;
__entry->gvec = gvec;
- __entry->pi_desc_addr = pi_desc_addr;
__entry->set = set;
),
- TP_printk("PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
- "gvec: 0x%x, pi_desc_addr: 0x%llx",
+ TP_printk("PI is %s for irq %u, vcpu %d, gsi: 0x%x, gvec: 0x%x",
__entry->set ? "enabled and being updated" : "disabled",
__entry->host_irq,
__entry->vcpu_id,
__entry->gsi,
- __entry->gvec,
- __entry->pi_desc_addr)
+ __entry->gvec)
);
/*
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index cb6588238f46..5316c27f6099 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -15,7 +15,6 @@ extern bool __read_mostly enable_ept;
extern bool __read_mostly enable_unrestricted_guest;
extern bool __read_mostly enable_ept_ad_bits;
extern bool __read_mostly enable_pml;
-extern bool __read_mostly enable_ipiv;
extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index 8f46a06e2c44..bc5ece76533a 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -53,8 +53,6 @@ struct vcpu_vt {
#ifdef CONFIG_X86_64
u64 msr_host_kernel_gs_base;
#endif
-
- unsigned long host_debugctlmsr;
};
#ifdef CONFIG_KVM_INTEL_TDX
@@ -71,8 +69,8 @@ static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu)
#else
-static inline bool is_td(struct kvm *kvm) { return false; }
-static inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; }
+static __always_inline bool is_td(struct kvm *kvm) { return false; }
+static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; }
#endif
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 94d5d907d37b..dbab1c15b0cd 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -12,7 +12,6 @@
#ifdef CONFIG_KVM_INTEL_TDX
static_assert(offsetof(struct vcpu_vmx, vt) == offsetof(struct vcpu_tdx, vt));
-#endif
static void vt_disable_virtualization_cpu(void)
{
@@ -30,40 +29,8 @@ static __init int vt_hardware_setup(void)
if (ret)
return ret;
- /*
- * Update vt_x86_ops::vm_size here so it is ready before
- * kvm_ops_update() is called in kvm_x86_vendor_init().
- *
- * Note, the actual bringing up of TDX must be done after
- * kvm_ops_update() because enabling TDX requires enabling
- * hardware virtualization first, i.e., all online CPUs must
- * be in post-VMXON state. This means the @vm_size here
- * may be updated to TDX's size but TDX may fail to enable
- * at later time.
- *
- * The VMX/VT code could update kvm_x86_ops::vm_size again
- * after bringing up TDX, but this would require exporting
- * either kvm_x86_ops or kvm_ops_update() from the base KVM
- * module, which looks overkill. Anyway, the worst case here
- * is KVM may allocate couple of more bytes than needed for
- * each VM.
- */
- if (enable_tdx) {
- vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size,
- sizeof(struct kvm_tdx));
- /*
- * Note, TDX may fail to initialize in a later time in
- * vt_init(), in which case it is not necessary to setup
- * those callbacks. But making them valid here even
- * when TDX fails to init later is fine because those
- * callbacks won't be called if the VM isn't TDX guest.
- */
- vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
- vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
- vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
- vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
- vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
- }
+ if (enable_tdx)
+ tdx_hardware_setup();
return 0;
}
@@ -176,12 +143,12 @@ static int vt_vcpu_pre_run(struct kvm_vcpu *vcpu)
return vmx_vcpu_pre_run(vcpu);
}
-static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
if (is_td_vcpu(vcpu))
- return tdx_vcpu_run(vcpu, force_immediate_exit);
+ return tdx_vcpu_run(vcpu, run_flags);
- return vmx_vcpu_run(vcpu, force_immediate_exit);
+ return vmx_vcpu_run(vcpu, run_flags);
}
static int vt_handle_exit(struct kvm_vcpu *vcpu,
@@ -221,7 +188,7 @@ static int vt_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return vmx_get_msr(vcpu, msr_info);
}
-static void vt_msr_filter_changed(struct kvm_vcpu *vcpu)
+static void vt_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
/*
* TDX doesn't allow VMM to configure interception of MSR accesses.
@@ -232,7 +199,7 @@ static void vt_msr_filter_changed(struct kvm_vcpu *vcpu)
if (is_td_vcpu(vcpu))
return;
- vmx_msr_filter_changed(vcpu);
+ vmx_recalc_msr_intercepts(vcpu);
}
static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
@@ -240,7 +207,7 @@ static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
if (is_td_vcpu(vcpu))
return tdx_complete_emulated_msr(vcpu, err);
- return kvm_complete_insn_gp(vcpu, err);
+ return vmx_complete_emulated_msr(vcpu, err);
}
#ifdef CONFIG_KVM_SMM
@@ -315,14 +282,6 @@ static void vt_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
return vmx_set_virtual_apic_mode(vcpu);
}
-static void vt_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi = vcpu_to_pi_desc(vcpu);
-
- pi_clear_on(pi);
- memset(pi->pir, 0, sizeof(pi->pir));
-}
-
static void vt_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
if (is_td_vcpu(vcpu))
@@ -498,14 +457,6 @@ static void vt_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
vmx_set_gdt(vcpu, dt);
}
-static void vt_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
-{
- if (is_td_vcpu(vcpu))
- return;
-
- vmx_set_dr6(vcpu, val);
-}
-
static void vt_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
if (is_td_vcpu(vcpu))
@@ -888,6 +839,13 @@ static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
return 0;
}
+#define vt_op(name) vt_##name
+#define vt_op_tdx_only(name) vt_##name
+#else /* CONFIG_KVM_INTEL_TDX */
+#define vt_op(name) vmx_##name
+#define vt_op_tdx_only(name) NULL
+#endif /* CONFIG_KVM_INTEL_TDX */
+
#define VMX_REQUIRED_APICV_INHIBITS \
(BIT(APICV_INHIBIT_REASON_DISABLED) | \
BIT(APICV_INHIBIT_REASON_ABSENT) | \
@@ -905,152 +863,153 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.hardware_unsetup = vmx_hardware_unsetup,
.enable_virtualization_cpu = vmx_enable_virtualization_cpu,
- .disable_virtualization_cpu = vt_disable_virtualization_cpu,
+ .disable_virtualization_cpu = vt_op(disable_virtualization_cpu),
.emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu,
- .has_emulated_msr = vt_has_emulated_msr,
+ .has_emulated_msr = vt_op(has_emulated_msr),
.vm_size = sizeof(struct kvm_vmx),
- .vm_init = vt_vm_init,
- .vm_pre_destroy = vt_vm_pre_destroy,
- .vm_destroy = vt_vm_destroy,
+ .vm_init = vt_op(vm_init),
+ .vm_destroy = vt_op(vm_destroy),
+ .vm_pre_destroy = vt_op_tdx_only(vm_pre_destroy),
+
+ .vcpu_precreate = vt_op(vcpu_precreate),
+ .vcpu_create = vt_op(vcpu_create),
+ .vcpu_free = vt_op(vcpu_free),
+ .vcpu_reset = vt_op(vcpu_reset),
- .vcpu_precreate = vt_vcpu_precreate,
- .vcpu_create = vt_vcpu_create,
- .vcpu_free = vt_vcpu_free,
- .vcpu_reset = vt_vcpu_reset,
+ .prepare_switch_to_guest = vt_op(prepare_switch_to_guest),
+ .vcpu_load = vt_op(vcpu_load),
+ .vcpu_put = vt_op(vcpu_put),
- .prepare_switch_to_guest = vt_prepare_switch_to_guest,
- .vcpu_load = vt_vcpu_load,
- .vcpu_put = vt_vcpu_put,
+ .HOST_OWNED_DEBUGCTL = VMX_HOST_OWNED_DEBUGCTL_BITS,
- .update_exception_bitmap = vt_update_exception_bitmap,
+ .update_exception_bitmap = vt_op(update_exception_bitmap),
.get_feature_msr = vmx_get_feature_msr,
- .get_msr = vt_get_msr,
- .set_msr = vt_set_msr,
-
- .get_segment_base = vt_get_segment_base,
- .get_segment = vt_get_segment,
- .set_segment = vt_set_segment,
- .get_cpl = vt_get_cpl,
- .get_cpl_no_cache = vt_get_cpl_no_cache,
- .get_cs_db_l_bits = vt_get_cs_db_l_bits,
- .is_valid_cr0 = vt_is_valid_cr0,
- .set_cr0 = vt_set_cr0,
- .is_valid_cr4 = vt_is_valid_cr4,
- .set_cr4 = vt_set_cr4,
- .set_efer = vt_set_efer,
- .get_idt = vt_get_idt,
- .set_idt = vt_set_idt,
- .get_gdt = vt_get_gdt,
- .set_gdt = vt_set_gdt,
- .set_dr6 = vt_set_dr6,
- .set_dr7 = vt_set_dr7,
- .sync_dirty_debug_regs = vt_sync_dirty_debug_regs,
- .cache_reg = vt_cache_reg,
- .get_rflags = vt_get_rflags,
- .set_rflags = vt_set_rflags,
- .get_if_flag = vt_get_if_flag,
-
- .flush_tlb_all = vt_flush_tlb_all,
- .flush_tlb_current = vt_flush_tlb_current,
- .flush_tlb_gva = vt_flush_tlb_gva,
- .flush_tlb_guest = vt_flush_tlb_guest,
-
- .vcpu_pre_run = vt_vcpu_pre_run,
- .vcpu_run = vt_vcpu_run,
- .handle_exit = vt_handle_exit,
+ .get_msr = vt_op(get_msr),
+ .set_msr = vt_op(set_msr),
+
+ .get_segment_base = vt_op(get_segment_base),
+ .get_segment = vt_op(get_segment),
+ .set_segment = vt_op(set_segment),
+ .get_cpl = vt_op(get_cpl),
+ .get_cpl_no_cache = vt_op(get_cpl_no_cache),
+ .get_cs_db_l_bits = vt_op(get_cs_db_l_bits),
+ .is_valid_cr0 = vt_op(is_valid_cr0),
+ .set_cr0 = vt_op(set_cr0),
+ .is_valid_cr4 = vt_op(is_valid_cr4),
+ .set_cr4 = vt_op(set_cr4),
+ .set_efer = vt_op(set_efer),
+ .get_idt = vt_op(get_idt),
+ .set_idt = vt_op(set_idt),
+ .get_gdt = vt_op(get_gdt),
+ .set_gdt = vt_op(set_gdt),
+ .set_dr7 = vt_op(set_dr7),
+ .sync_dirty_debug_regs = vt_op(sync_dirty_debug_regs),
+ .cache_reg = vt_op(cache_reg),
+ .get_rflags = vt_op(get_rflags),
+ .set_rflags = vt_op(set_rflags),
+ .get_if_flag = vt_op(get_if_flag),
+
+ .flush_tlb_all = vt_op(flush_tlb_all),
+ .flush_tlb_current = vt_op(flush_tlb_current),
+ .flush_tlb_gva = vt_op(flush_tlb_gva),
+ .flush_tlb_guest = vt_op(flush_tlb_guest),
+
+ .vcpu_pre_run = vt_op(vcpu_pre_run),
+ .vcpu_run = vt_op(vcpu_run),
+ .handle_exit = vt_op(handle_exit),
.skip_emulated_instruction = vmx_skip_emulated_instruction,
.update_emulated_instruction = vmx_update_emulated_instruction,
- .set_interrupt_shadow = vt_set_interrupt_shadow,
- .get_interrupt_shadow = vt_get_interrupt_shadow,
- .patch_hypercall = vt_patch_hypercall,
- .inject_irq = vt_inject_irq,
- .inject_nmi = vt_inject_nmi,
- .inject_exception = vt_inject_exception,
- .cancel_injection = vt_cancel_injection,
- .interrupt_allowed = vt_interrupt_allowed,
- .nmi_allowed = vt_nmi_allowed,
- .get_nmi_mask = vt_get_nmi_mask,
- .set_nmi_mask = vt_set_nmi_mask,
- .enable_nmi_window = vt_enable_nmi_window,
- .enable_irq_window = vt_enable_irq_window,
- .update_cr8_intercept = vt_update_cr8_intercept,
+ .set_interrupt_shadow = vt_op(set_interrupt_shadow),
+ .get_interrupt_shadow = vt_op(get_interrupt_shadow),
+ .patch_hypercall = vt_op(patch_hypercall),
+ .inject_irq = vt_op(inject_irq),
+ .inject_nmi = vt_op(inject_nmi),
+ .inject_exception = vt_op(inject_exception),
+ .cancel_injection = vt_op(cancel_injection),
+ .interrupt_allowed = vt_op(interrupt_allowed),
+ .nmi_allowed = vt_op(nmi_allowed),
+ .get_nmi_mask = vt_op(get_nmi_mask),
+ .set_nmi_mask = vt_op(set_nmi_mask),
+ .enable_nmi_window = vt_op(enable_nmi_window),
+ .enable_irq_window = vt_op(enable_irq_window),
+ .update_cr8_intercept = vt_op(update_cr8_intercept),
.x2apic_icr_is_split = false,
- .set_virtual_apic_mode = vt_set_virtual_apic_mode,
- .set_apic_access_page_addr = vt_set_apic_access_page_addr,
- .refresh_apicv_exec_ctrl = vt_refresh_apicv_exec_ctrl,
- .load_eoi_exitmap = vt_load_eoi_exitmap,
- .apicv_pre_state_restore = vt_apicv_pre_state_restore,
+ .set_virtual_apic_mode = vt_op(set_virtual_apic_mode),
+ .set_apic_access_page_addr = vt_op(set_apic_access_page_addr),
+ .refresh_apicv_exec_ctrl = vt_op(refresh_apicv_exec_ctrl),
+ .load_eoi_exitmap = vt_op(load_eoi_exitmap),
+ .apicv_pre_state_restore = pi_apicv_pre_state_restore,
.required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
- .hwapic_isr_update = vt_hwapic_isr_update,
- .sync_pir_to_irr = vt_sync_pir_to_irr,
- .deliver_interrupt = vt_deliver_interrupt,
+ .hwapic_isr_update = vt_op(hwapic_isr_update),
+ .sync_pir_to_irr = vt_op(sync_pir_to_irr),
+ .deliver_interrupt = vt_op(deliver_interrupt),
.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
- .set_tss_addr = vt_set_tss_addr,
- .set_identity_map_addr = vt_set_identity_map_addr,
+ .set_tss_addr = vt_op(set_tss_addr),
+ .set_identity_map_addr = vt_op(set_identity_map_addr),
.get_mt_mask = vmx_get_mt_mask,
- .get_exit_info = vt_get_exit_info,
- .get_entry_info = vt_get_entry_info,
+ .get_exit_info = vt_op(get_exit_info),
+ .get_entry_info = vt_op(get_entry_info),
- .vcpu_after_set_cpuid = vt_vcpu_after_set_cpuid,
+ .vcpu_after_set_cpuid = vt_op(vcpu_after_set_cpuid),
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
- .get_l2_tsc_offset = vt_get_l2_tsc_offset,
- .get_l2_tsc_multiplier = vt_get_l2_tsc_multiplier,
- .write_tsc_offset = vt_write_tsc_offset,
- .write_tsc_multiplier = vt_write_tsc_multiplier,
+ .get_l2_tsc_offset = vt_op(get_l2_tsc_offset),
+ .get_l2_tsc_multiplier = vt_op(get_l2_tsc_multiplier),
+ .write_tsc_offset = vt_op(write_tsc_offset),
+ .write_tsc_multiplier = vt_op(write_tsc_multiplier),
- .load_mmu_pgd = vt_load_mmu_pgd,
+ .load_mmu_pgd = vt_op(load_mmu_pgd),
.check_intercept = vmx_check_intercept,
.handle_exit_irqoff = vmx_handle_exit_irqoff,
- .update_cpu_dirty_logging = vt_update_cpu_dirty_logging,
+ .update_cpu_dirty_logging = vt_op(update_cpu_dirty_logging),
.nested_ops = &vmx_nested_ops,
.pi_update_irte = vmx_pi_update_irte,
- .pi_start_assignment = vmx_pi_start_assignment,
+ .pi_start_bypass = vmx_pi_start_bypass,
#ifdef CONFIG_X86_64
- .set_hv_timer = vt_set_hv_timer,
- .cancel_hv_timer = vt_cancel_hv_timer,
+ .set_hv_timer = vt_op(set_hv_timer),
+ .cancel_hv_timer = vt_op(cancel_hv_timer),
#endif
- .setup_mce = vt_setup_mce,
+ .setup_mce = vt_op(setup_mce),
#ifdef CONFIG_KVM_SMM
- .smi_allowed = vt_smi_allowed,
- .enter_smm = vt_enter_smm,
- .leave_smm = vt_leave_smm,
- .enable_smi_window = vt_enable_smi_window,
+ .smi_allowed = vt_op(smi_allowed),
+ .enter_smm = vt_op(enter_smm),
+ .leave_smm = vt_op(leave_smm),
+ .enable_smi_window = vt_op(enable_smi_window),
#endif
- .check_emulate_instruction = vt_check_emulate_instruction,
- .apic_init_signal_blocked = vt_apic_init_signal_blocked,
+ .check_emulate_instruction = vt_op(check_emulate_instruction),
+ .apic_init_signal_blocked = vt_op(apic_init_signal_blocked),
.migrate_timers = vmx_migrate_timers,
- .msr_filter_changed = vt_msr_filter_changed,
- .complete_emulated_msr = vt_complete_emulated_msr,
+ .recalc_msr_intercepts = vt_op(recalc_msr_intercepts),
+ .complete_emulated_msr = vt_op(complete_emulated_msr),
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
.get_untagged_addr = vmx_get_untagged_addr,
- .mem_enc_ioctl = vt_mem_enc_ioctl,
- .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
+ .mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl),
+ .vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl),
- .private_max_mapping_level = vt_gmem_private_max_mapping_level
+ .private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level)
};
struct kvm_x86_init_ops vt_init_ops __initdata = {
- .hardware_setup = vt_hardware_setup,
+ .hardware_setup = vt_op(hardware_setup),
.handle_intel_pt_intr = NULL,
.runtime_ops = &vt_x86_ops,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 71701e2414a4..b8ea1969113d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -302,7 +302,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
cpu = get_cpu();
prev = vmx->loaded_vmcs;
vmx->loaded_vmcs = vmcs;
- vmx_vcpu_load_vmcs(vcpu, cpu, prev);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
vmx_sync_vmcs_host_state(vmx, prev);
put_cpu();
@@ -715,6 +715,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_APERF, MSR_TYPE_R);
+
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_MPERF, MSR_TYPE_R);
+
kvm_vcpu_unmap(vcpu, &map);
vmx->nested.force_msr_bitmap_recalc = false;
@@ -825,12 +831,30 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
return 0;
}
+static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
+ vmx->nested.msrs.misc_high);
+
+ return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
+}
+
static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
u32 count, u64 addr)
{
if (count == 0)
return 0;
+ /*
+ * Exceeding the limit results in architecturally _undefined_ behavior,
+ * i.e. KVM is allowed to do literally anything in response to a bad
+ * limit. Immediately generate a consistency check so that code that
+ * consumes the count doesn't need to worry about extreme edge cases.
+ */
+ if (count > nested_vmx_max_atomic_switch_msrs(vcpu))
+ return -EINVAL;
+
if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
!kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
return -EINVAL;
@@ -941,15 +965,6 @@ static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
return 0;
}
-static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
- vmx->nested.msrs.misc_high);
-
- return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
-}
-
/*
* Load guest's/host's msr at nested entry/exit.
* return 0 for success, entry index for failure.
@@ -966,7 +981,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
for (i = 0; i < count; i++) {
- if (unlikely(i >= max_msr_list_size))
+ if (WARN_ON_ONCE(i >= max_msr_list_size))
goto fail;
if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
@@ -1054,7 +1069,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
for (i = 0; i < count; i++) {
- if (unlikely(i >= max_msr_list_size))
+ if (WARN_ON_ONCE(i >= max_msr_list_size))
return -EINVAL;
if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
@@ -2654,10 +2669,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (vmx->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
+ vmx_get_supported_debugctl(vcpu, false));
} else {
kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
+ vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
}
if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
@@ -3147,7 +3163,8 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
return -EINVAL;
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
- CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
+ (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
+ CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
return -EINVAL;
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
@@ -3521,7 +3538,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
if (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
- vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
if (kvm_mpx_supported() &&
(!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
@@ -4521,12 +4538,12 @@ static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
cpu = get_cpu();
vmx->loaded_vmcs = &vmx->nested.vmcs02;
- vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
vmx->loaded_vmcs = &vmx->vmcs01;
- vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
put_cpu();
}
@@ -4599,6 +4616,12 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
+ /*
+ * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02.
+ * Writes to DEBUGCTL that aren't intercepted by L1 are immediately
+ * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into
+ * vmcs02 doesn't strictly track vmcs12.
+ */
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
vmcs12->guest_dr7 = vcpu->arch.dr7;
@@ -4789,7 +4812,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
__vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
kvm_set_dr(vcpu, 7, 0x400);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+ vmx_guest_debugctl_write(vcpu, 0);
if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
vmcs12->vm_exit_msr_load_count))
@@ -4844,6 +4867,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
}
+ /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
+ vmx_reload_guest_debugctl(vcpu);
+
/*
* Note that calling vmx_set_{efer,cr0,cr4} is important as they
* handle a variety of side effects to KVM's software model.
@@ -5021,16 +5047,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- /*
- * If IBRS is advertised to the vCPU, KVM must flush the indirect
- * branch predictors when transitioning from L2 to L1, as L1 expects
- * hardware (KVM in this case) to provide separate predictor modes.
- * Bare metal isolates VMX root (host) from VMX non-root (guest), but
- * doesn't isolate different VMCSs, i.e. in this case, doesn't provide
- * separate modes for L2 vs L1.
- */
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL))
- indirect_branch_prediction_barrier();
+ kvm_nested_vmexit_handle_ibrs(vcpu);
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index bbf4509f32d0..0b173602821b 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -653,11 +653,11 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
*/
static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
{
- u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ u64 data = vmx_guest_debugctl_read();
if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
data &= ~DEBUGCTLMSR_LBR;
- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ vmx_guest_debugctl_write(vcpu, data);
}
}
@@ -730,7 +730,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
if (!lbr_desc->event) {
vmx_disable_lbr_msrs_passthrough(vcpu);
- if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
+ if (vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR)
goto warn;
if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
goto warn;
@@ -752,7 +752,7 @@ warn:
static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
{
- if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
+ if (!(vmx_guest_debugctl_read() & DEBUGCTLMSR_LBR))
intel_pmu_release_guest_lbr_event(vcpu);
}
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 99d1d599ff8c..4a6d9a17da23 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -2,6 +2,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
#include <asm/irq_remapping.h>
#include <asm/cpu.h>
@@ -34,7 +35,7 @@ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING
-struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
return &(to_vt(vcpu)->pi_desc);
}
@@ -72,13 +73,10 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
/*
* If the vCPU wasn't on the wakeup list and wasn't migrated, then the
* full update can be skipped as neither the vector nor the destination
- * needs to be changed.
+ * needs to be changed. Clear SN even if there is no assigned device,
+ * again for simplicity.
*/
if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
- /*
- * Clear SN if it was set due to being preempted. Again, do
- * this even if there is no assigned device for simplicity.
- */
if (pi_test_and_clear_sn(pi_desc))
goto after_clear_sn;
return;
@@ -148,9 +146,13 @@ after_clear_sn:
static bool vmx_can_use_vtd_pi(struct kvm *kvm)
{
- return irqchip_in_kernel(kvm) && enable_apicv &&
- kvm_arch_has_assigned_device(kvm) &&
- irq_remapping_cap(IRQ_POSTING_CAP);
+ /*
+ * Note, reading the number of possible bypass IRQs can race with a
+ * bypass IRQ being attached to the VM. vmx_pi_start_bypass() ensures
+ * blockng vCPUs will see an elevated count or get KVM_REQ_UNBLOCK.
+ */
+ return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() &&
+ READ_ONCE(kvm->arch.nr_possible_bypass_irqs);
}
/*
@@ -225,17 +227,23 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
if (!vmx_needs_pi_wakeup(vcpu))
return;
- if (kvm_vcpu_is_blocking(vcpu) &&
+ /*
+ * If the vCPU is blocking with IRQs enabled and ISN'T being preempted,
+ * enable the wakeup handler so that notification IRQ wakes the vCPU as
+ * expected. There is no need to enable the wakeup handler if the vCPU
+ * is preempted between setting its wait state and manually scheduling
+ * out, as the task is still runnable, i.e. doesn't need a wake event
+ * from KVM to be scheduled in.
+ *
+ * If the wakeup handler isn't being enabled, Suppress Notifications as
+ * the cost of propagating PIR.IRR to PID.ON is negligible compared to
+ * the cost of a spurious IRQ, and vCPU put/load is a slow path.
+ */
+ if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) &&
((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) ||
(!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu))))
pi_enable_wakeup_handler(vcpu);
-
- /*
- * Set SN when the vCPU is preempted. Note, the vCPU can both be seen
- * as blocking and preempted, e.g. if it's preempted between setting
- * its wait state and manually scheduling out.
- */
- if (vcpu->preempted)
+ else
pi_set_sn(pi_desc);
}
@@ -264,6 +272,14 @@ void __init pi_init_cpu(int cpu)
raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
}
+void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi = vcpu_to_pi_desc(vcpu);
+
+ pi_clear_on(pi);
+ memset(pi->pir, 0, sizeof(pi->pir));
+}
+
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@ -274,99 +290,30 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
/*
- * Bail out of the block loop if the VM has an assigned
- * device, but the blocking vCPU didn't reconfigure the
- * PI.NV to the wakeup vector, i.e. the assigned device
- * came along after the initial check in vmx_vcpu_pi_put().
+ * Kick all vCPUs when the first possible bypass IRQ is attached to a VM, as
+ * blocking vCPUs may scheduled out without reconfiguring PID.NV to the wakeup
+ * vector, i.e. if the bypass IRQ came along after vmx_vcpu_pi_put().
*/
-void vmx_pi_start_assignment(struct kvm *kvm)
+void vmx_pi_start_bypass(struct kvm *kvm)
{
- if (!irq_remapping_cap(IRQ_POSTING_CAP))
+ if (WARN_ON_ONCE(!vmx_can_use_vtd_pi(kvm)))
return;
kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
}
-/*
- * vmx_pi_update_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector)
{
- struct kvm_kernel_irq_routing_entry *e;
- struct kvm_irq_routing_table *irq_rt;
- bool enable_remapped_mode = true;
- struct kvm_lapic_irq irq;
- struct kvm_vcpu *vcpu;
- struct vcpu_data vcpu_info;
- int idx, ret = 0;
-
- if (!vmx_can_use_vtd_pi(kvm))
- return 0;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- if (guest_irq >= irq_rt->nr_rt_entries ||
- hlist_empty(&irq_rt->map[guest_irq])) {
- pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
- guest_irq, irq_rt->nr_rt_entries);
- goto out;
- }
-
- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
- if (e->type != KVM_IRQ_ROUTING_MSI)
- continue;
- /*
- * VT-d PI cannot support posting multicast/broadcast
- * interrupts to a vCPU, we still use interrupt remapping
- * for these kind of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with single CPU as the destination, e.g. user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- *
- * We will support full lowest-priority interrupt later.
- *
- * In addition, we can only inject generic interrupts using
- * the PI mechanism, refuse to route others through it.
- */
-
- kvm_set_msi_irq(kvm, e, &irq);
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq))
- continue;
-
- vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
- vcpu_info.vector = irq.vector;
-
- trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
- vcpu_info.vector, vcpu_info.pi_desc_addr, set);
-
- if (!set)
- continue;
-
- enable_remapped_mode = false;
-
- ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
- if (ret < 0) {
- printk(KERN_INFO "%s: failed to update PI IRTE\n",
- __func__);
- goto out;
- }
+ if (vcpu) {
+ struct intel_iommu_pi_data pi_data = {
+ .pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)),
+ .vector = vector,
+ };
+
+ return irq_set_vcpu_affinity(host_irq, &pi_data);
+ } else {
+ return irq_set_vcpu_affinity(host_irq, NULL);
}
-
- if (enable_remapped_mode)
- ret = irq_set_vcpu_affinity(host_irq, NULL);
-
- ret = 0;
-out:
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
}
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 68605ca7ef68..a4af39948cf0 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -3,24 +3,27 @@
#define __KVM_X86_VMX_POSTED_INTR_H
#include <linux/bitmap.h>
-#include <asm/posted_intr.h>
+#include <linux/find.h>
+#include <linux/kvm_host.h>
-struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu);
+#include <asm/posted_intr.h>
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
+void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set);
-void vmx_pi_start_assignment(struct kvm *kvm);
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+ unsigned int host_irq, uint32_t guest_irq,
+ struct kvm_vcpu *vcpu, u32 vector);
+void vmx_pi_start_bypass(struct kvm *kvm);
static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
{
int vec;
- vec = find_last_bit((unsigned long *)pi_desc->pir, 256);
+ vec = find_last_bit(pi_desc->pir, 256);
return vec < 256 ? vec : -1;
}
diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h
index 6a9bfdfbb6e5..2f20fb170def 100644
--- a/arch/x86/kvm/vmx/run_flags.h
+++ b/arch/x86/kvm/vmx/run_flags.h
@@ -2,10 +2,12 @@
#ifndef __KVM_X86_VMX_RUN_FLAGS_H
#define __KVM_X86_VMX_RUN_FLAGS_H
-#define VMX_RUN_VMRESUME_SHIFT 0
-#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1
+#define VMX_RUN_VMRESUME_SHIFT 0
+#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1
+#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT 2
-#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT)
-#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT)
+#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT)
+#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT)
+#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT)
#endif /* __KVM_X86_VMX_RUN_FLAGS_H */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index b952bc673271..66744f5768c8 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -173,6 +173,8 @@ static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char i
tdx_clear_unsupported_cpuid(entry);
}
+#define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1)
+
static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
struct kvm_tdx_capabilities *caps)
{
@@ -188,6 +190,9 @@ static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
caps->cpuid.nent = td_conf->num_cpuid_config;
+ caps->user_tdvmcallinfo_1_r11 =
+ TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;
+
for (i = 0; i < td_conf->num_cpuid_config; i++)
td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
@@ -738,7 +743,7 @@ bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
!to_tdx(vcpu)->vp_enter_args.r12;
}
-bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
+static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
u64 vcpu_state_details;
@@ -778,8 +783,6 @@ void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
else
vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
- vt->host_debugctlmsr = get_debugctlmsr();
-
vt->guest_state_loaded = true;
}
@@ -1020,20 +1023,20 @@ static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
DEBUGCTLMSR_FREEZE_IN_SMM)
-fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
struct vcpu_tdx *tdx = to_tdx(vcpu);
struct vcpu_vt *vt = to_vt(vcpu);
/*
- * force_immediate_exit requires vCPU entering for events injection with
- * an immediately exit followed. But The TDX module doesn't guarantee
- * entry, it's already possible for KVM to _think_ it completely entry
- * to the guest without actually having done so.
- * Since KVM never needs to force an immediate exit for TDX, and can't
- * do direct injection, just warn on force_immediate_exit.
+ * WARN if KVM wants to force an immediate exit, as the TDX module does
+ * not guarantee entry into the guest, i.e. it's possible for KVM to
+ * _think_ it completed entry to the guest and forced an immediate exit
+ * without actually having done so. Luckily, KVM never needs to force
+ * an immediate exit for TDX (KVM can't do direct event injection, so
+ * just WARN and continue on.
*/
- WARN_ON_ONCE(force_immediate_exit);
+ WARN_ON_ONCE(run_flags);
/*
* Wait until retry of SEPT-zap-related SEAMCALL completes before
@@ -1043,7 +1046,7 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
return EXIT_FASTPATH_EXIT_HANDLED;
- trace_kvm_entry(vcpu, force_immediate_exit);
+ trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);
if (pi_test_on(&vt->pi_desc)) {
apic->send_IPI_self(POSTED_INTR_VECTOR);
@@ -1055,8 +1058,8 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
tdx_vcpu_enter_exit(vcpu);
- if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED)
- update_debugctlmsr(vt->host_debugctlmsr);
+ if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
tdx_load_host_xsave_state(vcpu);
tdx->guest_entered = true;
@@ -1212,11 +1215,13 @@ static int tdx_map_gpa(struct kvm_vcpu *vcpu)
/*
* Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
* userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
- * bit set. If not, the error code is not defined in GHCI for TDX, use
- * TDVMCALL_STATUS_INVALID_OPERAND for this case.
+ * bit set. This is a base call so it should always be supported, but
+ * KVM has no way to ensure that userspace implements the GHCI correctly.
+ * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
+ * to the guest.
*/
if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
- ret = TDVMCALL_STATUS_INVALID_OPERAND;
+ ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
goto error;
}
@@ -1449,20 +1454,106 @@ error:
return 1;
}
+static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
+
+ /*
+ * For now, there is no TDVMCALL beyond GHCI base API supported by KVM
+ * directly without the support from userspace, just set the value
+ * returned from userspace.
+ */
+ tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
+ tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
+ tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
+ tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
+
+ return 1;
+}
+
static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
{
struct vcpu_tdx *tdx = to_tdx(vcpu);
- if (tdx->vp_enter_args.r12)
- tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
- else {
+ switch (tdx->vp_enter_args.r12) {
+ case 0:
tdx->vp_enter_args.r11 = 0;
+ tdx->vp_enter_args.r12 = 0;
tdx->vp_enter_args.r13 = 0;
tdx->vp_enter_args.r14 = 0;
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
+ return 1;
+ case 1:
+ vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
+ vcpu->run->exit_reason = KVM_EXIT_TDX;
+ vcpu->run->tdx.flags = 0;
+ vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
+ vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
+ vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
+ vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
+ vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
+ vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
+ vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
+ return 0;
+ default:
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
}
+}
+
+static int tdx_complete_simple(struct kvm_vcpu *vcpu)
+{
+ tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
return 1;
}
+static int tdx_get_quote(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u64 gpa = tdx->vp_enter_args.r12;
+ u64 size = tdx->vp_enter_args.r13;
+
+ /* The gpa of buffer must have shared bit set. */
+ if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_TDX;
+ vcpu->run->tdx.flags = 0;
+ vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
+ vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
+ vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
+ vcpu->run->tdx.get_quote.size = size;
+
+ vcpu->arch.complete_userspace_io = tdx_complete_simple;
+
+ return 0;
+}
+
+static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u64 vector = tdx->vp_enter_args.r12;
+
+ if (vector < 32 || vector > 255) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_TDX;
+ vcpu->run->tdx.flags = 0;
+ vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
+ vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
+ vcpu->run->tdx.setup_event_notify.vector = vector;
+
+ vcpu->arch.complete_userspace_io = tdx_complete_simple;
+
+ return 0;
+}
+
static int handle_tdvmcall(struct kvm_vcpu *vcpu)
{
switch (tdvmcall_leaf(vcpu)) {
@@ -1472,11 +1563,15 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu)
return tdx_report_fatal_error(vcpu);
case TDVMCALL_GET_TD_VM_CALL_INFO:
return tdx_get_td_vm_call_info(vcpu);
+ case TDVMCALL_GET_QUOTE:
+ return tdx_get_quote(vcpu);
+ case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
+ return tdx_setup_event_notify_interrupt(vcpu);
default:
break;
}
- tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
return 1;
}
@@ -1543,8 +1638,8 @@ static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
return 0;
}
-int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
+static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct page *page = pfn_to_page(pfn);
@@ -1624,8 +1719,8 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
return 0;
}
-int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt)
+static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt)
{
int tdx_level = pg_level_to_tdx_sept_level(level);
gpa_t gpa = gfn_to_gpa(gfn);
@@ -1760,8 +1855,8 @@ static void tdx_track(struct kvm *kvm)
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
-int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt)
+static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
@@ -1783,8 +1878,8 @@ int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
return tdx_reclaim_page(virt_to_page(private_spt));
}
-int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn)
+static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
{
struct page *page = pfn_to_page(pfn);
int ret;
@@ -2172,25 +2267,26 @@ static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
struct kvm_tdx_capabilities __user *user_caps;
struct kvm_tdx_capabilities *caps = NULL;
+ u32 nr_user_entries;
int ret = 0;
/* flags is reserved for future use */
if (cmd->flags)
return -EINVAL;
- caps = kmalloc(sizeof(*caps) +
+ caps = kzalloc(sizeof(*caps) +
sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
GFP_KERNEL);
if (!caps)
return -ENOMEM;
user_caps = u64_to_user_ptr(cmd->data);
- if (copy_from_user(caps, user_caps, sizeof(*caps))) {
+ if (get_user(nr_user_entries, &user_caps->cpuid.nent)) {
ret = -EFAULT;
goto out;
}
- if (caps->cpuid.nent < td_conf->num_cpuid_config) {
+ if (nr_user_entries < td_conf->num_cpuid_config) {
ret = -E2BIG;
goto out;
}
@@ -3507,10 +3603,14 @@ int __init tdx_bringup(void)
r = __tdx_bringup();
if (r) {
/*
- * Disable TDX only but don't fail to load module if
- * the TDX module could not be loaded. No need to print
- * message saying "module is not loaded" because it was
- * printed when the first SEAMCALL failed.
+ * Disable TDX only but don't fail to load module if the TDX
+ * module could not be loaded. No need to print message saying
+ * "module is not loaded" because it was printed when the first
+ * SEAMCALL failed. Don't bother unwinding the S-EPT hooks or
+ * vm_size, as kvm_x86_ops have already been finalized (and are
+ * intentionally not exported). The S-EPT code is unreachable,
+ * and allocating a few more bytes per VM in a should-be-rare
+ * failure scenario is a non-issue.
*/
if (r == -ENODEV)
goto success_disable_tdx;
@@ -3524,3 +3624,20 @@ success_disable_tdx:
enable_tdx = 0;
return 0;
}
+
+void __init tdx_hardware_setup(void)
+{
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);
+
+ /*
+ * Note, if the TDX module can't be loaded, KVM TDX support will be
+ * disabled but KVM will continue loading (see tdx_bringup()).
+ */
+ vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
+
+ vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
+ vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
+ vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
+ vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
+ vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
+}
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 51f98443e8a2..ca39a9391db1 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -8,6 +8,7 @@
#ifdef CONFIG_KVM_INTEL_TDX
#include "common.h"
+void tdx_hardware_setup(void);
int tdx_bringup(void);
void tdx_cleanup(void);
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index f6986dee6f8c..0a6cf5bff2aa 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -59,8 +59,7 @@
* without the explicit restore, thinks the stack is getting walloped.
* Using an unwind hint is problematic due to x86-64's dynamic alignment.
*/
- mov %_ASM_BP, %_ASM_SP
- pop %_ASM_BP
+ leave
RET
.endm
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b12414108cbf..aa157fe5b7b3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -75,6 +75,8 @@
#include "vmx_onhyperv.h"
#include "posted_intr.h"
+#include "mmu/spte.h"
+
MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");
@@ -113,10 +115,10 @@ static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, 0444);
module_param(enable_apicv, bool, 0444);
-
-bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);
+module_param(enable_device_posted_irqs, bool, 0444);
+
/*
* If nested=1, nested virtualization is supported, i.e., guests may use
* VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -166,31 +168,6 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
RTIT_STATUS_BYTECNT))
/*
- * List of MSRs that can be directly passed to the guest.
- * In addition to these x2apic, PT and LBR MSRs are handled specially.
- */
-static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
- MSR_IA32_SPEC_CTRL,
- MSR_IA32_PRED_CMD,
- MSR_IA32_FLUSH_CMD,
- MSR_IA32_TSC,
-#ifdef CONFIG_X86_64
- MSR_FS_BASE,
- MSR_GS_BASE,
- MSR_KERNEL_GS_BASE,
- MSR_IA32_XFD,
- MSR_IA32_XFD_ERR,
-#endif
- MSR_IA32_SYSENTER_CS,
- MSR_IA32_SYSENTER_ESP,
- MSR_IA32_SYSENTER_EIP,
- MSR_CORE_C1_RES,
- MSR_CORE_C3_RESIDENCY,
- MSR_CORE_C6_RESIDENCY,
- MSR_CORE_C7_RESIDENCY,
-};
-
-/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
* executions of PAUSE in a loop. Also indicate if ple enabled.
@@ -672,40 +649,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
-static int vmx_get_passthrough_msr_slot(u32 msr)
-{
- int i;
-
- switch (msr) {
- case 0x800 ... 0x8ff:
- /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
- return -ENOENT;
- case MSR_IA32_RTIT_STATUS:
- case MSR_IA32_RTIT_OUTPUT_BASE:
- case MSR_IA32_RTIT_OUTPUT_MASK:
- case MSR_IA32_RTIT_CR3_MATCH:
- case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
- /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
- case MSR_LBR_SELECT:
- case MSR_LBR_TOS:
- case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
- case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
- case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
- case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
- case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
- /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
- return -ENOENT;
- }
-
- for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
- if (vmx_possible_passthrough_msrs[i] == msr)
- return i;
- }
-
- WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
- return -ENOENT;
-}
-
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -772,8 +715,11 @@ void vmx_emergency_disable_virtualization_cpu(void)
return;
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
+ loaded_vmcss_on_cpu_link) {
vmcs_clear(v->vmcs);
+ if (v->shadow_vmcs)
+ vmcs_clear(v->shadow_vmcs);
+ }
kvm_cpu_vmxoff();
}
@@ -958,6 +904,10 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
flags |= VMX_RUN_SAVE_SPEC_CTRL;
+ if (static_branch_unlikely(&cpu_buf_vm_clear) &&
+ kvm_vcpu_can_access_host_mmio(&vmx->vcpu))
+ flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
+
return flags;
}
@@ -1445,8 +1395,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
}
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
- struct loaded_vmcs *buddy)
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
@@ -1473,17 +1422,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
if (prev != vmx->loaded_vmcs->vmcs) {
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
vmcs_load(vmx->loaded_vmcs->vmcs);
-
- /*
- * No indirect branch prediction barrier needed when switching
- * the active VMCS within a vCPU, unless IBRS is advertised to
- * the vCPU. To minimize the number of IBPBs executed, KVM
- * performs IBPB on nested VM-Exit (a single nested transition
- * may switch the active VMCS multiple times).
- */
- if (static_branch_likely(&switch_vcpu_ibpb) &&
- (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)))
- indirect_branch_prediction_barrier();
}
if (!already_loaded) {
@@ -1522,7 +1460,7 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
- vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
vmx_vcpu_pi_load(vcpu, cpu);
}
@@ -2156,7 +2094,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break;
case MSR_IA32_DEBUGCTLMSR:
- msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ msr_info->data = vmx_guest_debugctl_read();
break;
default:
find_uret_msr:
@@ -2181,7 +2119,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
return (unsigned long)data;
}
-static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
{
u64 debugctl = 0;
@@ -2193,9 +2131,25 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
(host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+ if (boot_cpu_has(X86_FEATURE_RTM) &&
+ (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)))
+ debugctl |= DEBUGCTLMSR_RTM_DEBUG;
+
return debugctl;
}
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated)
+{
+ u64 invalid;
+
+ invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated);
+ if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
+ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+ invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
+ }
+ return !invalid;
+}
+
/*
* Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -2264,29 +2218,22 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
- case MSR_IA32_DEBUGCTLMSR: {
- u64 invalid;
-
- invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
- if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
- kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
- data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
- invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
- }
-
- if (invalid)
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated))
return 1;
+ data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
+
if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
VM_EXIT_SAVE_DEBUG_CONTROLS)
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
- vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ vmx_guest_debugctl_write(vcpu, data);
+
if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
(data & DEBUGCTLMSR_LBR))
intel_pmu_create_guest_lbr_event(vcpu);
return 0;
- }
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
@@ -4020,76 +3967,29 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
vmx->nested.force_msr_bitmap_recalc = true;
}
-void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
- int idx;
if (!cpu_has_vmx_msr_bitmap())
return;
vmx_msr_bitmap_l01_changed(vmx);
- /*
- * Mark the desired intercept state in shadow bitmap, this is needed
- * for resync when the MSR filters change.
- */
- idx = vmx_get_passthrough_msr_slot(msr);
- if (idx >= 0) {
- if (type & MSR_TYPE_R)
- clear_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- clear_bit(idx, vmx->shadow_msr_intercept.write);
- }
-
- if ((type & MSR_TYPE_R) &&
- !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
- vmx_set_msr_bitmap_read(msr_bitmap, msr);
- type &= ~MSR_TYPE_R;
- }
-
- if ((type & MSR_TYPE_W) &&
- !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
- vmx_set_msr_bitmap_write(msr_bitmap, msr);
- type &= ~MSR_TYPE_W;
+ if (type & MSR_TYPE_R) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
+ vmx_clear_msr_bitmap_read(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
}
- if (type & MSR_TYPE_R)
- vmx_clear_msr_bitmap_read(msr_bitmap, msr);
-
- if (type & MSR_TYPE_W)
- vmx_clear_msr_bitmap_write(msr_bitmap, msr);
-}
-
-void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
- int idx;
-
- if (!cpu_has_vmx_msr_bitmap())
- return;
-
- vmx_msr_bitmap_l01_changed(vmx);
-
- /*
- * Mark the desired intercept state in shadow bitmap, this is needed
- * for resync when the MSR filter changes.
- */
- idx = vmx_get_passthrough_msr_slot(msr);
- if (idx >= 0) {
- if (type & MSR_TYPE_R)
- set_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- set_bit(idx, vmx->shadow_msr_intercept.write);
+ if (type & MSR_TYPE_W) {
+ if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
+ vmx_clear_msr_bitmap_write(msr_bitmap, msr);
+ else
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-
- if (type & MSR_TYPE_R)
- vmx_set_msr_bitmap_read(msr_bitmap, msr);
-
- if (type & MSR_TYPE_W)
- vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
@@ -4168,35 +4068,57 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
}
}
-void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 i;
-
if (!cpu_has_vmx_msr_bitmap())
return;
- /*
- * Redo intercept permissions for MSRs that KVM is passing through to
- * the guest. Disabling interception will check the new MSR filter and
- * ensure that KVM enables interception if usersepace wants to filter
- * the MSR. MSRs that KVM is already intercepting don't need to be
- * refreshed since KVM is going to intercept them regardless of what
- * userspace wants.
- */
- for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
- u32 msr = vmx_possible_passthrough_msrs[i];
-
- if (!test_bit(i, vmx->shadow_msr_intercept.read))
- vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
-
- if (!test_bit(i, vmx->shadow_msr_intercept.write))
- vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+#ifdef CONFIG_X86_64
+ vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ if (kvm_cstate_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+ }
+ if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
}
/* PT MSRs can be passed through iff PT is exposed to the guest. */
if (vmx_pt_mode_is_host_guest())
pt_update_intercept_for_msr(vcpu);
+
+ if (vcpu->arch.xfd_no_write_intercept)
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW);
+
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW,
+ !to_vmx(vcpu)->spec_ctrl);
+
+ if (kvm_cpu_cap_has(X86_FEATURE_XFD))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
+
+ if (cpu_feature_enabled(X86_FEATURE_IBPB))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
+
+ if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
+ /*
+ * x2APIC and LBR MSR intercepts are modified on-demand and cannot be
+ * filtered by userspace.
+ */
}
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -4797,7 +4719,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write32(GUEST_SYSENTER_CS, 0);
vmcs_writel(GUEST_SYSENTER_ESP, 0);
vmcs_writel(GUEST_SYSENTER_EIP, 0);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ vmx_guest_debugctl_write(&vmx->vcpu, 0);
if (cpu_has_vmx_tpr_shadow()) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
@@ -5613,12 +5536,6 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
-void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
-{
- lockdep_assert_irqs_disabled();
- set_debugreg(vcpu->arch.dr6, 6);
-}
-
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
@@ -7297,8 +7214,8 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
else if (static_branch_unlikely(&cpu_buf_vm_clear) &&
- kvm_arch_has_assigned_device(vcpu->kvm))
- mds_clear_cpu_buffers();
+ (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO))
+ x86_clear_cpu_buffers();
vmx_disable_fb_clear(vmx);
@@ -7330,8 +7247,9 @@ out:
guest_state_exit_irqoff();
}
-fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
{
+ bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT;
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -7376,6 +7294,12 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
vcpu->arch.regs_dirty = 0;
+ if (run_flags & KVM_RUN_LOAD_GUEST_DR6)
+ set_debugreg(vcpu->arch.dr6, 6);
+
+ if (run_flags & KVM_RUN_LOAD_DEBUGCTL)
+ vmx_reload_guest_debugctl(vcpu);
+
/*
* Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
* prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
@@ -7550,26 +7474,6 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu)
evmcs->hv_enlightenments_control.msr_bitmap = 1;
}
- /* The MSR bitmap starts with all ones */
- bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
-
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
-#ifdef CONFIG_X86_64
- vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
-#endif
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
- if (kvm_cstate_in_guest(vcpu->kvm)) {
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
- }
-
vmx->loaded_vmcs = &vmx->vmcs01;
if (cpu_need_virtualize_apic_accesses(vcpu)) {
@@ -7619,7 +7523,7 @@ free_vpid:
int vmx_vm_init(struct kvm *kvm)
{
if (!ple_gap)
- kvm->arch.pause_in_guest = true;
+ kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
switch (l1tf_mitigation) {
@@ -7856,18 +7760,6 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
}
}
- if (kvm_cpu_cap_has(X86_FEATURE_XFD))
- vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
- !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
-
- if (boot_cpu_has(X86_FEATURE_IBPB))
- vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
- !guest_has_pred_cmd_msr(vcpu));
-
- if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
- vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
- !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
-
set_cr4_guest_host_mask(vmx);
vmx_write_encls_bitmap(vcpu, NULL);
@@ -7883,6 +7775,9 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx->msr_ia32_feature_control_valid_bits &=
~FEAT_CTL_SGX_LC_ENABLED;
+ /* Recalc MSR interception to account for feature changes. */
+ vmx_recalc_msr_intercepts(vcpu);
+
/* Refresh #PF interception to account for MAXPHYADDR changes. */
vmx_update_exception_bitmap(vcpu);
}
@@ -8657,6 +8552,8 @@ int __init vmx_init(void)
{
int r, cpu;
+ KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
+
if (!kvm_is_vmx_supported())
return -EOPNOTSUPP;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 6d1e40ecc024..d3389baf3ab3 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -19,8 +19,6 @@
#include "../mmu.h"
#include "common.h"
-#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
-
#ifdef CONFIG_X86_64
#define MAX_NR_USER_RETURN_MSRS 7
#else
@@ -296,13 +294,6 @@ struct vcpu_vmx {
struct pt_desc pt_desc;
struct lbr_desc lbr_desc;
- /* Save desired MSR intercept (read: pass-through) state */
-#define MAX_POSSIBLE_PASSTHROUGH_MSRS 16
- struct {
- DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- } shadow_msr_intercept;
-
/* ve_info must be page aligned. */
struct vmx_ve_information *ve_info;
};
@@ -354,8 +345,7 @@ static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
return vt->exit_intr_info;
}
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
- struct loaded_vmcs *buddy);
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu);
int allocate_vpid(void);
void free_vpid(int vpid);
void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
@@ -396,24 +386,54 @@ bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs,
int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
-void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
-void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set);
+
+static inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ vmx_set_intercept_for_msr(vcpu, msr, type, false);
+}
+
+static inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type)
+{
+ vmx_set_intercept_for_msr(vcpu, msr, type, true);
+}
u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
-static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
- int type, bool value)
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+
+u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated);
+bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool host_initiated);
+
+#define VMX_HOST_OWNED_DEBUGCTL_BITS (DEBUGCTLMSR_FREEZE_IN_SMM)
+
+static inline void vmx_guest_debugctl_write(struct kvm_vcpu *vcpu, u64 val)
{
- if (value)
- vmx_enable_intercept_for_msr(vcpu, msr, type);
- else
- vmx_disable_intercept_for_msr(vcpu, msr, type);
+ WARN_ON_ONCE(val & VMX_HOST_OWNED_DEBUGCTL_BITS);
+
+ val |= vcpu->arch.host_debugctl & VMX_HOST_OWNED_DEBUGCTL_BITS;
+ vmcs_write64(GUEST_IA32_DEBUGCTL, val);
}
-void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+static inline u64 vmx_guest_debugctl_read(void)
+{
+ return vmcs_read64(GUEST_IA32_DEBUGCTL) & ~VMX_HOST_OWNED_DEBUGCTL_BITS;
+}
+
+static inline void vmx_reload_guest_debugctl(struct kvm_vcpu *vcpu)
+{
+ u64 val = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+ if (!((val ^ vcpu->arch.host_debugctl) & VMX_HOST_OWNED_DEBUGCTL_BITS))
+ return;
+
+ vmx_guest_debugctl_write(vcpu, val & ~VMX_HOST_OWNED_DEBUGCTL_BITS);
+}
/*
* Note, early Intel manuals have the write-low and read-high bitmap offsets
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 6bf8be570b2e..2b3424f638db 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -21,7 +21,7 @@ void vmx_vm_destroy(struct kvm *kvm);
int vmx_vcpu_precreate(struct kvm *kvm);
int vmx_vcpu_create(struct kvm_vcpu *vcpu);
int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu);
-fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags);
void vmx_vcpu_free(struct kvm_vcpu *vcpu);
void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
@@ -52,11 +52,12 @@ void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
int trig_mode, int vector);
void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
bool vmx_has_emulated_msr(struct kvm *kvm, u32 index);
-void vmx_msr_filter_changed(struct kvm_vcpu *vcpu);
+void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu);
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
int vmx_get_feature_msr(u32 msr, u64 *data);
int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+#define vmx_complete_emulated_msr kvm_complete_insn_gp
u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -132,10 +133,9 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void tdx_vcpu_free(struct kvm_vcpu *vcpu);
void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu);
-fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags);
void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
void tdx_vcpu_put(struct kvm_vcpu *vcpu);
-bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu);
int tdx_handle_exit(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion fastpath);
@@ -150,84 +150,10 @@ int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
-int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt);
-int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, void *private_spt);
-int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn);
-int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level, kvm_pfn_t pfn);
-
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
-#else
-static inline void tdx_disable_virtualization_cpu(void) {}
-static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
-static inline void tdx_mmu_release_hkid(struct kvm *kvm) {}
-static inline void tdx_vm_destroy(struct kvm *kvm) {}
-static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
-
-static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
-static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
-static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {}
-static inline void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) {}
-static inline int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
-static inline fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
-{
- return EXIT_FASTPATH_NONE;
-}
-static inline void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) {}
-static inline void tdx_vcpu_put(struct kvm_vcpu *vcpu) {}
-static inline bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) { return false; }
-static inline int tdx_handle_exit(struct kvm_vcpu *vcpu,
- enum exit_fastpath_completion fastpath) { return 0; }
-
-static inline void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
- int trig_mode, int vector) {}
-static inline void tdx_inject_nmi(struct kvm_vcpu *vcpu) {}
-static inline void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1,
- u64 *info2, u32 *intr_info, u32 *error_code) {}
-static inline bool tdx_has_emulated_msr(u32 index) { return false; }
-static inline int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return 1; }
-static inline int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return 1; }
-
-static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }
-
-static inline int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level,
- void *private_spt)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level,
- void *private_spt)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level,
- kvm_pfn_t pfn)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level,
- kvm_pfn_t pfn)
-{
- return -EOPNOTSUPP;
-}
-
-static inline void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) {}
-static inline void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) {}
-static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {}
-static inline int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) { return 0; }
#endif
#endif /* __KVM_X86_VMX_X86_OPS_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 570e7f8cbf64..a1c49bc681c4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -226,6 +226,12 @@ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
bool __read_mostly enable_apicv = true;
EXPORT_SYMBOL_GPL(enable_apicv);
+bool __read_mostly enable_ipiv = true;
+EXPORT_SYMBOL_GPL(enable_ipiv);
+
+bool __read_mostly enable_device_posted_irqs = true;
+EXPORT_SYMBOL_GPL(enable_device_posted_irqs);
+
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
@@ -3255,9 +3261,11 @@ int kvm_guest_time_update(struct kvm_vcpu *v)
/* With all the info we got, fill in the values */
- if (kvm_caps.has_tsc_control)
+ if (kvm_caps.has_tsc_control) {
tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
v->arch.l1_tsc_scaling_ratio);
+ tgt_tsc_khz = tgt_tsc_khz ? : 1;
+ }
if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
@@ -4574,6 +4582,9 @@ static u64 kvm_get_allowed_disable_exits(void)
{
u64 r = KVM_X86_DISABLE_EXITS_PAUSE;
+ if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+ r |= KVM_X86_DISABLE_EXITS_APERFMPERF;
+
if (!mitigate_smt_rsb) {
r |= KVM_X86_DISABLE_EXITS_HLT |
KVM_X86_DISABLE_EXITS_CSTATE;
@@ -4629,17 +4640,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_EXT_CPUID:
case KVM_CAP_EXT_EMUL_CPUID:
case KVM_CAP_CLOCKSOURCE:
+#ifdef CONFIG_KVM_IOAPIC
case KVM_CAP_PIT:
+ case KVM_CAP_PIT2:
+ case KVM_CAP_PIT_STATE2:
+ case KVM_CAP_REINJECT_CONTROL:
+#endif
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
case KVM_CAP_USER_NMI:
- case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_IOEVENTFD:
case KVM_CAP_IOEVENTFD_NO_LENGTH:
- case KVM_CAP_PIT2:
- case KVM_CAP_PIT_STATE2:
+
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
case KVM_CAP_VCPU_EVENTS:
#ifdef CONFIG_KVM_HYPERV
@@ -4980,16 +4994,13 @@ out:
return r;
}
-static void wbinvd_ipi(void *garbage)
-{
- wbinvd();
-}
-
static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
{
return kvm_arch_has_noncoherent_dma(vcpu->kvm);
}
+static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu);
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -5006,12 +5017,24 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (kvm_x86_call(has_wbinvd_exit)())
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
- smp_call_function_single(vcpu->cpu,
- wbinvd_ipi, NULL, 1);
+ wbinvd_on_cpu(vcpu->cpu);
}
kvm_x86_call(vcpu_load)(vcpu, cpu);
+ if (vcpu != per_cpu(last_vcpu, cpu)) {
+ /*
+ * Flush the branch predictor when switching vCPUs on the same
+ * physical CPU, as each vCPU needs its own branch prediction
+ * domain. No IBPB is needed when switching between L1 and L2
+ * on the same vCPU unless IBRS is advertised to the vCPU; that
+ * is handled on the nested VM-Exit path.
+ */
+ if (static_branch_likely(&switch_vcpu_ibpb))
+ indirect_branch_prediction_barrier();
+ per_cpu(last_vcpu, cpu) = vcpu;
+ }
+
/* Save host pkru register if supported */
vcpu->arch.host_pkru = read_pkru();
@@ -5469,12 +5492,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
(events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
return -EINVAL;
- /* INITs are latched while in SMM */
- if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
- (events->smi.smm || events->smi.pending) &&
- vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
- return -EINVAL;
-
process_nmi(vcpu);
/*
@@ -6168,6 +6185,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
u32 user_tsc_khz;
r = -EINVAL;
+
+ if (vcpu->arch.guest_tsc_protected)
+ goto out;
+
user_tsc_khz = (u32)arg;
if (kvm_caps.has_tsc_control &&
@@ -6377,135 +6398,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return 0;
}
-static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- memcpy(&chip->chip.pic, &pic->pics[0],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- memcpy(&chip->chip.pic, &pic->pics[1],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_IOAPIC:
- kvm_get_ioapic(kvm, &chip->chip.ioapic);
- break;
- default:
- r = -EINVAL;
- break;
- }
- return r;
-}
-
-static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- spin_lock(&pic->lock);
- memcpy(&pic->pics[0], &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- spin_unlock(&pic->lock);
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- spin_lock(&pic->lock);
- memcpy(&pic->pics[1], &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- spin_unlock(&pic->lock);
- break;
- case KVM_IRQCHIP_IOAPIC:
- kvm_set_ioapic(kvm, &chip->chip.ioapic);
- break;
- default:
- r = -EINVAL;
- break;
- }
- kvm_pic_update_irq(pic);
- return r;
-}
-
-static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
- struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
-
- BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
-
- mutex_lock(&kps->lock);
- memcpy(ps, &kps->channels, sizeof(*ps));
- mutex_unlock(&kps->lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
- int i;
- struct kvm_pit *pit = kvm->arch.vpit;
-
- mutex_lock(&pit->pit_state.lock);
- memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
- for (i = 0; i < 3; i++)
- kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
- mutex_unlock(&pit->pit_state.lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
- sizeof(ps->channels));
- ps->flags = kvm->arch.vpit->pit_state.flags;
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- memset(&ps->reserved, 0, sizeof(ps->reserved));
- return 0;
-}
-
-static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
- int start = 0;
- int i;
- u32 prev_legacy, cur_legacy;
- struct kvm_pit *pit = kvm->arch.vpit;
-
- mutex_lock(&pit->pit_state.lock);
- prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
- cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
- if (!prev_legacy && cur_legacy)
- start = 1;
- memcpy(&pit->pit_state.channels, &ps->channels,
- sizeof(pit->pit_state.channels));
- pit->pit_state.flags = ps->flags;
- for (i = 0; i < 3; i++)
- kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
- start && i == 0);
- mutex_unlock(&pit->pit_state.lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_reinject(struct kvm *kvm,
- struct kvm_reinject_control *control)
-{
- struct kvm_pit *pit = kvm->arch.vpit;
-
- /* pit->pit_state.lock was overloaded to prevent userspace from getting
- * an inconsistent state after running multiple KVM_REINJECT_CONTROL
- * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
- */
- mutex_lock(&pit->pit_state.lock);
- kvm_pit_set_reinject(pit, control->pit_reinject);
- mutex_unlock(&pit->pit_state.lock);
-
- return 0;
-}
-
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
@@ -6525,18 +6417,6 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
kvm_vcpu_kick(vcpu);
}
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
- bool line_status)
-{
- if (!irqchip_in_kernel(kvm))
- return -ENXIO;
-
- irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event->irq, irq_event->level,
- line_status);
- return 0;
-}
-
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
@@ -6601,17 +6481,11 @@ split_irqchip_unlock:
if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) &&
cpu_smt_possible() &&
- (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE))
+ (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE |
+ KVM_X86_DISABLE_EXITS_APERFMPERF)))
pr_warn_once(SMT_RSB_MSG);
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
- kvm->arch.pause_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT)
- kvm->arch.mwait_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
- kvm->arch.hlt_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
- kvm->arch.cstate_in_guest = true;
+ kvm_disable_exits(kvm, cap->args[0]);
r = 0;
disable_exits_unlock:
mutex_unlock(&kvm->lock);
@@ -7048,9 +6922,11 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r = -ENOTTY;
+
+#ifdef CONFIG_KVM_IOAPIC
/*
* This union makes it completely explicit to gcc-3.x
- * that these two variables' stack usage should be
+ * that these three variables' stack usage should be
* combined, not added together.
*/
union {
@@ -7058,6 +6934,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
struct kvm_pit_state2 ps2;
struct kvm_pit_config pit_config;
} u;
+#endif
switch (ioctl) {
case KVM_SET_TSS_ADDR:
@@ -7081,6 +6958,7 @@ set_identity_unlock:
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
break;
+#ifdef CONFIG_KVM_IOAPIC
case KVM_CREATE_IRQCHIP: {
mutex_lock(&kvm->lock);
@@ -7102,7 +6980,7 @@ set_identity_unlock:
goto create_irqchip_unlock;
}
- r = kvm_setup_default_irq_routing(kvm);
+ r = kvm_setup_default_ioapic_and_pic_routing(kvm);
if (r) {
kvm_ioapic_destroy(kvm);
kvm_pic_destroy(kvm);
@@ -7150,7 +7028,7 @@ set_identity_unlock:
}
r = -ENXIO;
- if (!irqchip_kernel(kvm))
+ if (!irqchip_full(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
@@ -7174,7 +7052,7 @@ set_identity_unlock:
}
r = -ENXIO;
- if (!irqchip_kernel(kvm))
+ if (!irqchip_full(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
set_irqchip_out:
@@ -7247,6 +7125,7 @@ set_pit2_out:
r = kvm_vm_ioctl_reinject(kvm, &control);
break;
}
+#endif
case KVM_SET_BOOT_CPU_ID:
r = 0;
mutex_lock(&kvm->lock);
@@ -7317,19 +7196,25 @@ set_pit2_out:
if (user_tsc_khz == 0)
user_tsc_khz = tsc_khz;
- WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
- r = 0;
-
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
goto out;
}
case KVM_GET_TSC_KHZ: {
r = READ_ONCE(kvm->arch.default_tsc_khz);
goto out;
}
- case KVM_MEMORY_ENCRYPT_OP: {
+ case KVM_MEMORY_ENCRYPT_OP:
+ r = -ENOTTY;
+ if (!kvm_x86_ops.mem_enc_ioctl)
+ goto out;
+
r = kvm_x86_call(mem_enc_ioctl)(kvm, argp);
break;
- }
case KVM_MEMORY_ENCRYPT_REG_REGION: {
struct kvm_enc_region region;
@@ -8023,7 +7908,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
return rc;
if (!vcpu->mmio_nr_fragments)
- return rc;
+ return X86EMUL_CONTINUE;
gpa = vcpu->mmio_fragments[0].gpa;
@@ -8268,8 +8153,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
- on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
- wbinvd_ipi, NULL, 1);
+ wbinvd_on_cpus_mask(vcpu->arch.wbinvd_dirty_mask);
put_cpu();
cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
} else
@@ -9361,7 +9245,7 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
{
vcpu->arch.pio.count = 0;
- if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)))
return 1;
return kvm_skip_emulated_instruction(vcpu);
@@ -9386,7 +9270,7 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
complete_fast_pio_out_port_0x7e;
kvm_skip_emulated_instruction(vcpu);
} else {
- vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_out;
}
return 0;
@@ -9399,7 +9283,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
/* We should only ever be called with arch.pio.count equal to 1 */
BUG_ON(vcpu->arch.pio.count != 1);
- if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) {
vcpu->arch.pio.count = 0;
return 1;
}
@@ -9428,7 +9312,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
return ret;
}
- vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_in;
return 0;
@@ -9811,6 +9695,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
if (r != 0)
goto out_mmu_exit;
+ enable_device_posted_irqs &= enable_apicv &&
+ irq_remapping_cap(IRQ_POSTING_CAP);
+
kvm_ops_update(ops);
for_each_online_cpu(cpu) {
@@ -10694,13 +10581,16 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
return;
bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
+ vcpu->arch.highest_stale_pending_ioapic_eoi = -1;
kvm_x86_call(sync_pir_to_irr)(vcpu);
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
+#ifdef CONFIG_KVM_IOAPIC
else if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+#endif
if (is_guest_mode(vcpu))
vcpu->arch.load_eoi_exitmap_pending = true;
@@ -10754,6 +10644,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
fastpath_t exit_fastpath;
+ u64 run_flags, debug_ctl;
bool req_immediate_exit = false;
@@ -10901,8 +10792,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_vcpu_update_apicv(vcpu);
if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
kvm_check_async_pf_completion(vcpu);
+
+ /*
+ * Recalc MSR intercepts as userspace may want to intercept
+ * accesses to MSRs that KVM would otherwise pass through to
+ * the guest.
+ */
if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
- kvm_x86_call(msr_filter_changed)(vcpu);
+ kvm_x86_call(recalc_msr_intercepts)(vcpu);
if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
kvm_x86_call(update_cpu_dirty_logging)(vcpu);
@@ -10998,8 +10895,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto cancel_injection;
}
- if (req_immediate_exit)
+ run_flags = 0;
+ if (req_immediate_exit) {
+ run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -11010,19 +10910,29 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (unlikely(vcpu->arch.switch_db_regs &&
!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
- set_debugreg(0, 7);
+ set_debugreg(DR7_FIXED_1, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
/* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6);
+ run_flags |= KVM_RUN_LOAD_GUEST_DR6;
} else if (unlikely(hw_breakpoint_active())) {
- set_debugreg(0, 7);
+ set_debugreg(DR7_FIXED_1, 7);
}
- vcpu->arch.host_debugctl = get_debugctlmsr();
+ /*
+ * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL
+ * can be modified in IRQ context, e.g. via SMP function calls. Inform
+ * vendor code if any host-owned bits were changed, e.g. so that the
+ * value loaded into hardware while running the guest can be updated.
+ */
+ debug_ctl = get_debugctlmsr();
+ if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
+ !vcpu->arch.guest_state_protected)
+ run_flags |= KVM_RUN_LOAD_DEBUGCTL;
+ vcpu->arch.host_debugctl = debug_ctl;
guest_timing_enter_irqoff();
@@ -11036,8 +10946,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
(kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
- exit_fastpath = kvm_x86_call(vcpu_run)(vcpu,
- req_immediate_exit);
+ exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, run_flags);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
@@ -11049,6 +10958,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
break;
}
+ run_flags = 0;
+
/* Note, VM-Exits that go down the "slow" path are accounted below. */
++vcpu->stat.exits;
}
@@ -11522,6 +11433,28 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
trace_kvm_fpu(0);
}
+static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ /*
+ * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and
+ * tracks the pending SIPI separately. SIPI_RECEIVED is still accepted
+ * by KVM_SET_VCPU_EVENTS for backwards compatibility, but should be
+ * converted to INIT_RECEIVED.
+ */
+ if (WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED))
+ return -EINVAL;
+
+ /*
+ * Disallow running the vCPU if userspace forced it into an impossible
+ * MP_STATE, e.g. if the vCPU is in WFS but SIPI is blocked.
+ */
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED &&
+ !kvm_apic_init_sipi_allowed(vcpu))
+ return -EINVAL;
+
+ return kvm_x86_call(vcpu_pre_run)(vcpu);
+}
+
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
@@ -11624,7 +11557,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
- r = kvm_x86_call(vcpu_pre_run)(vcpu);
+ r = kvm_x86_vcpu_pre_run(vcpu);
if (r <= 0)
goto out;
@@ -11868,21 +11801,16 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
}
/*
- * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow
- * forcing the guest into INIT/SIPI if those events are supposed to be
- * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state
- * if an SMI is pending as well.
+ * SIPI_RECEIVED is obsolete and no longer used internally; KVM instead
+ * leaves the vCPU in INIT_RECIEVED (Wait-For-SIPI) and pends the SIPI.
+ * Translate SIPI_RECEIVED as appropriate for backwards compatibility.
*/
- if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) &&
- (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
- mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
- goto out;
-
if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
- kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
+ mp_state->mp_state = KVM_MP_STATE_INIT_RECEIVED;
set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
- } else
- kvm_set_mp_state(vcpu, mp_state->mp_state);
+ }
+
+ kvm_set_mp_state(vcpu, mp_state->mp_state);
kvm_make_request(KVM_REQ_EVENT, vcpu);
ret = 0;
@@ -12419,13 +12347,16 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- int idx;
+ int idx, cpu;
kvm_clear_async_pf_completion_queue(vcpu);
kvm_mmu_unload(vcpu);
kvmclock_reset(vcpu);
+ for_each_possible_cpu(cpu)
+ cmpxchg(per_cpu_ptr(&last_vcpu, cpu), vcpu, NULL);
+
kvm_x86_call(vcpu_free)(vcpu);
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
@@ -12761,21 +12692,16 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
goto out;
- kvm_mmu_init_vm(kvm);
+ ret = kvm_mmu_init_vm(kvm);
+ if (ret)
+ goto out_cleanup_page_track;
ret = kvm_x86_call(vm_init)(kvm);
if (ret)
goto out_uninit_mmu;
- INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
- /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
- set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
- /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
- set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
- &kvm->arch.irq_sources_bitmap);
-
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
@@ -12814,6 +12740,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
out_uninit_mmu:
kvm_mmu_uninit_vm(kvm);
+out_cleanup_page_track:
kvm_page_track_cleanup(kvm);
out:
return ret;
@@ -12906,7 +12833,9 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
+#ifdef CONFIG_KVM_IOAPIC
kvm_free_pit(kvm);
+#endif
kvm_mmu_pre_destroy_vm(kvm);
static_call_cond(kvm_x86_vm_pre_destroy)(kvm);
@@ -12930,8 +12859,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
}
kvm_destroy_vcpus(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
+#ifdef CONFIG_KVM_IOAPIC
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
+#endif
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
@@ -13541,25 +13472,6 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
}
-void kvm_arch_start_assignment(struct kvm *kvm)
-{
- if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
- kvm_x86_call(pi_start_assignment)(kvm);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
-
-void kvm_arch_end_assignment(struct kvm *kvm)
-{
- atomic_dec(&kvm->arch.assigned_device_count);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
-
-bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
-{
- return raw_atomic_read(&kvm->arch.assigned_device_count);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
-
static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
{
/*
@@ -13595,77 +13507,6 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
-int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
- struct irq_bypass_producer *prod)
-{
- struct kvm_kernel_irqfd *irqfd =
- container_of(cons, struct kvm_kernel_irqfd, consumer);
- struct kvm *kvm = irqfd->kvm;
- int ret;
-
- kvm_arch_start_assignment(irqfd->kvm);
-
- spin_lock_irq(&kvm->irqfds.lock);
- irqfd->producer = prod;
-
- ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
- prod->irq, irqfd->gsi, 1);
- if (ret)
- kvm_arch_end_assignment(irqfd->kvm);
-
- spin_unlock_irq(&kvm->irqfds.lock);
-
-
- return ret;
-}
-
-void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
- struct irq_bypass_producer *prod)
-{
- int ret;
- struct kvm_kernel_irqfd *irqfd =
- container_of(cons, struct kvm_kernel_irqfd, consumer);
- struct kvm *kvm = irqfd->kvm;
-
- WARN_ON(irqfd->producer != prod);
-
- /*
- * When producer of consumer is unregistered, we change back to
- * remapped mode, so we can re-use the current implementation
- * when the irq is masked/disabled or the consumer side (KVM
- * int this case doesn't want to receive the interrupts.
- */
- spin_lock_irq(&kvm->irqfds.lock);
- irqfd->producer = NULL;
-
- ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
- prod->irq, irqfd->gsi, 0);
- if (ret)
- printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
- " fails: %d\n", irqfd->consumer.token, ret);
-
- spin_unlock_irq(&kvm->irqfds.lock);
-
-
- kvm_arch_end_assignment(irqfd->kvm);
-}
-
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set);
-}
-
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
- struct kvm_kernel_irq_routing_entry *new)
-{
- if (old->type != KVM_IRQ_ROUTING_MSI ||
- new->type != KVM_IRQ_ROUTING_MSI)
- return true;
-
- return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
-}
-
bool kvm_vector_hashing_enabled(void)
{
return vector_hashing;
@@ -14065,7 +13906,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 88a9475899c8..bcfd9b719ada 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -55,6 +55,28 @@ struct kvm_host_values {
void kvm_spurious_fault(void);
+#define SIZE_OF_MEMSLOTS_HASHTABLE \
+ (sizeof(((struct kvm_memslots *)0)->id_hash) * 2 * KVM_MAX_NR_ADDRESS_SPACES)
+
+/* Sanity check the size of the memslot hash tables. */
+static_assert(SIZE_OF_MEMSLOTS_HASHTABLE ==
+ (1024 * (1 + IS_ENABLED(CONFIG_X86_64)) * (1 + IS_ENABLED(CONFIG_KVM_SMM))));
+
+/*
+ * Assert that "struct kvm_{svm,vmx,tdx}" is an order-0 or order-1 allocation.
+ * Spilling over to an order-2 allocation isn't fundamentally problematic, but
+ * isn't expected to happen in the foreseeable future (O(years)). Assert that
+ * the size is an order-0 allocation when ignoring the memslot hash tables, to
+ * help detect and debug unexpected size increases.
+ */
+#define KVM_SANITY_CHECK_VM_STRUCT_SIZE(x) \
+do { \
+ BUILD_BUG_ON(get_order(sizeof(struct x) - SIZE_OF_MEMSLOTS_HASHTABLE) && \
+ !IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \
+ BUILD_BUG_ON(get_order(sizeof(struct x)) > 1 && \
+ !IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \
+} while (0)
+
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
({ \
bool failed = (consistency_check); \
@@ -121,6 +143,24 @@ static inline void kvm_leave_nested(struct kvm_vcpu *vcpu)
kvm_x86_ops.nested_ops->leave_nested(vcpu);
}
+/*
+ * If IBRS is advertised to the vCPU, KVM must flush the indirect branch
+ * predictors when transitioning from L2 to L1, as L1 expects hardware (KVM in
+ * this case) to provide separate predictor modes. Bare metal isolates the host
+ * from the guest, but doesn't isolate different guests from one another (in
+ * this case L1 and L2). The exception is if bare metal supports same mode IBRS,
+ * which offers protection within the same mode, and hence protects L1 from L2.
+ */
+static inline void kvm_nested_vmexit_handle_ibrs(struct kvm_vcpu *vcpu)
+{
+ if (cpu_feature_enabled(X86_FEATURE_AMD_IBRS_SAME_MODE))
+ return;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBRS))
+ indirect_branch_prediction_barrier();
+}
+
static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu)
{
return vcpu->arch.last_vmentry_cpu != -1;
@@ -481,24 +521,34 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
__rem; \
})
+static inline void kvm_disable_exits(struct kvm *kvm, u64 mask)
+{
+ kvm->arch.disabled_exits |= mask;
+}
+
static inline bool kvm_mwait_in_guest(struct kvm *kvm)
{
- return kvm->arch.mwait_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_MWAIT;
}
static inline bool kvm_hlt_in_guest(struct kvm *kvm)
{
- return kvm->arch.hlt_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_HLT;
}
static inline bool kvm_pause_in_guest(struct kvm *kvm)
{
- return kvm->arch.pause_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_PAUSE;
}
static inline bool kvm_cstate_in_guest(struct kvm *kvm)
{
- return kvm->arch.cstate_in_guest;
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_CSTATE;
+}
+
+static inline bool kvm_aperfmperf_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_APERFMPERF;
}
static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm)
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 38b33cdd4232..d6b2a665b499 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -1526,7 +1526,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports,
sched_poll.nr_ports * sizeof(*ports), &e)) {
*r = -EFAULT;
- return true;
+ goto out;
}
for (i = 0; i < sched_poll.nr_ports; i++) {
@@ -1571,7 +1571,8 @@ out:
static void cancel_evtchn_poll(struct timer_list *t)
{
- struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer);
+ struct kvm_vcpu *vcpu = timer_container_of(vcpu, t,
+ arch.xen.poll_timer);
kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
kvm_vcpu_kick(vcpu);
@@ -1970,8 +1971,19 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
{
struct kvm_vcpu *vcpu;
- if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
- return -EINVAL;
+ /*
+ * Don't check for the port being within range of max_evtchn_port().
+ * Userspace can configure what ever targets it likes; events just won't
+ * be delivered if/while the target is invalid, just like userspace can
+ * configure MSIs which target non-existent APICs.
+ *
+ * This allow on Live Migration and Live Update, the IRQ routing table
+ * can be restored *independently* of other things like creating vCPUs,
+ * without imposing an ordering dependency on userspace. In this
+ * particular case, the problematic ordering would be with setting the
+ * Xen 'long mode' flag, which changes max_evtchn_port() to allow 4096
+ * instead of 1024 event channels.
+ */
/* We only support 2 level event channels for now */
if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)