Diffstat (limited to 'arch')
83 files changed, 1080 insertions, 844 deletions
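Most of the arm64 hunks that follow tag one-shot setup code with __init and the values that code computes with __ro_after_init. As a reading aid, here is a minimal, self-contained sketch of that pattern; the symbol names are illustrative only and do not come from the patch.

#include <linux/init.h>
#include <linux/cache.h>

/* Written once during boot, then the containing page is made read-only. */
static unsigned int example_max_vl __ro_after_init;

/* Placed in .init.text and discarded once boot completes. */
static int __init example_setup(void)
{
	example_max_vl = 16;	/* stand-in for a probed hardware limit */
	return 0;
}
early_initcall(example_setup);

Marking the setters __init is what makes annotations such as __ro_after_init on kvm_sve_max_vl and kvm_arm_vmid_bits safe: nothing outside the boot path writes these variables afterwards.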
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 35a159d131b5..113e20fdbb56 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -66,8 +66,8 @@ enum kvm_mode kvm_get_mode(void); DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); -extern unsigned int kvm_sve_max_vl; -int kvm_arm_init_sve(void); +extern unsigned int __ro_after_init kvm_sve_max_vl; +int __init kvm_arm_init_sve(void); u32 __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); @@ -877,7 +877,7 @@ int kvm_handle_cp10_id(struct kvm_vcpu *vcpu); void kvm_reset_sys_regs(struct kvm_vcpu *vcpu); -int kvm_sys_reg_table_init(void); +int __init kvm_sys_reg_table_init(void); /* MMIO helpers */ void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); @@ -908,9 +908,9 @@ int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); -extern unsigned int kvm_arm_vmid_bits; -int kvm_arm_vmid_alloc_init(void); -void kvm_arm_vmid_alloc_free(void); +extern unsigned int __ro_after_init kvm_arm_vmid_bits; +int __init kvm_arm_vmid_alloc_init(void); +void __init kvm_arm_vmid_alloc_free(void); void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid); void kvm_arm_vmid_clear_active(void); @@ -943,7 +943,6 @@ static inline bool kvm_system_needs_idmapped_vectors(void) void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu); -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} @@ -994,7 +993,7 @@ static inline void kvm_clr_pmu_events(u32 clr) {} void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu); void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu); -int kvm_set_ipa_limit(void); +int __init kvm_set_ipa_limit(void); #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index e4a7e6369499..7f7c1231679e 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -163,7 +163,7 @@ int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **haddr); int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, void **haddr); -void free_hyp_pgds(void); +void __init free_hyp_pgds(void); void stage2_unmap_vm(struct kvm *kvm); int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type); @@ -175,7 +175,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); -int kvm_mmu_init(u32 *hyp_va_bits); +int __init kvm_mmu_init(u32 *hyp_va_bits); static inline void *__kvm_vector_slot2addr(void *base, enum arm64_hyp_spectre_vector slot) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 05da3c8f7e88..ca6eadeb7d1a 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -21,6 +21,7 @@ if VIRTUALIZATION menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM + select KVM_GENERIC_HARDWARE_ENABLING select MMU_NOTIFIER select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index bb24a76b4224..23346585a294 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -811,10 +811,18 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) ptimer->host_timer_irq_flags = 
host_ptimer_irq_flags; } -static void kvm_timer_init_interrupt(void *info) +void kvm_timer_cpu_up(void) { enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); - enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags); + if (host_ptimer_irq) + enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags); +} + +void kvm_timer_cpu_down(void) +{ + disable_percpu_irq(host_vtimer_irq); + if (host_ptimer_irq) + disable_percpu_irq(host_ptimer_irq); } int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) @@ -976,18 +984,6 @@ void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu, preempt_enable(); } -static int kvm_timer_starting_cpu(unsigned int cpu) -{ - kvm_timer_init_interrupt(NULL); - return 0; -} - -static int kvm_timer_dying_cpu(unsigned int cpu) -{ - disable_percpu_irq(host_vtimer_irq); - return 0; -} - static int timer_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu) { if (vcpu) @@ -1117,7 +1113,7 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info) return 0; } -int kvm_timer_hyp_init(bool has_gic) +int __init kvm_timer_hyp_init(bool has_gic) { struct arch_timer_kvm_info *info; int err; @@ -1185,9 +1181,6 @@ int kvm_timer_hyp_init(bool has_gic) goto out_free_irq; } - cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, - "kvm/arm/timer:starting", kvm_timer_starting_cpu, - kvm_timer_dying_cpu); return 0; out_free_irq: free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9c5573bc4614..698787ed87e9 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -63,16 +63,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -1539,7 +1529,7 @@ static int kvm_init_vector_slots(void) return 0; } -static void cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) +static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); unsigned long tcr; @@ -1682,7 +1672,15 @@ static void _kvm_arch_hardware_enable(void *discard) int kvm_arch_hardware_enable(void) { + int was_enabled = __this_cpu_read(kvm_arm_hardware_enabled); + _kvm_arch_hardware_enable(NULL); + + if (!was_enabled) { + kvm_vgic_cpu_up(); + kvm_timer_cpu_up(); + } + return 0; } @@ -1696,6 +1694,11 @@ static void _kvm_arch_hardware_disable(void *discard) void kvm_arch_hardware_disable(void) { + if (__this_cpu_read(kvm_arm_hardware_enabled)) { + kvm_timer_cpu_down(); + kvm_vgic_cpu_down(); + } + if (!is_protected_kvm_enabled()) _kvm_arch_hardware_disable(NULL); } @@ -1738,26 +1741,26 @@ static struct notifier_block hyp_init_cpu_pm_nb = { .notifier_call = hyp_init_cpu_pm_notifier, }; -static void hyp_cpu_pm_init(void) +static void __init hyp_cpu_pm_init(void) { if (!is_protected_kvm_enabled()) cpu_pm_register_notifier(&hyp_init_cpu_pm_nb); } -static void hyp_cpu_pm_exit(void) +static void __init hyp_cpu_pm_exit(void) { if (!is_protected_kvm_enabled()) cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb); } #else -static inline void hyp_cpu_pm_init(void) +static inline void __init hyp_cpu_pm_init(void) { } -static inline void hyp_cpu_pm_exit(void) +static inline void __init hyp_cpu_pm_exit(void) { } #endif -static void init_cpu_logical_map(void) +static void __init init_cpu_logical_map(void) { 
unsigned int cpu; @@ -1774,7 +1777,7 @@ static void init_cpu_logical_map(void) #define init_psci_0_1_impl_state(config, what) \ config.psci_0_1_ ## what ## _implemented = psci_ops.what -static bool init_psci_relay(void) +static bool __init init_psci_relay(void) { /* * If PSCI has not been initialized, protected KVM cannot install @@ -1797,7 +1800,7 @@ static bool init_psci_relay(void) return true; } -static int init_subsystems(void) +static int __init init_subsystems(void) { int err = 0; @@ -1838,13 +1841,22 @@ static int init_subsystems(void) kvm_register_perf_callbacks(NULL); out: + if (err) + hyp_cpu_pm_exit(); + if (err || !is_protected_kvm_enabled()) on_each_cpu(_kvm_arch_hardware_disable, NULL, 1); return err; } -static void teardown_hyp_mode(void) +static void __init teardown_subsystems(void) +{ + kvm_unregister_perf_callbacks(); + hyp_cpu_pm_exit(); +} + +static void __init teardown_hyp_mode(void) { int cpu; @@ -1855,7 +1867,7 @@ static void teardown_hyp_mode(void) } } -static int do_pkvm_init(u32 hyp_va_bits) +static int __init do_pkvm_init(u32 hyp_va_bits) { void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)); int ret; @@ -1891,7 +1903,7 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; } -static int kvm_hyp_init_protection(u32 hyp_va_bits) +static int __init kvm_hyp_init_protection(u32 hyp_va_bits) { void *addr = phys_to_virt(hyp_mem_base); int ret; @@ -1912,7 +1924,7 @@ static int kvm_hyp_init_protection(u32 hyp_va_bits) /** * Inits Hyp-mode on all online CPUs */ -static int init_hyp_mode(void) +static int __init init_hyp_mode(void) { u32 hyp_va_bits; int cpu; @@ -2094,7 +2106,7 @@ out_err: return err; } -static void _kvm_host_prot_finalize(void *arg) +static void __init _kvm_host_prot_finalize(void *arg) { int *err = arg; @@ -2102,7 +2114,7 @@ static void _kvm_host_prot_finalize(void *arg) WRITE_ONCE(*err, -EINVAL); } -static int pkvm_drop_host_privileges(void) +static int __init pkvm_drop_host_privileges(void) { int ret = 0; @@ -2115,7 +2127,7 @@ static int pkvm_drop_host_privileges(void) return ret; } -static int finalize_hyp_mode(void) +static int __init finalize_hyp_mode(void) { if (!is_protected_kvm_enabled()) return 0; @@ -2190,7 +2202,7 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) /** * Initialize Hyp-mode and memory mappings on all CPUs. */ -int kvm_arch_init(void *opaque) +static __init int kvm_arm_init(void) { int err; bool in_hyp_mode; @@ -2241,7 +2253,7 @@ int kvm_arch_init(void *opaque) err = kvm_init_vector_slots(); if (err) { kvm_err("Cannot initialise vector slots\n"); - goto out_err; + goto out_hyp; } err = init_subsystems(); @@ -2252,7 +2264,7 @@ int kvm_arch_init(void *opaque) err = finalize_hyp_mode(); if (err) { kvm_err("Failed to finalize Hyp protection\n"); - goto out_hyp; + goto out_subs; } } @@ -2264,10 +2276,19 @@ int kvm_arch_init(void *opaque) kvm_info("Hyp mode initialized successfully\n"); } + /* + * FIXME: Do something reasonable if kvm_init() fails after pKVM + * hypervisor protection is finalized. 
+ */ + err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (err) + goto out_subs; + return 0; +out_subs: + teardown_subsystems(); out_hyp: - hyp_cpu_pm_exit(); if (!in_hyp_mode) teardown_hyp_mode(); out_err: @@ -2275,12 +2296,6 @@ out_err: return err; } -/* NOP: Compiling as a module not supported */ -void kvm_arch_exit(void) -{ - kvm_unregister_perf_callbacks(); -} - static int __init early_kvm_mode_cfg(char *arg) { if (!arg) @@ -2319,10 +2334,4 @@ enum kvm_mode kvm_get_mode(void) return kvm_mode; } -static int arm_init(void) -{ - int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); - return rc; -} - -module_init(arm_init); +module_init(kvm_arm_init); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a3ee3b605c9b..01352f5838a0 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -25,11 +25,11 @@ static struct kvm_pgtable *hyp_pgtable; static DEFINE_MUTEX(kvm_hyp_pgd_mutex); -static unsigned long hyp_idmap_start; -static unsigned long hyp_idmap_end; -static phys_addr_t hyp_idmap_vector; +static unsigned long __ro_after_init hyp_idmap_start; +static unsigned long __ro_after_init hyp_idmap_end; +static phys_addr_t __ro_after_init hyp_idmap_vector; -static unsigned long io_map_base; +static unsigned long __ro_after_init io_map_base; static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) { @@ -280,7 +280,7 @@ static void stage2_flush_vm(struct kvm *kvm) /** * free_hyp_pgds - free Hyp-mode page tables */ -void free_hyp_pgds(void) +void __init free_hyp_pgds(void) { mutex_lock(&kvm_hyp_pgd_mutex); if (hyp_pgtable) { @@ -1668,7 +1668,7 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { .virt_to_phys = kvm_host_pa, }; -int kvm_mmu_init(u32 *hyp_va_bits) +int __init kvm_mmu_init(u32 *hyp_va_bits) { int err; u32 idmap_bits; diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index e0267f672b8a..2bc74739a6df 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -30,7 +30,7 @@ #include <asm/virt.h> /* Maximum phys_shift supported for any VM on this host */ -static u32 kvm_ipa_limit; +static u32 __ro_after_init kvm_ipa_limit; /* * ARMv8 Reset Values @@ -41,9 +41,9 @@ static u32 kvm_ipa_limit; #define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \ PSR_AA32_I_BIT | PSR_AA32_F_BIT) -unsigned int kvm_sve_max_vl; +unsigned int __ro_after_init kvm_sve_max_vl; -int kvm_arm_init_sve(void) +int __init kvm_arm_init_sve(void) { if (system_supports_sve()) { kvm_sve_max_vl = sve_max_virtualisable_vl(); @@ -352,7 +352,7 @@ u32 get_kvm_ipa_limit(void) return kvm_ipa_limit; } -int kvm_set_ipa_limit(void) +int __init kvm_set_ipa_limit(void) { unsigned int parange; u64 mmfr0; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c6cbfe6b854b..46d161fe08d3 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -82,7 +82,7 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) } /* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ -static u32 cache_levels; +static u32 __ro_after_init cache_levels; /* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */ #define CSSELR_MAX 14 @@ -2733,7 +2733,7 @@ static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) } /* ->val is filled in by kvm_sys_reg_table_init() */ -static struct sys_reg_desc invariant_sys_regs[] = { +static struct sys_reg_desc invariant_sys_regs[] __ro_after_init = { { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 }, { SYS_DESC(SYS_REVIDR_EL1), NULL, 
get_revidr_el1 }, { SYS_DESC(SYS_CLIDR_EL1), NULL, get_clidr_el1 }, @@ -3057,7 +3057,7 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) return write_demux_regids(uindices); } -int kvm_sys_reg_table_init(void) +int __init kvm_sys_reg_table_init(void) { bool valid = true; unsigned int i; diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index f6d4f4052555..6c7f6ae21ec0 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -465,17 +465,15 @@ out: /* GENERIC PROBE */ -static int vgic_init_cpu_starting(unsigned int cpu) +void kvm_vgic_cpu_up(void) { enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0); - return 0; } -static int vgic_init_cpu_dying(unsigned int cpu) +void kvm_vgic_cpu_down(void) { disable_percpu_irq(kvm_vgic_global_state.maint_irq); - return 0; } static irqreturn_t vgic_maintenance_handler(int irq, void *data) @@ -584,19 +582,6 @@ int kvm_vgic_hyp_init(void) return ret; } - ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING, - "kvm/arm/vgic:starting", - vgic_init_cpu_starting, vgic_init_cpu_dying); - if (ret) { - kvm_err("Cannot register vgic CPU notifier\n"); - goto out_free_irq; - } - kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq); return 0; - -out_free_irq: - free_percpu_irq(kvm_vgic_global_state.maint_irq, - kvm_get_running_vcpus()); - return ret; } diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c index d78ae63d7c15..08978d0672e7 100644 --- a/arch/arm64/kvm/vmid.c +++ b/arch/arm64/kvm/vmid.c @@ -16,7 +16,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_mmu.h> -unsigned int kvm_arm_vmid_bits; +unsigned int __ro_after_init kvm_arm_vmid_bits; static DEFINE_RAW_SPINLOCK(cpu_vmid_lock); static atomic64_t vmid_generation; @@ -172,7 +172,7 @@ void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) /* * Initialize the VMID allocator */ -int kvm_arm_vmid_alloc_init(void) +int __init kvm_arm_vmid_alloc_init(void) { kvm_arm_vmid_bits = kvm_get_vmid_bits(); @@ -190,7 +190,7 @@ int kvm_arm_vmid_alloc_init(void) return 0; } -void kvm_arm_vmid_alloc_free(void) +void __init kvm_arm_vmid_alloc_free(void) { kfree(vmid_map); } diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 5cedb28e8a40..2803c9c21ef9 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -758,7 +758,7 @@ struct kvm_mips_callbacks { void (*vcpu_reenter)(struct kvm_vcpu *vcpu); }; extern struct kvm_mips_callbacks *kvm_mips_callbacks; -int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); +int kvm_mips_emulation_init(void); /* Debug: dump vcpu state */ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); @@ -888,7 +888,6 @@ extern unsigned long kvm_mips_get_ramsize(struct kvm *kvm); extern int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_mips_interrupt *irq); -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 91d197bee9c0..29e51649203b 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -28,6 +28,7 @@ config KVM select MMU_NOTIFIER select SRCU select INTERVAL_TREE + select KVM_GENERIC_HARDWARE_ENABLING help Support for hosting Guest kernels. 
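The arm64, MIPS, RISC-V and x86 Kconfig hunks all select KVM_GENERIC_HARDWARE_ENABLING, after which common KVM code drives the per-CPU enable/disable hooks and the architecture only has to implement them. A rough sketch of the shape such a hook pair can take, assuming a per-CPU flag along the lines of arm64's kvm_arm_hardware_enabled (the names below are placeholders, not from the patch):

#include <linux/percpu.h>
#include <linux/kvm_host.h>

static DEFINE_PER_CPU(bool, hw_enabled);	/* hypothetical per-CPU state */

int kvm_arch_hardware_enable(void)
{
	if (__this_cpu_read(hw_enabled))
		return 0;			/* already enabled on this CPU */

	/* arch-specific "turn virtualization on" work would go here */
	__this_cpu_write(hw_enabled, true);
	return 0;
}

void kvm_arch_hardware_disable(void)
{
	if (!__this_cpu_read(hw_enabled))
		return;

	/* arch-specific "turn virtualization off" work would go here */
	__this_cpu_write(hw_enabled, false);
}

Architectures that do not select the new option drop their kvm_arch_hardware_enable()/kvm_arch_hardware_disable() stubs entirely, as the s390 and PowerPC hunks later in this diff show.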
diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile index 21ff75bcdbc4..805aeea2166e 100644 --- a/arch/mips/kvm/Makefile +++ b/arch/mips/kvm/Makefile @@ -17,4 +17,4 @@ kvm-$(CONFIG_CPU_LOONGSON64) += loongson_ipi.o kvm-y += vz.o obj-$(CONFIG_KVM) += kvm.o -obj-y += callback.o tlb.o +obj-y += tlb.o diff --git a/arch/mips/kvm/callback.c b/arch/mips/kvm/callback.c deleted file mode 100644 index d88aa2173fb0..000000000000 --- a/arch/mips/kvm/callback.c +++ /dev/null @@ -1,14 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. - * Authors: Yann Le Du <ledu@kymasys.com> - */ - -#include <linux/export.h> -#include <linux/kvm_host.h> - -struct kvm_mips_callbacks *kvm_mips_callbacks; -EXPORT_SYMBOL_GPL(kvm_mips_callbacks); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a25e0b73ee70..36c8991b5d39 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -135,16 +135,6 @@ void kvm_arch_hardware_disable(void) kvm_mips_callbacks->hardware_disable(); } -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - extern void kvm_init_loongson_ipi(struct kvm *kvm); int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) @@ -1015,21 +1005,6 @@ long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) return r; } -int kvm_arch_init(void *opaque) -{ - if (kvm_mips_callbacks) { - kvm_err("kvm: module already exists\n"); - return -EEXIST; - } - - return kvm_mips_emulation_init(&kvm_mips_callbacks); -} - -void kvm_arch_exit(void) -{ - kvm_mips_callbacks = NULL; -} - int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -1646,16 +1621,21 @@ static int __init kvm_mips_init(void) if (ret) return ret; - ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); - + ret = kvm_mips_emulation_init(); if (ret) return ret; + if (boot_cpu_type() == CPU_LOONGSON64) kvm_priority_to_irq = kvm_loongson3_priority_to_irq; register_die_notifier(&kvm_mips_csr_die_notifier); + ret = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (ret) { + unregister_die_notifier(&kvm_mips_csr_die_notifier); + return ret; + } return 0; } diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index c706f5890a05..dafab003ea0d 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -3304,7 +3304,10 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = { .vcpu_reenter = kvm_vz_vcpu_reenter, }; -int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks) +/* FIXME: Get rid of the callbacks now that trap-and-emulate is gone. 
*/ +struct kvm_mips_callbacks *kvm_mips_callbacks = &kvm_vz_callbacks; + +int kvm_mips_emulation_init(void) { if (!cpu_has_vz) return -ENODEV; @@ -3318,7 +3321,5 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks) return -ENODEV; pr_info("Starting KVM with MIPS VZ extensions\n"); - - *install_callbacks = &kvm_vz_callbacks; return 0; } diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index caea15dcb91d..959f566a455c 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -876,13 +876,10 @@ struct kvm_vcpu_arch { #define __KVM_HAVE_ARCH_WQP #define __KVM_HAVE_CREATE_DEVICE -static inline void kvm_arch_hardware_disable(void) {} -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} -static inline void kvm_arch_exit(void) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index eae9619b6190..6bef23d6d0e3 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -118,7 +118,6 @@ extern int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, extern int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu); extern void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu); extern int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu); -extern int kvmppc_core_check_processor_compat(void); extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 6d525285dbe8..57f4e7896d67 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -999,16 +999,6 @@ int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_store); -int kvmppc_core_check_processor_compat(void) -{ - /* - * We always return 0 for book3s. 
We check - * for compatibility while loading the HV - * or PR module - */ - return 0; -} - int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall) { return kvm->arch.kvm_ops->hcall_implemented(hcall); @@ -1062,7 +1052,7 @@ static int kvmppc_book3s_init(void) { int r; - r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (r) return r; #ifdef CONFIG_KVM_BOOK3S_32_HANDLER diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index c8b2b4478545..b0f695428733 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -314,7 +314,7 @@ static void kvmppc_core_vcpu_put_e500(struct kvm_vcpu *vcpu) kvmppc_booke_vcpu_put(vcpu); } -int kvmppc_core_check_processor_compat(void) +static int kvmppc_e500_check_processor_compat(void) { int r; @@ -507,7 +507,7 @@ static int __init kvmppc_e500_init(void) unsigned long handler_len; unsigned long max_ivor = 0; - r = kvmppc_core_check_processor_compat(); + r = kvmppc_e500_check_processor_compat(); if (r) goto err_out; @@ -531,7 +531,7 @@ static int __init kvmppc_e500_init(void) flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + ivor[max_ivor] + handler_len); - r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); if (r) goto err_out; kvm_ops_e500.owner = THIS_MODULE; diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 57e0ad6a2ca3..611532a0dedc 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -388,6 +388,10 @@ static int __init kvmppc_e500mc_init(void) { int r; + r = kvmppc_e500mc_check_processor_compat(); + if (r) + return kvmppc_e500mc; + r = kvmppc_booke_init(); if (r) goto err_out; @@ -400,7 +404,7 @@ static int __init kvmppc_e500mc_init(void) */ kvmppc_init_lpid(KVMPPC_NR_LPIDS/threads_per_core); - r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); if (r) goto err_out; kvm_ops_e500mc.owner = THIS_MODULE; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 04494a4fb37a..4c5405fc5538 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -435,21 +435,6 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, } EXPORT_SYMBOL_GPL(kvmppc_ld); -int kvm_arch_hardware_enable(void) -{ - return 0; -} - -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - -int kvm_arch_check_processor_compat(void *opaque) -{ - return kvmppc_core_check_processor_compat(); -} - int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { struct kvmppc_ops *kvm_ops = NULL; @@ -2544,11 +2529,6 @@ void kvmppc_init_lpid(unsigned long nr_lpids_param) } EXPORT_SYMBOL_GPL(kvmppc_init_lpid); -int kvm_arch_init(void *opaque) -{ - return 0; -} - EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ppc_instr); void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry) diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 93f43a3e7886..cf24da85b13c 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -230,7 +230,6 @@ struct kvm_vcpu_arch { bool pause; }; -static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} @@ -297,11 +296,11 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, int 
kvm_riscv_gstage_alloc_pgd(struct kvm *kvm); void kvm_riscv_gstage_free_pgd(struct kvm *kvm); void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu); -void kvm_riscv_gstage_mode_detect(void); -unsigned long kvm_riscv_gstage_mode(void); +void __init kvm_riscv_gstage_mode_detect(void); +unsigned long __init kvm_riscv_gstage_mode(void); int kvm_riscv_gstage_gpa_bits(void); -void kvm_riscv_gstage_vmid_detect(void); +void __init kvm_riscv_gstage_vmid_detect(void); unsigned long kvm_riscv_gstage_vmid_bits(void); int kvm_riscv_gstage_vmid_init(struct kvm *kvm); bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid); diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index f36a737d5f96..d5a658a047a7 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -20,6 +20,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)" depends on RISCV_SBI && MMU + select KVM_GENERIC_HARDWARE_ENABLING select MMU_NOTIFIER select PREEMPT_NOTIFIERS select KVM_MMIO diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 58c5489d3031..e2da56ed9069 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -20,16 +20,6 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - -int kvm_arch_hardware_setup(void *opaque) -{ - return 0; -} - int kvm_arch_hardware_enable(void) { unsigned long hideleg, hedeleg; @@ -70,7 +60,7 @@ void kvm_arch_hardware_disable(void) csr_write(CSR_HIDELEG, 0); } -int kvm_arch_init(void *opaque) +static int __init riscv_kvm_init(void) { const char *str; @@ -115,16 +105,7 @@ int kvm_arch_init(void *opaque) kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits()); - return 0; -} - -void kvm_arch_exit(void) -{ -} - -static int __init riscv_kvm_init(void) -{ - return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + return kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); } module_init(riscv_kvm_init); diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 34b57e0be2ef..66ef19676fe4 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -20,12 +20,12 @@ #include <asm/pgtable.h> #ifdef CONFIG_64BIT -static unsigned long gstage_mode = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT); -static unsigned long gstage_pgd_levels = 3; +static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT); +static unsigned long gstage_pgd_levels __ro_after_init = 3; #define gstage_index_bits 9 #else -static unsigned long gstage_mode = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT); -static unsigned long gstage_pgd_levels = 2; +static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT); +static unsigned long gstage_pgd_levels __ro_after_init = 2; #define gstage_index_bits 10 #endif @@ -758,7 +758,7 @@ void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu) kvm_riscv_local_hfence_gvma_all(); } -void kvm_riscv_gstage_mode_detect(void) +void __init kvm_riscv_gstage_mode_detect(void) { #ifdef CONFIG_64BIT /* Try Sv57x4 G-stage mode */ @@ -782,7 +782,7 @@ skip_sv48x4_test: #endif } -unsigned long kvm_riscv_gstage_mode(void) +unsigned long __init kvm_riscv_gstage_mode(void) { return gstage_mode >> HGATP_MODE_SHIFT; } diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c index 6cd93995fb65..5246da1c9167 100644 --- a/arch/riscv/kvm/vmid.c +++ b/arch/riscv/kvm/vmid.c @@ -17,10 +17,10 @@ static unsigned long vmid_version = 1; static unsigned long vmid_next; -static 
unsigned long vmid_bits; +static unsigned long vmid_bits __ro_after_init; static DEFINE_SPINLOCK(vmid_lock); -void kvm_riscv_gstage_vmid_detect(void) +void __init kvm_riscv_gstage_vmid_detect(void) { unsigned long old; diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d67ce719d16a..2bbc3d54959d 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -1031,7 +1031,6 @@ extern char sie_exit; extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); -static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index ab26aa53ee37..8edb3bee74b6 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -3415,7 +3415,7 @@ void kvm_s390_gib_destroy(void) gib = NULL; } -int kvm_s390_gib_init(u8 nisc) +int __init kvm_s390_gib_init(u8 nisc) { int rc = 0; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e4890e04b210..bd25076aa19b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -256,17 +256,6 @@ debug_info_t *kvm_s390_dbf; debug_info_t *kvm_s390_dbf_uv; /* Section: not file related */ -int kvm_arch_hardware_enable(void) -{ - /* every s390 is virtualization enabled ;-) */ - return 0; -} - -int kvm_arch_check_processor_compat(void *opaque) -{ - return 0; -} - /* forward declarations */ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); @@ -329,25 +318,6 @@ static struct notifier_block kvm_clock_notifier = { .notifier_call = kvm_clock_sync, }; -int kvm_arch_hardware_setup(void *opaque) -{ - gmap_notifier.notifier_call = kvm_gmap_notifier; - gmap_register_pte_notifier(&gmap_notifier); - vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; - gmap_register_pte_notifier(&vsie_gmap_notifier); - atomic_notifier_chain_register(&s390_epoch_delta_notifier, - &kvm_clock_notifier); - return 0; -} - -void kvm_arch_hardware_unsetup(void) -{ - gmap_unregister_pte_notifier(&gmap_notifier); - gmap_unregister_pte_notifier(&vsie_gmap_notifier); - atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, - &kvm_clock_notifier); -} - static void allow_cpu_feat(unsigned long nr) { set_bit_inv(nr, kvm_s390_available_cpu_feat); @@ -385,7 +355,7 @@ static __always_inline void __insn32_query(unsigned int opcode, u8 *query) #define INSN_SORTL 0xb938 #define INSN_DFLTCC 0xb939 -static void kvm_s390_cpu_feat_init(void) +static void __init kvm_s390_cpu_feat_init(void) { int i; @@ -488,7 +458,7 @@ static void kvm_s390_cpu_feat_init(void) */ } -int kvm_arch_init(void *opaque) +static int __init __kvm_s390_init(void) { int rc = -ENOMEM; @@ -498,11 +468,11 @@ int kvm_arch_init(void *opaque) kvm_s390_dbf_uv = debug_register("kvm-uv", 32, 1, 7 * sizeof(long)); if (!kvm_s390_dbf_uv) - goto out; + goto err_kvm_uv; if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view) || debug_register_view(kvm_s390_dbf_uv, &debug_sprintf_view)) - goto out; + goto err_debug_view; kvm_s390_cpu_feat_init(); @@ -510,30 +480,49 @@ int kvm_arch_init(void *opaque) rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); if (rc) { pr_err("A FLIC registration call failed with rc=%d\n", rc); - goto out; + goto err_flic; } if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) 
{ rc = kvm_s390_pci_init(); if (rc) { pr_err("Unable to allocate AIFT for PCI\n"); - goto out; + goto err_pci; } } rc = kvm_s390_gib_init(GAL_ISC); if (rc) - goto out; + goto err_gib; + + gmap_notifier.notifier_call = kvm_gmap_notifier; + gmap_register_pte_notifier(&gmap_notifier); + vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; + gmap_register_pte_notifier(&vsie_gmap_notifier); + atomic_notifier_chain_register(&s390_epoch_delta_notifier, + &kvm_clock_notifier); return 0; -out: - kvm_arch_exit(); +err_gib: + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_exit(); +err_pci: +err_flic: +err_debug_view: + debug_unregister(kvm_s390_dbf_uv); +err_kvm_uv: + debug_unregister(kvm_s390_dbf); return rc; } -void kvm_arch_exit(void) +static void __kvm_s390_exit(void) { + gmap_unregister_pte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&vsie_gmap_notifier); + atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, + &kvm_clock_notifier); + kvm_s390_gib_destroy(); if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) kvm_s390_pci_exit(); @@ -5696,7 +5685,7 @@ static inline unsigned long nonhyp_mask(int i) static int __init kvm_s390_init(void) { - int i; + int i, r; if (!sclp.has_sief2) { pr_info("SIE is not available\n"); @@ -5712,12 +5701,23 @@ static int __init kvm_s390_init(void) kvm_s390_fac_base[i] |= stfle_fac_list[i] & nonhyp_mask(i); - return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + r = __kvm_s390_init(); + if (r) + return r; + + r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (r) { + __kvm_s390_exit(); + return r; + } + return 0; } static void __exit kvm_s390_exit(void) { kvm_exit(); + + __kvm_s390_exit(); } module_init(kvm_s390_init); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index d48588c207d8..0261d42c7d01 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -470,7 +470,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm); void kvm_s390_gisa_destroy(struct kvm *kvm); void kvm_s390_gisa_disable(struct kvm *kvm); void kvm_s390_gisa_enable(struct kvm *kvm); -int kvm_s390_gib_init(u8 nisc); +int __init kvm_s390_gib_init(u8 nisc); void kvm_s390_gib_destroy(void); /* implemented in guestdbg.c */ diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index ec51e810e381..b124d586db55 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -672,7 +672,7 @@ out: return r; } -int kvm_s390_pci_init(void) +int __init kvm_s390_pci_init(void) { zpci_kvm_hook.kvm_register = kvm_s390_pci_register_kvm; zpci_kvm_hook.kvm_unregister = kvm_s390_pci_unregister_kvm; diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h index 486d06ef563f..ff0972dd5e71 100644 --- a/arch/s390/kvm/pci.h +++ b/arch/s390/kvm/pci.h @@ -60,7 +60,7 @@ void kvm_s390_pci_clear_list(struct kvm *kvm); int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args); -int kvm_s390_pci_init(void); +int __init kvm_s390_pci_init(void); void kvm_s390_pci_exit(void); static inline bool kvm_s390_pci_interp_allowed(void) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 08e822bd7aa6..1e2fe398b66d 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -255,6 +255,9 @@ enum hv_isolation_type { /* TSC invariant control */ #define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118 +/* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */ +#define HV_EXPOSE_INVARIANT_TSC BIT_ULL(0) + /* Register name aliases for temporary compatibility */ #define HV_X64_MSR_STIMER0_COUNT 
HV_REGISTER_STIMER0_COUNT #define HV_X64_MSR_STIMER0_CONFIG HV_REGISTER_STIMER0_CONFIG diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index abccd51dcfca..dba2909e5ae2 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -14,6 +14,7 @@ BUILD_BUG_ON(1) * to make a definition optional, but in this case the default will * be __static_call_return0. */ +KVM_X86_OP(check_processor_compatibility) KVM_X86_OP(hardware_enable) KVM_X86_OP(hardware_disable) KVM_X86_OP(hardware_unsetup) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6aaae18f1854..8d0a0a7c34fc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1088,6 +1088,7 @@ struct kvm_hv { u64 hv_reenlightenment_control; u64 hv_tsc_emulation_control; u64 hv_tsc_emulation_status; + u64 hv_invtsc_control; /* How many vCPUs have VP index != vCPU index */ atomic_t num_mismatched_vp_indexes; @@ -1342,21 +1343,12 @@ struct kvm_arch { struct task_struct *nx_huge_page_recovery_thread; #ifdef CONFIG_X86_64 - /* - * Whether the TDP MMU is enabled for this VM. This contains a - * snapshot of the TDP MMU module parameter from when the VM was - * created and remains unchanged for the life of the VM. If this is - * true, TDP MMU handler functions will run for various MMU - * operations. - */ - bool tdp_mmu_enabled; - /* The number of TDP MMU pages across all roots. */ atomic64_t tdp_mmu_pages; /* - * List of kvm_mmu_page structs being used as roots. - * All kvm_mmu_page structs in the list should have + * List of struct kvm_mmu_pages being used as roots. + * All struct kvm_mmu_pages in the list should have * tdp_mmu_page set. * * For reads, this list is protected by: @@ -1520,6 +1512,8 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) struct kvm_x86_ops { const char *name; + int (*check_processor_compatibility)(void); + int (*hardware_enable)(void); void (*hardware_disable)(void); void (*hardware_unsetup)(void); @@ -1731,9 +1725,6 @@ struct kvm_x86_nested_ops { }; struct kvm_x86_init_ops { - int (*cpu_has_kvm_support)(void); - int (*disabled_by_bios)(void); - int (*check_processor_compatibility)(void); int (*hardware_setup)(void); unsigned int (*handle_intel_pt_intr)(void); @@ -1760,6 +1751,9 @@ extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include <asm/kvm-x86-ops.h> +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops); +void kvm_x86_vendor_exit(void); + #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 46668e255421..bfbdc072017d 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -388,7 +388,7 @@ static void __init ms_hyperv_init_platform(void) * setting of this MSR bit should happen before init_intel() * is called. */ - wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1); + wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index fbeaa9ddef59..8e578311ca9d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -49,6 +49,7 @@ config KVM select SRCU select INTERVAL_TREE select HAVE_KVM_PM_NOTIFIER if PM + select KVM_GENERIC_HARDWARE_ENABLING help Support hosting fully virtualized guest machines using hardware virtualization extensions. 
You will need a fairly recent diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 596061c1610e..2a9f1e200dbc 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -8,6 +8,7 @@ * Copyright 2011 Red Hat, Inc. and/or its affiliates. * Copyright IBM Corporation, 2008 */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/export.h> @@ -701,6 +702,10 @@ void kvm_set_cpu_caps(void) if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64)) kvm_cpu_cap_set(X86_FEATURE_GBPAGES); + kvm_cpu_cap_init_kvm_defined(CPUID_8000_0007_EDX, + SF(CONSTANT_TSC) + ); + kvm_cpu_cap_mask(CPUID_8000_0008_EBX, F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | @@ -1148,8 +1153,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx &= ~GENMASK(17, 16); break; case 0x80000007: /* Advanced power management */ - /* invariant TSC is CPUID.80000007H:EDX[8] */ - entry->edx &= (1 << 8); + cpuid_entry_override(entry, CPUID_8000_0007_EDX); + /* mask against host */ entry->edx &= boot_cpu_data.x86_power; entry->eax = entry->ebx = entry->ecx = 0; @@ -1482,6 +1487,9 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && (data & TSX_CTRL_CPUID_CLEAR)) *ebx &= ~(F(RTM) | F(HLE)); + } else if (function == 0x80000007) { + if (kvm_hv_invtsc_suppressed(vcpu)) + *edx &= ~SF(CONSTANT_TSC); } } else { *eax = *ebx = *ecx = *edx = 0; diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index c1390357126a..ee8c4c3496ed 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -4,6 +4,8 @@ * * Copyright 2016 Red Hat, Inc. and/or its affiliates. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kvm_host.h> #include <linux/debugfs.h> #include "lapic.h" diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5cc3efa0e21c..c3443045cd93 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -17,6 +17,7 @@ * * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "kvm_cache_regs.h" diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index e8296942a868..71aff0edc0ed 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -17,6 +17,7 @@ * Ben-Ami Yassour <benami@il.ibm.com> * Andrey Smetanin <asmetanin@virtuozzo.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "x86.h" #include "lapic.h" @@ -999,6 +1000,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: r = true; @@ -1283,6 +1285,9 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_TSC_EMULATION_STATUS: return hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_REENLIGHTENMENT; + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + return hv_vcpu->cpuid_cache.features_eax & + HV_ACCESS_TSC_INVARIANT; case HV_X64_MSR_CRASH_P0 ... 
HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: return hv_vcpu->cpuid_cache.features_edx & @@ -1410,6 +1415,17 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, if (!host) return 1; break; + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + /* Only bit 0 is supported */ + if (data & ~HV_EXPOSE_INVARIANT_TSC) + return 1; + + /* The feature can't be disabled from the guest */ + if (!host && hv->hv_invtsc_control && !data) + return 1; + + hv->hv_invtsc_control = data; + break; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_set_msr(vcpu, msr, data, host); @@ -1585,6 +1601,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_TSC_EMULATION_STATUS: data = hv->hv_tsc_emulation_status; break; + case HV_X64_MSR_TSC_INVARIANT_CONTROL: + data = hv->hv_invtsc_control; + break; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_get_msr(vcpu, msr, pdata, host); @@ -2733,6 +2752,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; ent->eax |= HV_ACCESS_FREQUENCY_MSRS; ent->eax |= HV_ACCESS_REENLIGHTENMENT; + ent->eax |= HV_ACCESS_TSC_INVARIANT; ent->ebx |= HV_POST_MESSAGES; ent->ebx |= HV_SIGNAL_EVENTS; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 9f96414a31c5..f83b8db72b11 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -136,6 +136,33 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu) HV_SYNIC_STIMER_COUNT); } +/* + * With HV_ACCESS_TSC_INVARIANT feature, invariant TSC (CPUID.80000007H:EDX[8]) + * is only observed after HV_X64_MSR_TSC_INVARIANT_CONTROL was written to. + */ +static inline bool kvm_hv_invtsc_suppressed(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + /* + * If Hyper-V's invariant TSC control is not exposed to the guest, + * the invariant TSC CPUID flag is not suppressed, Windows guests were + * observed to be able to handle it correctly. Going forward, VMMs are + * encouraged to enable Hyper-V's invariant TSC control when invariant + * TSC CPUID flag is set to make KVM's behavior match genuine Hyper-V. + */ + if (!hv_vcpu || + !(hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_TSC_INVARIANT)) + return false; + + /* + * If Hyper-V's invariant TSC control is exposed to the guest, KVM is + * responsible for suppressing the invariant TSC CPUID flag if the + * Hyper-V control is not enabled. + */ + return !(to_kvm_hv(vcpu->kvm)->hv_invtsc_control & HV_EXPOSE_INVARIANT_TSC); +} + void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); void kvm_hv_setup_tsc_page(struct kvm *kvm, diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index e0a7a0e7a73c..cd57a517d04a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -30,7 +30,7 @@ * Based on QEMU and Xen. 
*/ -#define pr_fmt(fmt) "pit: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/slab.h> @@ -351,7 +351,7 @@ static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) if (ps->period < min_period) { pr_info_ratelimited( - "kvm: requested %lld ns " + "requested %lld ns " "i8254 timer period limited to %lld ns\n", ps->period, min_period); ps->period = min_period; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index e1bb6218bb96..4756bcb5724f 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -26,6 +26,8 @@ * Yaozu (Eddie) Dong <Eddie.dong@intel.com> * Port from Qemu. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/mm.h> #include <linux/slab.h> #include <linux/bitops.h> @@ -35,7 +37,7 @@ #include "trace.h" #define pr_pic_unimpl(fmt, ...) \ - pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__) + pr_err_ratelimited("pic: " fmt, ## __VA_ARGS__) static void pic_irq_request(struct kvm *kvm, int level); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 765943d7cfa5..042dee556125 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -26,6 +26,7 @@ * Yaozu (Eddie) Dong <eddie.dong@intel.com> * Based on Xen 3.1 code. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/kvm.h> diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index a70952eca905..b2c397dd2bc6 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -7,6 +7,7 @@ * Authors: * Yaozu (Eddie) Dong <Eddie.dong@intel.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/export.h> #include <linux/kvm_host.h> diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 3742d9adacfc..16d076a1b91a 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -8,6 +8,7 @@ * * Copyright 2010 Red Hat, Inc. and/or its affiliates. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/slab.h> @@ -56,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (irq->dest_mode == APIC_DEST_PHYSICAL && irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { - printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); + pr_info("apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } @@ -199,7 +200,7 @@ int kvm_request_irq_source_id(struct kvm *kvm) irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG); if (irq_source_id >= BITS_PER_LONG) { - printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n"); + pr_warn("exhausted allocatable IRQ sources!\n"); irq_source_id = -EFAULT; goto unlock; } @@ -221,7 +222,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) mutex_lock(&kvm->irq_lock); if (irq_source_id < 0 || irq_source_id >= BITS_PER_LONG) { - printk(KERN_ERR "kvm: IRQ source ID out of range!\n"); + pr_err("IRQ source ID out of range!\n"); goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c index ee4f696a0782..482d6639ef88 100644 --- a/arch/x86/kvm/kvm_onhyperv.c +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -2,6 +2,7 @@ /* * KVM L1 hypervisor optimizations on Hyper-V. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <asm/mshyperv.h> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4efdb4a4d72c..cfaf1d8c64ca 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -15,6 +15,7 @@ * * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/kvm.h> @@ -941,8 +942,7 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm) { if (!kvm->arch.disabled_lapic_found) { kvm->arch.disabled_lapic_found = true; - printk(KERN_INFO - "Disabled LAPIC found during irq injection\n"); + pr_info("Disabled LAPIC found during irq injection\n"); } } @@ -1560,7 +1560,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic) if (apic->lapic_timer.period < min_period) { pr_info_ratelimited( - "kvm: vcpu %i: requested %lld ns " + "vcpu %i: requested %lld ns " "lapic timer period limited to %lld ns\n", apic->vcpu->vcpu_id, apic->lapic_timer.period, min_period); @@ -1845,7 +1845,7 @@ static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg) deadline = apic->lapic_timer.period; else if (unlikely(deadline > apic->lapic_timer.period)) { pr_info_ratelimited( - "kvm: vcpu %i: requested lapic timer restore with " + "vcpu %i: requested lapic timer restore with " "starting count register %#x=%u (%lld ns) > initial count (%lld ns). " "Using initial count to start timer.\n", apic->vcpu->vcpu_id, diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 6bdaacb6faa0..168c46fd8dd1 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -230,14 +230,14 @@ static inline bool kvm_shadow_root_allocated(struct kvm *kvm) } #ifdef CONFIG_X86_64 -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } +extern bool tdp_mmu_enabled; #else -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } +#define tdp_mmu_enabled false #endif static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) { - return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm); + return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm); } static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 835426254e76..aeb240b339f5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -14,6 +14,7 @@ * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "irq.h" #include "ioapic.h" @@ -99,6 +100,13 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); */ bool tdp_enabled = false; +bool __ro_after_init tdp_mmu_allowed; + +#ifdef CONFIG_X86_64 +bool __read_mostly tdp_mmu_enabled = true; +module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); +#endif + static int max_huge_page_level __read_mostly; static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; @@ -609,9 +617,14 @@ static bool mmu_spte_age(u64 *sptep) return true; } +static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu) +{ + return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct; +} + static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - if (is_tdp_mmu(vcpu->arch.mmu)) { + if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_begin(); } else { /* @@ -630,7 +643,7 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) static void walk_shadow_page_lockless_end(struct kvm_vcpu 
*vcpu) { - if (is_tdp_mmu(vcpu->arch.mmu)) { + if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_end(); } else { /* @@ -1279,7 +1292,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); @@ -1312,7 +1325,7 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); @@ -1395,7 +1408,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, } } - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) write_protected |= kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level); @@ -1558,7 +1571,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); return flush; @@ -1571,7 +1584,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); return flush; @@ -1646,7 +1659,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); return young; @@ -1659,7 +1672,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (kvm_memslots_have_rmaps(kvm)) young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); return young; @@ -1921,7 +1934,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) return true; /* TDP MMU pages do not use the MMU generation. */ - return !sp->tdp_mmu_page && + return !is_tdp_mmu_page(sp) && unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } @@ -2355,7 +2368,16 @@ static void __link_shadow_page(struct kvm *kvm, mmu_page_add_parent_pte(cache, sp, sptep); - if (sp->unsync_children || sp->unsync) + /* + * The non-direct sub-pagetable must be updated before linking. For + * L1 sp, the pagetable is updated via kvm_sync_page() in + * kvm_mmu_find_shadow_page() without write-protecting the gfn, + * so sp->unsync can be true or false. For higher level non-direct + * sp, the pagetable is updated/synced via mmu_sync_children() in + * FNAME(fetch)(), so sp->unsync_children can only be false. + * WARN_ON_ONCE() if anything happens unexpectedly. + */ + if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync) mark_unsync(sptep); } @@ -3116,11 +3138,11 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ !is_large_pte(spte) && spte_to_child_sp(spte)->nx_huge_page_disallowed) { /* - * A small SPTE exists for this pfn, but FNAME(fetch) - * and __direct_map would like to create a large PTE - * instead: just force them to go down another level, - * patching back for them into pfn the next 9 bits of - * the address. 
+ * A small SPTE exists for this pfn, but FNAME(fetch), + * direct_map(), or kvm_tdp_mmu_map() would like to create a + * large PTE instead: just force them to go down another level, + * patching back for them into pfn the next 9 bits of the + * address. */ u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) - KVM_PAGES_PER_HPAGE(cur_level - 1); @@ -3129,7 +3151,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ } } -static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; @@ -3173,14 +3195,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return ret; } -static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) +static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn) { - send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk); + unsigned long hva = gfn_to_hva_memslot(slot, gfn); + + send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current); } -static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) +static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - if (is_sigpending_pfn(pfn)) { + if (is_sigpending_pfn(fault->pfn)) { kvm_handle_signal_exit(vcpu); return -EINTR; } @@ -3190,43 +3214,43 @@ static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) * into the spte otherwise read access on readonly gfn also can * caused mmio page fault and treat it as mmio access. */ - if (pfn == KVM_PFN_ERR_RO_FAULT) + if (fault->pfn == KVM_PFN_ERR_RO_FAULT) return RET_PF_EMULATE; - if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); + if (fault->pfn == KVM_PFN_ERR_HWPOISON) { + kvm_send_hwpoison_signal(fault->slot, fault->gfn); return RET_PF_RETRY; } return -EFAULT; } -static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, - unsigned int access) +static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, + unsigned int access) { - /* The pfn is invalid, report the error! */ - if (unlikely(is_error_pfn(fault->pfn))) - return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn); + gva_t gva = fault->is_tdp ? 0 : fault->addr; - if (unlikely(!fault->slot)) { - gva_t gva = fault->is_tdp ? 0 : fault->addr; + vcpu_cache_mmio_info(vcpu, gva, fault->gfn, + access & shadow_mmio_access_mask); - vcpu_cache_mmio_info(vcpu, gva, fault->gfn, - access & shadow_mmio_access_mask); - /* - * If MMIO caching is disabled, emulate immediately without - * touching the shadow page tables as attempting to install an - * MMIO SPTE will just be an expensive nop. Do not cache MMIO - * whose gfn is greater than host.MAXPHYADDR, any guest that - * generates such gfns is running nested and is being tricked - * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if - * and only if L1's MAXPHYADDR is inaccurate with respect to - * the hardware's). - */ - if (unlikely(!enable_mmio_caching) || - unlikely(fault->gfn > kvm_mmu_max_gfn())) - return RET_PF_EMULATE; - } + /* + * If MMIO caching is disabled, emulate immediately without + * touching the shadow page tables as attempting to install an + * MMIO SPTE will just be an expensive nop. 
+ */ + if (unlikely(!enable_mmio_caching)) + return RET_PF_EMULATE; + + /* + * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR, + * any guest that generates such gfns is running nested and is being + * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and + * only if L1's MAXPHYADDR is inaccurate with respect to the + * hardware's). + */ + if (unlikely(fault->gfn > kvm_mmu_max_gfn())) + return RET_PF_EMULATE; return RET_PF_CONTINUE; } @@ -3350,7 +3374,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) do { u64 new_spte; - if (is_tdp_mmu(vcpu->arch.mmu)) + if (tdp_mmu_enabled) sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); else sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); @@ -3433,8 +3457,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } if (++retry_count > 4) { - printk_once(KERN_WARNING - "kvm: Fast #PF retrying more than 4 times.\n"); + pr_warn_once("Fast #PF retrying more than 4 times.\n"); break; } @@ -3596,7 +3619,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) if (r < 0) goto out_unlock; - if (is_tdp_mmu_enabled(vcpu->kvm)) { + if (tdp_mmu_enabled) { root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); mmu->root.hpa = root; } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { @@ -4026,7 +4049,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) walk_shadow_page_lockless_begin(vcpu); - if (is_tdp_mmu(vcpu->arch.mmu)) + if (is_tdp_mmu_active(vcpu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); @@ -4174,7 +4197,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); } -static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; bool async; @@ -4235,12 +4258,33 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return RET_PF_CONTINUE; } +static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, + unsigned int access) +{ + int ret; + + fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; + smp_rmb(); + + ret = __kvm_faultin_pfn(vcpu, fault); + if (ret != RET_PF_CONTINUE) + return ret; + + if (unlikely(is_error_pfn(fault->pfn))) + return kvm_handle_error_pfn(vcpu, fault); + + if (unlikely(!fault->slot)) + return kvm_handle_noslot_fault(vcpu, fault, access); + + return RET_PF_CONTINUE; +} + /* * Returns true if the page fault is stale and needs to be retried, i.e. if the * root was invalidated by a memslot update or a relevant mmu_notifier fired. 
*/ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault, int mmu_seq) + struct kvm_page_fault *fault) { struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa); @@ -4260,19 +4304,13 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, return true; return fault->slot && - mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva); + mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva); } static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); - - unsigned long mmu_seq; int r; - fault->gfn = fault->addr >> PAGE_SHIFT; - fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); - if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_EMULATE; @@ -4284,41 +4322,24 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - mmu_seq = vcpu->kvm->mmu_invalidate_seq; - smp_rmb(); - - r = kvm_faultin_pfn(vcpu, fault); - if (r != RET_PF_CONTINUE) - return r; - - r = handle_abnormal_pfn(vcpu, fault, ACC_ALL); + r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; r = RET_PF_RETRY; + write_lock(&vcpu->kvm->mmu_lock); - if (is_tdp_mmu_fault) - read_lock(&vcpu->kvm->mmu_lock); - else - write_lock(&vcpu->kvm->mmu_lock); + if (is_page_fault_stale(vcpu, fault)) + goto out_unlock; - if (is_page_fault_stale(vcpu, fault, mmu_seq)) + r = make_mmu_pages_available(vcpu); + if (r) goto out_unlock; - if (is_tdp_mmu_fault) { - r = kvm_tdp_mmu_map(vcpu, fault); - } else { - r = make_mmu_pages_available(vcpu); - if (r) - goto out_unlock; - r = __direct_map(vcpu, fault); - } + r = direct_map(vcpu, fault); out_unlock: - if (is_tdp_mmu_fault) - read_unlock(&vcpu->kvm->mmu_lock); - else - write_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(fault->pfn); return r; } @@ -4366,6 +4387,42 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); +#ifdef CONFIG_X86_64 +static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + int r; + + if (page_fault_handle_page_track(vcpu, fault)) + return RET_PF_EMULATE; + + r = fast_page_fault(vcpu, fault); + if (r != RET_PF_INVALID) + return r; + + r = mmu_topup_memory_caches(vcpu, false); + if (r) + return r; + + r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); + if (r != RET_PF_CONTINUE) + return r; + + r = RET_PF_RETRY; + read_lock(&vcpu->kvm->mmu_lock); + + if (is_page_fault_stale(vcpu, fault)) + goto out_unlock; + + r = kvm_tdp_mmu_map(vcpu, fault); + +out_unlock: + read_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(fault->pfn); + return r; +} +#endif + int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { /* @@ -4383,13 +4440,18 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) { for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); - gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); + gfn_t base = fault->gfn & ~(page_num - 1); if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) break; } } +#ifdef CONFIG_X86_64 + if (tdp_mmu_enabled) + return kvm_tdp_mmu_page_fault(vcpu, fault); +#endif + return direct_page_fault(vcpu, fault); } @@ -5719,6 +5781,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, tdp_root_level = 
tdp_forced_root_level; max_tdp_level = tdp_max_root_level; +#ifdef CONFIG_X86_64 + tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled; +#endif /* * max_huge_page_level reflects KVM's MMU capabilities irrespective * of kernel support, e.g. KVM may be capable of using 1GB pages when @@ -5966,7 +6031,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) * write and in the same critical section as making the reload request, * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield. */ - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_invalidate_all_roots(kvm); /* @@ -5991,7 +6056,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) * Deferring the zap until the final reference to the root is put would * lead to use-after-free. */ - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_zap_invalidated_roots(kvm); } @@ -6017,9 +6082,11 @@ int kvm_mmu_init_vm(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); - r = kvm_mmu_init_tdp_mmu(kvm); - if (r < 0) - return r; + if (tdp_mmu_enabled) { + r = kvm_mmu_init_tdp_mmu(kvm); + if (r < 0) + return r; + } node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; @@ -6049,7 +6116,8 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_page_track_unregister_notifier(kvm, node); - kvm_mmu_uninit_tdp_mmu(kvm); + if (tdp_mmu_enabled) + kvm_mmu_uninit_tdp_mmu(kvm); mmu_free_vm_memory_caches(kvm); } @@ -6103,7 +6171,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start, gfn_end, true, flush); @@ -6136,7 +6204,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, write_unlock(&kvm->mmu_lock); } - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); read_unlock(&kvm->mmu_lock); @@ -6379,7 +6447,7 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, u64 start, u64 end, int target_level) { - if (!is_tdp_mmu_enabled(kvm)) + if (!tdp_mmu_enabled) return; if (kvm_memslots_have_rmaps(kvm)) @@ -6400,7 +6468,7 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, u64 start = memslot->base_gfn; u64 end = start + memslot->npages; - if (!is_tdp_mmu_enabled(kvm)) + if (!tdp_mmu_enabled) return; if (kvm_memslots_have_rmaps(kvm)) { @@ -6483,7 +6551,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, write_unlock(&kvm->mmu_lock); } - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); read_unlock(&kvm->mmu_lock); @@ -6518,7 +6586,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, write_unlock(&kvm->mmu_lock); } - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); read_unlock(&kvm->mmu_lock); @@ -6553,7 +6621,7 @@ restart: kvm_mmu_commit_zap_page(kvm, &invalid_list); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_zap_all(kvm); write_unlock(&kvm->mmu_lock); @@ -6579,7 +6647,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) * zap all shadow pages. 
*/ if (unlikely(gen == 0)) { - kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); + kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_zap_all_fast(kvm); } } @@ -6718,6 +6786,13 @@ void __init kvm_mmu_x86_module_init(void) if (nx_huge_pages == -1) __set_nx_huge_pages(get_nx_auto_mode()); + /* + * Snapshot userspace's desire to enable the TDP MMU. Whether or not the + * TDP MMU is actually enabled is determined in kvm_configure_mmu() + * when the vendor module is loaded. + */ + tdp_mmu_allowed = tdp_mmu_enabled; + kvm_mmu_spte_module_init(); } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index dbaf6755c5a7..ac00bfbf32f6 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -199,7 +199,7 @@ struct kvm_page_fault { /* * Maximum page size that can be created for this fault; input to - * FNAME(fetch), __direct_map and kvm_tdp_mmu_map. + * FNAME(fetch), direct_map() and kvm_tdp_mmu_map(). */ u8 max_level; @@ -222,6 +222,7 @@ struct kvm_page_fault { struct kvm_memory_slot *slot; /* Outputs of kvm_faultin_pfn. */ + unsigned long mmu_seq; kvm_pfn_t pfn; hva_t hva; bool map_writable; @@ -279,6 +280,11 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, }; int r; + if (vcpu->arch.mmu->root_role.direct) { + fault.gfn = fault.addr >> PAGE_SHIFT; + fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); + } + /* * Async #PF "faults", a.k.a. prefetch faults, are not faults from the * guest perspective and have already been counted at the time of the diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 2e09d1b6249f..0a2ac438d647 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -10,6 +10,7 @@ * Author: * Xiao Guangrong <guangrong.xiao@linux.intel.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/rculist.h> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 0f6455072055..e5662dbd519c 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -791,7 +791,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault { struct guest_walker walker; int r; - unsigned long mmu_seq; bool is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); @@ -838,14 +837,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault else fault->max_level = walker.level; - mmu_seq = vcpu->kvm->mmu_invalidate_seq; - smp_rmb(); - - r = kvm_faultin_pfn(vcpu, fault); - if (r != RET_PF_CONTINUE) - return r; - - r = handle_abnormal_pfn(vcpu, fault, walker.pte_access); + r = kvm_faultin_pfn(vcpu, fault, walker.pte_access); if (r != RET_PF_CONTINUE) return r; @@ -871,7 +863,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (is_page_fault_stale(vcpu, fault, mmu_seq)) + if (is_page_fault_stale(vcpu, fault)) goto out_unlock; r = make_mmu_pages_available(vcpu); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index c0fd7e049b4e..fce6f047399f 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -7,7 +7,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright 2020 Red Hat, Inc. and/or its affiliates. 
*/ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "mmu.h" @@ -352,7 +352,7 @@ u64 mark_spte_for_access_track(u64 spte) WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT), - "kvm: Access Tracking saved bit locations are not zero\n"); + "Access Tracking saved bit locations are not zero\n"); spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 6f54dc9409c9..0d8deefee66c 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -435,11 +435,11 @@ static inline void check_spte_writable_invariants(u64 spte) { if (spte & shadow_mmu_writable_mask) WARN_ONCE(!(spte & shadow_host_writable_mask), - "kvm: MMU-writable SPTE is not Host-writable: %llx", + KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx", spte); else WARN_ONCE(is_writable_pte(spte), - "kvm: Writable SPTE is not MMU-writable: %llx", spte); + KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte); } static inline bool is_mmu_writable_spte(u64 spte) diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index 39b48e7d7d1a..e26e744df1d1 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "mmu_internal.h" #include "tdp_iter.h" diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d6df38d371a0..bba33aea0fb0 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "mmu.h" #include "mmu_internal.h" @@ -10,23 +11,15 @@ #include <asm/cmpxchg.h> #include <trace/events/kvm.h> -static bool __read_mostly tdp_mmu_enabled = true; -module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); - /* Initializes the TDP MMU for the VM, if enabled. */ int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { struct workqueue_struct *wq; - if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) - return 0; - wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); if (!wq) return -ENOMEM; - /* This should not be changed for the lifetime of the VM. */ - kvm->arch.tdp_mmu_enabled = true; INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); kvm->arch.tdp_mmu_zap_wq = wq; @@ -47,9 +40,6 @@ static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) { - if (!kvm->arch.tdp_mmu_enabled) - return; - /* Also waits for any queued work items. */ destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); @@ -144,7 +134,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) return; - WARN_ON(!root->tdp_mmu_page); + WARN_ON(!is_tdp_mmu_page(root)); /* * The root now has refcount=0. 
It is valid, but readers already diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index d3714200b932..0a63b1afabd3 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -7,6 +7,9 @@ #include "spte.h" +int kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); + hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) @@ -68,31 +71,9 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, u64 *spte); #ifdef CONFIG_X86_64 -int kvm_mmu_init_tdp_mmu(struct kvm *kvm); -void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } - -static inline bool is_tdp_mmu(struct kvm_mmu *mmu) -{ - struct kvm_mmu_page *sp; - hpa_t hpa = mmu->root.hpa; - - if (WARN_ON(!VALID_PAGE(hpa))) - return false; - - /* - * A NULL shadow page is legal when shadowing a non-paging guest with - * PAE paging, as the MMU will be direct with root_hpa pointing at the - * pae_root page, not a shadow page. - */ - sp = to_shadow_page(hpa); - return sp && is_tdp_mmu_page(sp) && sp->root_count; -} #else -static inline int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return 0; } -static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } -static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } #endif #endif /* __KVM_X86_MMU_TDP_MMU_H */ diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index a8502e02f479..9fac1ec03463 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -13,6 +13,7 @@ * Paolo Bonzini <pbonzini@redhat.com> * Xiao Guangrong <guangrong.xiao@linux.intel.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <asm/mtrr.h> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index eb594620dd75..d939d3b84e6f 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -9,6 +9,7 @@ * Gleb Natapov <gleb@redhat.com> * Wei Huang <wei@redhat.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kvm_host.h> diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 042d0aca3c92..4945456fd646 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -14,6 +14,7 @@ enum kvm_only_cpuid_leafs { CPUID_12_EAX = NCAPINTS, CPUID_7_1_EDX, + CPUID_8000_0007_EDX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -43,6 +44,9 @@ enum kvm_only_cpuid_leafs { #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) +/* CPUID level 0x80000007 (EDX). 
*/ +#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) + struct cpuid_reg { u32 function; u32 index; @@ -68,6 +72,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX}, [CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX}, [CPUID_7_1_EDX] = { 7, 1, CPUID_EDX}, + [CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX}, }; /* @@ -100,6 +105,8 @@ static __always_inline u32 __feature_translate(int x86_feature) return KVM_X86_FEATURE_SGX2; else if (x86_feature == X86_FEATURE_SGX_EDECCSSA) return KVM_X86_FEATURE_SGX_EDECCSSA; + else if (x86_feature == X86_FEATURE_CONSTANT_TSC) + return KVM_X86_FEATURE_CONSTANT_TSC; return x86_feature; } diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index a9c1c2af8d94..cc43638d48a3 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "x86.h" diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 6919dee69f18..f52f5e0dd465 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -12,7 +12,7 @@ * Avi Kivity <avi@qumranet.com> */ -#define pr_fmt(fmt) "SVM: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_types.h> #include <linux/hashtable.h> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index add65dd59756..500da957e590 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -12,7 +12,7 @@ * Avi Kivity <avi@qumranet.com> */ -#define pr_fmt(fmt) "SVM: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_types.h> #include <linux/kvm_host.h> diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 0e313fbae055..1ff068f23841 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -9,6 +9,8 @@ * * Implementation is based on pmu_intel.c file */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 86d6897f4806..273cba809328 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -6,6 +6,7 @@ * * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_types.h> #include <linux/kvm_host.h> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9a194aa1a75a..799b24801d31 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1,4 +1,4 @@ -#define pr_fmt(fmt) "SVM: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> @@ -519,21 +519,37 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu) vcpu->arch.osvw.status |= 1; } -static int has_svm(void) +static bool kvm_is_svm_supported(void) { + int cpu = raw_smp_processor_id(); const char *msg; + u64 vm_cr; if (!cpu_has_svm(&msg)) { - printk(KERN_INFO "has_svm: %s\n", msg); - return 0; + pr_err("SVM not supported by CPU %d, %s\n", cpu, msg); + return false; } if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { pr_info("KVM is unsupported when running as an SEV guest\n"); - return 0; + return false; } - return 1; + rdmsrl(MSR_VM_CR, vm_cr); + if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) { + pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu); + return false; + } + + return true; +} + +static int svm_check_processor_compat(void) +{ + if (!kvm_is_svm_supported()) + return -EIO; + + return 0; } void __svm_write_tsc_multiplier(u64 multiplier) @@ -572,10 +588,6 @@ static int svm_hardware_enable(void) if (efer & EFER_SVME) return -EBUSY; - if (!has_svm()) { - pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); - return -EINVAL; - } sd = per_cpu_ptr(&svm_data, me); sd->asid_generation = 1; sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; @@ -2076,7 +2088,7 @@ static void svm_handle_mce(struct kvm_vcpu *vcpu) * Erratum 383 triggered. Guest state is corrupt so kill the * guest. */ - pr_err("KVM: Guest triggered AMD Erratum 383\n"); + pr_err("Guest triggered AMD Erratum 383\n"); kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); @@ -4076,17 +4088,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, vmcb_mark_dirty(svm->vmcb, VMCB_CR); } -static int is_disabled(void) -{ - u64 vm_cr; - - rdmsrl(MSR_VM_CR, vm_cr); - if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) - return 1; - - return 0; -} - static void svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) { @@ -4098,11 +4099,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xd9; } -static int __init svm_check_processor_compat(void) -{ - return 0; -} - /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. 
@@ -4629,7 +4625,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, smap = cr4 & X86_CR4_SMAP; is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { - pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n"); + pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n"); /* * If the fault occurred in userspace, arbitrarily inject #GP @@ -4701,7 +4697,9 @@ static int svm_vm_init(struct kvm *kvm) } static struct kvm_x86_ops svm_x86_ops __initdata = { - .name = "kvm_amd", + .name = KBUILD_MODNAME, + + .check_processor_compatibility = svm_check_processor_compat, .hardware_unsetup = svm_hardware_unsetup, .hardware_enable = svm_hardware_enable, @@ -4978,7 +4976,7 @@ static __init int svm_hardware_setup(void) } if (nested) { - printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); + pr_info("Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); } @@ -4996,7 +4994,7 @@ static __init int svm_hardware_setup(void) /* Force VM NPT level equal to the host's paging level */ kvm_configure_mmu(npt_enabled, get_npt_level(), get_npt_level(), PG_LEVEL_1G); - pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); + pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); /* Setup shadow_me_value and shadow_me_mask */ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); @@ -5086,10 +5084,7 @@ err: static struct kvm_x86_init_ops svm_init_ops __initdata = { - .cpu_has_kvm_support = has_svm, - .disabled_by_bios = is_disabled, .hardware_setup = svm_hardware_setup, - .check_processor_compatibility = svm_check_processor_compat, .runtime_ops = &svm_x86_ops, .pmu_ops = &amd_pmu_ops, @@ -5097,15 +5092,37 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { static int __init svm_init(void) { + int r; + __unused_size_checks(); - return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + if (!kvm_is_svm_supported()) + return -EOPNOTSUPP; + + r = kvm_x86_vendor_init(&svm_init_ops); + if (r) + return r; + + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! + */ + r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm), + THIS_MODULE); + if (r) + goto err_kvm_init; + + return 0; + +err_kvm_init: + kvm_x86_vendor_exit(); + return r; } static void __exit svm_exit(void) { kvm_exit(); + kvm_x86_vendor_exit(); } module_init(svm_init) diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 26a89d0da93e..7af8422d3382 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -2,6 +2,7 @@ /* * KVM L1 hypervisor optimizations on Hyper-V for SVM. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 45faf84476ce..6981c1e9a809 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -34,7 +34,7 @@ static inline void svm_hv_hardware_setup(void) { if (npt_enabled && ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) { - pr_info("kvm: Hyper-V enlightened NPT TLB flush enabled\n"); + pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n"); svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb; svm_x86_ops.tlb_remote_flush_with_range = hv_remote_flush_tlb_with_range; @@ -43,7 +43,7 @@ static inline void svm_hv_hardware_setup(void) if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) { int cpu; - pr_info("kvm: Hyper-V Direct TLB Flush enabled\n"); + pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n"); for_each_online_cpu(cpu) { struct hv_vp_assist_page *vp_ap = hv_get_vp_assist_page(cpu); diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index cd2ac9536c99..45162c1bcd8f 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -66,13 +66,13 @@ struct vmcs_config { u64 misc; struct nested_vmx_msrs nested; }; -extern struct vmcs_config vmcs_config; +extern struct vmcs_config vmcs_config __ro_after_init; struct vmx_capability { u32 ept; u32 vpid; }; -extern struct vmx_capability vmx_capability; +extern struct vmx_capability vmx_capability __ro_after_init; static inline bool cpu_has_vmx_basic_inout(void) { diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index ae03d1fe0355..22daca752797 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/smp.h> @@ -361,35 +362,43 @@ enum evmcs_revision { enum evmcs_ctrl_type { EVMCS_EXIT_CTRLS, EVMCS_ENTRY_CTRLS, + EVMCS_EXEC_CTRL, EVMCS_2NDEXEC, + EVMCS_3RDEXEC, EVMCS_PINCTRL, EVMCS_VMFUNC, NR_EVMCS_CTRLS, }; -static const u32 evmcs_unsupported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = { +static const u32 evmcs_supported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = { [EVMCS_EXIT_CTRLS] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMEXIT_CTRL, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMEXIT_CTRL, }, [EVMCS_ENTRY_CTRLS] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMENTRY_CTRL, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMENTRY_CTRL, + }, + [EVMCS_EXEC_CTRL] = { + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_EXEC_CTRL, }, [EVMCS_2NDEXEC] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_2NDEXEC, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_2NDEXEC & ~SECONDARY_EXEC_TSC_SCALING, + }, + [EVMCS_3RDEXEC] = { + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_3RDEXEC, }, [EVMCS_PINCTRL] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_PINCTRL, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_PINCTRL, }, [EVMCS_VMFUNC] = { - [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMFUNC, + [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMFUNC, }, }; -static u32 evmcs_get_unsupported_ctls(enum evmcs_ctrl_type ctrl_type) +static u32 evmcs_get_supported_ctls(enum evmcs_ctrl_type ctrl_type) { enum evmcs_revision evmcs_rev = EVMCSv1_LEGACY; - return evmcs_unsupported_ctrls[ctrl_type][evmcs_rev]; + return evmcs_supported_ctrls[ctrl_type][evmcs_rev]; } static bool evmcs_has_perf_global_ctrl(struct kvm_vcpu *vcpu) @@ -413,7 +422,7 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 
* { u32 ctl_low = (u32)*pdata; u32 ctl_high = (u32)(*pdata >> 32); - u32 unsupported_ctrls; + u32 supported_ctrls; /* * Hyper-V 2016 and 2019 try using these features even when eVMCS @@ -422,27 +431,31 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 * switch (msr_index) { case MSR_IA32_VMX_EXIT_CTLS: case MSR_IA32_VMX_TRUE_EXIT_CTLS: - unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_EXIT_CTRLS); + supported_ctrls = evmcs_get_supported_ctls(EVMCS_EXIT_CTRLS); if (!evmcs_has_perf_global_ctrl(vcpu)) - unsupported_ctrls |= VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - ctl_high &= ~unsupported_ctrls; + supported_ctrls &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= supported_ctrls; break; case MSR_IA32_VMX_ENTRY_CTLS: case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_ENTRY_CTRLS); + supported_ctrls = evmcs_get_supported_ctls(EVMCS_ENTRY_CTRLS); if (!evmcs_has_perf_global_ctrl(vcpu)) - unsupported_ctrls |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - ctl_high &= ~unsupported_ctrls; + supported_ctrls &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= supported_ctrls; + break; + case MSR_IA32_VMX_PROCBASED_CTLS: + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + ctl_high &= evmcs_get_supported_ctls(EVMCS_EXEC_CTRL); break; case MSR_IA32_VMX_PROCBASED_CTLS2: - ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_2NDEXEC); + ctl_high &= evmcs_get_supported_ctls(EVMCS_2NDEXEC); break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: - ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_PINCTRL); + ctl_high &= evmcs_get_supported_ctls(EVMCS_PINCTRL); break; case MSR_IA32_VMX_VMFUNC: - ctl_low &= ~evmcs_get_unsupported_ctls(EVMCS_VMFUNC); + ctl_low &= evmcs_get_supported_ctls(EVMCS_VMFUNC); break; } @@ -452,7 +465,7 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 * static bool nested_evmcs_is_valid_controls(enum evmcs_ctrl_type ctrl_type, u32 val) { - return !(val & evmcs_get_unsupported_ctls(ctrl_type)); + return !(val & ~evmcs_get_supported_ctls(ctrl_type)); } int nested_evmcs_check_controls(struct vmcs12 *vmcs12) @@ -461,6 +474,10 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12) vmcs12->pin_based_vm_exec_control))) return -EINVAL; + if (CC(!nested_evmcs_is_valid_controls(EVMCS_EXEC_CTRL, + vmcs12->cpu_based_vm_exec_control))) + return -EINVAL; + if (CC(!nested_evmcs_is_valid_controls(EVMCS_2NDEXEC, vmcs12->secondary_vm_exec_control))) return -EINVAL; @@ -488,6 +505,38 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12) return 0; } +#if IS_ENABLED(CONFIG_HYPERV) +/* + * KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption + * is: in case a feature has corresponding fields in eVMCS described and it was + * exposed in VMX feature MSRs, KVM is free to use it. Warn if KVM meets a + * feature which has no corresponding eVMCS field, this likely means that KVM + * needs to be updated. 
+ */ +#define evmcs_check_vmcs_conf(field, ctrl) \ + do { \ + typeof(vmcs_conf->field) unsupported; \ + \ + unsupported = vmcs_conf->field & ~EVMCS1_SUPPORTED_ ## ctrl; \ + if (unsupported) { \ + pr_warn_once(#field " unsupported with eVMCS: 0x%llx\n",\ + (u64)unsupported); \ + vmcs_conf->field &= EVMCS1_SUPPORTED_ ## ctrl; \ + } \ + } \ + while (0) + +void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) +{ + evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL); + evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL); + evmcs_check_vmcs_conf(cpu_based_2nd_exec_ctrl, 2NDEXEC); + evmcs_check_vmcs_conf(cpu_based_3rd_exec_ctrl, 3RDEXEC); + evmcs_check_vmcs_conf(vmentry_ctrl, VMENTRY_CTRL); + evmcs_check_vmcs_conf(vmexit_ctrl, VMEXIT_CTRL); +} +#endif + int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index 571e7929d14e..ab08a9b9ab7d 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -48,22 +48,84 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); * Currently unsupported in KVM: * GUEST_IA32_RTIT_CTL = 0x00002814, */ -#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ - PIN_BASED_VMX_PREEMPTION_TIMER) -#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) -#define EVMCS1_UNSUPPORTED_2NDEXEC \ - (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ - SECONDARY_EXEC_APIC_REGISTER_VIRT | \ - SECONDARY_EXEC_ENABLE_PML | \ - SECONDARY_EXEC_ENABLE_VMFUNC | \ - SECONDARY_EXEC_SHADOW_VMCS | \ +#define EVMCS1_SUPPORTED_PINCTRL \ + (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + PIN_BASED_EXT_INTR_MASK | \ + PIN_BASED_NMI_EXITING | \ + PIN_BASED_VIRTUAL_NMIS) + +#define EVMCS1_SUPPORTED_EXEC_CTRL \ + (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \ + CPU_BASED_HLT_EXITING | \ + CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING | \ + CPU_BASED_UNCOND_IO_EXITING | \ + CPU_BASED_MOV_DR_EXITING | \ + CPU_BASED_USE_TSC_OFFSETTING | \ + CPU_BASED_MWAIT_EXITING | \ + CPU_BASED_MONITOR_EXITING | \ + CPU_BASED_INVLPG_EXITING | \ + CPU_BASED_RDPMC_EXITING | \ + CPU_BASED_INTR_WINDOW_EXITING | \ + CPU_BASED_CR8_LOAD_EXITING | \ + CPU_BASED_CR8_STORE_EXITING | \ + CPU_BASED_RDTSC_EXITING | \ + CPU_BASED_TPR_SHADOW | \ + CPU_BASED_USE_IO_BITMAPS | \ + CPU_BASED_MONITOR_TRAP_FLAG | \ + CPU_BASED_USE_MSR_BITMAPS | \ + CPU_BASED_NMI_WINDOW_EXITING | \ + CPU_BASED_PAUSE_EXITING | \ + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) + +#define EVMCS1_SUPPORTED_2NDEXEC \ + (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ + SECONDARY_EXEC_WBINVD_EXITING | \ + SECONDARY_EXEC_ENABLE_VPID | \ + SECONDARY_EXEC_ENABLE_EPT | \ + SECONDARY_EXEC_UNRESTRICTED_GUEST | \ + SECONDARY_EXEC_DESC | \ + SECONDARY_EXEC_ENABLE_RDTSCP | \ + SECONDARY_EXEC_ENABLE_INVPCID | \ + SECONDARY_EXEC_XSAVES | \ + SECONDARY_EXEC_RDSEED_EXITING | \ + SECONDARY_EXEC_RDRAND_EXITING | \ SECONDARY_EXEC_TSC_SCALING | \ - SECONDARY_EXEC_PAUSE_LOOP_EXITING) -#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \ - (VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) -#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (0) -#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ + SECONDARY_EXEC_PT_USE_GPA | \ + SECONDARY_EXEC_PT_CONCEAL_VMX | \ + SECONDARY_EXEC_BUS_LOCK_DETECTION | \ + SECONDARY_EXEC_NOTIFY_VM_EXITING | \ + SECONDARY_EXEC_ENCLS_EXITING) + +#define EVMCS1_SUPPORTED_3RDEXEC (0ULL) + +#define EVMCS1_SUPPORTED_VMEXIT_CTRL \ + (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \ + 
VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_ACK_INTR_ON_EXIT | \ + VM_EXIT_HOST_ADDR_SPACE_SIZE | \ + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_EXIT_SAVE_IA32_PAT | \ + VM_EXIT_LOAD_IA32_PAT | \ + VM_EXIT_SAVE_IA32_EFER | \ + VM_EXIT_LOAD_IA32_EFER | \ + VM_EXIT_CLEAR_BNDCFGS | \ + VM_EXIT_PT_CONCEAL_PIP | \ + VM_EXIT_CLEAR_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMENTRY_CTRL \ + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \ + VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_IA32E_MODE | \ + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_ENTRY_LOAD_IA32_PAT | \ + VM_ENTRY_LOAD_IA32_EFER | \ + VM_ENTRY_LOAD_BNDCFGS | \ + VM_ENTRY_PT_CONCEAL_PIP | \ + VM_ENTRY_LOAD_IA32_RTIT_CTL) + +#define EVMCS1_SUPPORTED_VMFUNC (0) struct evmcs_field { u16 offset; @@ -117,9 +179,7 @@ static __always_inline int get_evmcs_offset(unsigned long field, { int offset = evmcs_field_offset(field, clean_field); - WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n", - field); - + WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field); return offset; } @@ -211,6 +271,7 @@ static inline void evmcs_load(u64 phys_addr) vp_ap->enlighten_vmentry = 1; } +void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ static __always_inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d93c715cda6a..557b9c468734 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/objtool.h> #include <linux/percpu.h> @@ -203,7 +204,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: not to reset guest simply here. */ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); + pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator); } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index e5cec07ca8d9..efce9ad70e4e 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -8,6 +8,8 @@ * Avi Kivity <avi@redhat.com> * Gleb Natapov <gleb@redhat.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> @@ -762,8 +764,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) return; warn: - pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n", - vcpu->vcpu_id); + pr_warn_ratelimited("vcpu-%d: fail to passthrough LBR.\n", vcpu->vcpu_id); } static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 1b56c5e5c9fb..94c38bea60e7 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kvm_host.h> #include <asm/irq_remapping.h> diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index b12da2a6dec9..aa53c98034bf 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2021 Intel Corporation. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <asm/sgx.h> @@ -164,7 +165,7 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, if (!vcpu->kvm->arch.sgx_provisioning_allowed && (attributes & SGX_ATTR_PROVISIONKEY)) { if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY) - pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n"); + pr_warn_once("SGX PROVISIONKEY advertised but not allowed\n"); kvm_inject_gp(vcpu, 0); return 1; } @@ -381,7 +382,7 @@ int handle_encls(struct kvm_vcpu *vcpu) return handle_encls_ecreate(vcpu); if (leaf == EINIT) return handle_encls_einit(vcpu); - WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf); + WARN_ONCE(1, "unexpected exit on ENCLS[%u]", leaf); vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS; return 0; diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 2251b60920f8..106a72c923ca 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "vmcs12.h" diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fc9008dbed33..73005d7e4e43 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -12,6 +12,7 @@ * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/highmem.h> #include <linux/hrtimer.h> @@ -444,36 +445,36 @@ void vmread_error(unsigned long field, bool fault) if (fault) kvm_spurious_fault(); else - vmx_insn_failed("kvm: vmread failed: field=%lx\n", field); + vmx_insn_failed("vmread failed: field=%lx\n", field); } noinline void vmwrite_error(unsigned long field, unsigned long value) { - vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n", + vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n", field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n", + vmx_insn_failed("vmclear failed: %p/%llx err=%u\n", vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n", + vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n", vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) { - vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", + vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", ext, vpid, gva); } noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) { - vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", + vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", ext, eptp, gpa); } @@ -488,8 +489,8 @@ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); -struct vmcs_config vmcs_config; -struct vmx_capability vmx_capability; +struct vmcs_config vmcs_config __ro_after_init; +struct vmx_capability vmx_capability __ro_after_init; #define VMX_SEGMENT_FIELD(seg) \ [VCPU_SREG_##seg] = { \ @@ -523,6 +524,8 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) static unsigned long host_idt_base; #if IS_ENABLED(CONFIG_HYPERV) +static struct kvm_x86_ops vmx_x86_ops __initdata; + static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, 
bool, 0444); @@ -551,6 +554,71 @@ static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) return 0; } +static __init void hv_init_evmcs(void) +{ + int cpu; + + if (!enlightened_vmcs) + return; + + /* + * Enlightened VMCS usage should be recommended and the host needs + * to support eVMCS v1 or above. + */ + if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && + (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= + KVM_EVMCS_VERSION) { + + /* Check that we have assist pages on all online CPUs */ + for_each_online_cpu(cpu) { + if (!hv_get_vp_assist_page(cpu)) { + enlightened_vmcs = false; + break; + } + } + + if (enlightened_vmcs) { + pr_info("Using Hyper-V Enlightened VMCS\n"); + static_branch_enable(&enable_evmcs); + } + + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) + vmx_x86_ops.enable_l2_tlb_flush + = hv_enable_l2_tlb_flush; + + } else { + enlightened_vmcs = false; + } +} + +static void hv_reset_evmcs(void) +{ + struct hv_vp_assist_page *vp_ap; + + if (!static_branch_unlikely(&enable_evmcs)) + return; + + /* + * KVM should enable eVMCS if and only if all CPUs have a VP assist + * page, and should reject CPU onlining if eVMCS is enabled the CPU + * doesn't have a VP assist page allocated. + */ + vp_ap = hv_get_vp_assist_page(smp_processor_id()); + if (WARN_ON_ONCE(!vp_ap)) + return; + + /* + * Reset everything to support using non-enlightened VMCS access later + * (e.g. when we reload the module with enlightened_vmcs=0) + */ + vp_ap->nested_control.features.directhypercall = 0; + vp_ap->current_nested_vmcs = 0; + vp_ap->enlighten_vmentry = 0; +} + +#else /* IS_ENABLED(CONFIG_HYPERV) */ +static void hv_init_evmcs(void) {} +static void hv_reset_evmcs(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ /* @@ -1613,8 +1681,8 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) if (!instr_len) goto rip_updated; - WARN(exit_reason.enclave_mode, - "KVM: skipping instruction after SGX enclave VM-Exit"); + WARN_ONCE(exit_reason.enclave_mode, + "skipping instruction after SGX enclave VM-Exit"); orig_rip = kvm_rip_read(vcpu); rip = orig_rip + instr_len; @@ -2448,88 +2516,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } -static __init int cpu_has_kvm_support(void) -{ - return cpu_has_vmx(); -} - -static __init int vmx_disabled_by_bios(void) -{ - return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || - !boot_cpu_has(X86_FEATURE_VMX); -} - -static int kvm_cpu_vmxon(u64 vmxon_pointer) -{ - u64 msr; - - cr4_set_bits(X86_CR4_VMXE); - - asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" - _ASM_EXTABLE(1b, %l[fault]) - : : [vmxon_pointer] "m"(vmxon_pointer) - : : fault); - return 0; - -fault: - WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", - rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); - cr4_clear_bits(X86_CR4_VMXE); - - return -EFAULT; -} - -static int vmx_hardware_enable(void) -{ - int cpu = raw_smp_processor_id(); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - int r; - - if (cr4_read_shadow() & X86_CR4_VMXE) - return -EBUSY; - - /* - * This can happen if we hot-added a CPU but failed to allocate - * VP assist page for it. 
- */ - if (static_branch_unlikely(&enable_evmcs) && - !hv_get_vp_assist_page(cpu)) - return -EFAULT; - - intel_pt_handle_vmx(1); - - r = kvm_cpu_vmxon(phys_addr); - if (r) { - intel_pt_handle_vmx(0); - return r; - } - - if (enable_ept) - ept_sync_global(); - - return 0; -} - -static void vmclear_local_loaded_vmcss(void) -{ - int cpu = raw_smp_processor_id(); - struct loaded_vmcs *v, *n; - - list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) - __loaded_vmcs_clear(v); -} - -static void vmx_hardware_disable(void) -{ - vmclear_local_loaded_vmcss(); - - if (cpu_vmxoff()) - kvm_spurious_fault(); - - intel_pt_handle_vmx(0); -} - /* * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID * directly instead of going through cpu_has(), to ensure KVM is trapping @@ -2565,8 +2551,7 @@ static bool cpu_has_perf_global_ctrl_bug(void) return false; } -static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, - u32 msr, u32 *result) +static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { u32 vmx_msr_low, vmx_msr_high; u32 ctl = ctl_min | ctl_opt; @@ -2584,7 +2569,7 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } -static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) { u64 allowed; @@ -2593,8 +2578,8 @@ static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) return ctl_opt & allowed; } -static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, - struct vmx_capability *vmx_cap) +static int setup_vmcs_config(struct vmcs_config *vmcs_conf, + struct vmx_capability *vmx_cap) { u32 vmx_msr_low, vmx_msr_high; u32 _pin_based_exec_control = 0; @@ -2752,9 +2737,127 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->vmentry_ctrl = _vmentry_control; vmcs_conf->misc = misc_msr; +#if IS_ENABLED(CONFIG_HYPERV) + if (enlightened_vmcs) + evmcs_sanitize_exec_ctrls(vmcs_conf); +#endif + + return 0; +} + +static bool kvm_is_vmx_supported(void) +{ + int cpu = raw_smp_processor_id(); + + if (!cpu_has_vmx()) { + pr_err("VMX not supported by CPU %d\n", cpu); + return false; + } + + if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || + !this_cpu_has(X86_FEATURE_VMX)) { + pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); + return false; + } + + return true; +} + +static int vmx_check_processor_compat(void) +{ + int cpu = raw_smp_processor_id(); + struct vmcs_config vmcs_conf; + struct vmx_capability vmx_cap; + + if (!kvm_is_vmx_supported()) + return -EIO; + + if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { + pr_err("Failed to setup VMCS config on CPU %d\n", cpu); + return -EIO; + } + if (nested) + nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { + pr_err("Inconsistent VMCS config on CPU %d\n", cpu); + return -EIO; + } + return 0; +} + +static int kvm_cpu_vmxon(u64 vmxon_pointer) +{ + u64 msr; + + cr4_set_bits(X86_CR4_VMXE); + + asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" + _ASM_EXTABLE(1b, %l[fault]) + : : [vmxon_pointer] "m"(vmxon_pointer) + : : fault); + return 0; + +fault: + WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", + rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 
0xdeadbeef : msr); + cr4_clear_bits(X86_CR4_VMXE); + + return -EFAULT; +} + +static int vmx_hardware_enable(void) +{ + int cpu = raw_smp_processor_id(); + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + int r; + + if (cr4_read_shadow() & X86_CR4_VMXE) + return -EBUSY; + + /* + * This can happen if we hot-added a CPU but failed to allocate + * VP assist page for it. + */ + if (static_branch_unlikely(&enable_evmcs) && + !hv_get_vp_assist_page(cpu)) + return -EFAULT; + + intel_pt_handle_vmx(1); + + r = kvm_cpu_vmxon(phys_addr); + if (r) { + intel_pt_handle_vmx(0); + return r; + } + + if (enable_ept) + ept_sync_global(); + return 0; } +static void vmclear_local_loaded_vmcss(void) +{ + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v, *n; + + list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + __loaded_vmcs_clear(v); +} + +static void vmx_hardware_disable(void) +{ + vmclear_local_loaded_vmcss(); + + if (cpu_vmxoff()) + kvm_spurious_fault(); + + hv_reset_evmcs(); + + intel_pt_handle_vmx(0); +} + struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) { int node = cpu_to_node(cpu); @@ -2950,9 +3053,8 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save) var.type = 0x3; var.avl = 0; if (save->base & 0xf) - printk_once(KERN_WARNING "kvm: segment base is not " - "paragraph aligned when entering " - "protected mode (seg=%d)", seg); + pr_warn_once("segment base is not paragraph aligned " + "when entering protected mode (seg=%d)", seg); } vmcs_write16(sf->selector, var.selector); @@ -2982,8 +3084,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) * vcpu. Warn the user that an update is overdue. */ if (!kvm_vmx->tss_addr) - printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " - "called before entering vcpu\n"); + pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n"); vmx_segment_cache_clear(vmx); @@ -6851,7 +6952,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) gate_desc *desc = (gate_desc *)host_idt_base + vector; if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, - "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) + "unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); @@ -7449,29 +7550,6 @@ static int vmx_vm_init(struct kvm *kvm) return 0; } -static int __init vmx_check_processor_compat(void) -{ - struct vmcs_config vmcs_conf; - struct vmx_capability vmx_cap; - - if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || - !this_cpu_has(X86_FEATURE_VMX)) { - pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id()); - return -EIO; - } - - if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) - return -EIO; - if (nested) - nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); - if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { - printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", - smp_processor_id()); - return -EIO; - } - return 0; -} - static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; @@ -8071,7 +8149,9 @@ static void vmx_vm_destroy(struct kvm *kvm) } static struct kvm_x86_ops vmx_x86_ops __initdata = { - .name = "kvm_intel", + .name = KBUILD_MODNAME, + + .check_processor_compatibility = vmx_check_processor_compat, .hardware_unsetup = vmx_hardware_unsetup, @@ -8291,7 +8371,7 @@ static __init int hardware_setup(void) return -EIO; if (cpu_has_perf_global_ctrl_bug()) - pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + 
pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " "does not work properly. Using workaround\n"); if (boot_cpu_has(X86_FEATURE_NX)) @@ -8299,7 +8379,7 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_MPX)) { rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); - WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); + WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); } if (!cpu_has_vmx_mpx()) @@ -8318,7 +8398,7 @@ static __init int hardware_setup(void) /* NX support is required for shadow paging. */ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { - pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n"); + pr_err_ratelimited("NX (Execute Disable) not supported\n"); return -EOPNOTSUPP; } @@ -8470,9 +8550,6 @@ static __init int hardware_setup(void) } static struct kvm_x86_init_ops vmx_init_ops __initdata = { - .cpu_has_kvm_support = cpu_has_kvm_support, - .disabled_by_bios = vmx_disabled_by_bios, - .check_processor_compatibility = vmx_check_processor_compat, .hardware_setup = hardware_setup, .handle_intel_pt_intr = NULL, @@ -8490,41 +8567,23 @@ static void vmx_cleanup_l1d_flush(void) l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; } -static void vmx_exit(void) +static void __vmx_exit(void) { + allow_smaller_maxphyaddr = false; + #ifdef CONFIG_KEXEC_CORE RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); #endif + vmx_cleanup_l1d_flush(); +} +static void vmx_exit(void) +{ kvm_exit(); + kvm_x86_vendor_exit(); -#if IS_ENABLED(CONFIG_HYPERV) - if (static_branch_unlikely(&enable_evmcs)) { - int cpu; - struct hv_vp_assist_page *vp_ap; - /* - * Reset everything to support using non-enlightened VMCS - * access later (e.g. when we reload the module with - * enlightened_vmcs=0) - */ - for_each_online_cpu(cpu) { - vp_ap = hv_get_vp_assist_page(cpu); - - if (!vp_ap) - continue; - - vp_ap->nested_control.features.directhypercall = 0; - vp_ap->current_nested_vmcs = 0; - vp_ap->enlighten_vmentry = 0; - } - - static_branch_disable(&enable_evmcs); - } -#endif - vmx_cleanup_l1d_flush(); - - allow_smaller_maxphyaddr = false; + __vmx_exit(); } module_exit(vmx_exit); @@ -8532,56 +8591,29 @@ static int __init vmx_init(void) { int r, cpu; -#if IS_ENABLED(CONFIG_HYPERV) + if (!kvm_is_vmx_supported()) + return -EOPNOTSUPP; + /* - * Enlightened VMCS usage should be recommended and the host needs - * to support eVMCS v1 or above. We can also disable eVMCS support - * with module parameter. + * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing + * to unwind if a later step fails. */ - if (enlightened_vmcs && - ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && - (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= - KVM_EVMCS_VERSION) { - - /* Check that we have assist pages on all online CPUs */ - for_each_online_cpu(cpu) { - if (!hv_get_vp_assist_page(cpu)) { - enlightened_vmcs = false; - break; - } - } - - if (enlightened_vmcs) { - pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); - static_branch_enable(&enable_evmcs); - } + hv_init_evmcs(); - if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) - vmx_x86_ops.enable_l2_tlb_flush - = hv_enable_l2_tlb_flush; - - } else { - enlightened_vmcs = false; - } -#endif - - r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + r = kvm_x86_vendor_init(&vmx_init_ops); if (r) return r; /* - * Must be called after kvm_init() so enable_ept is properly set + * Must be called after common x86 init so enable_ept is properly set * up. 
     * up. Hand the parameter mitigation value in which was stored in
     * the pre module init parser. If no parameter was given, it will
     * contain 'auto' which will be turned into the default 'cond'
     * mitigation mode.
     */
    r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
-    if (r) {
-        vmx_exit();
-        return r;
-    }
+    if (r)
+        goto err_l1d_flush;
 
     vmx_setup_fb_clear_ctrl();
 
@@ -8605,6 +8637,21 @@ static int __init vmx_init(void)
     if (!enable_ept)
         allow_smaller_maxphyaddr = true;
 
+    /*
+     * Common KVM initialization _must_ come last, after this, /dev/kvm is
+     * exposed to userspace!
+     */
+    r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
+                 THIS_MODULE);
+    if (r)
+        goto err_kvm_init;
+
     return 0;
+
+err_kvm_init:
+    __vmx_exit();
+err_l1d_flush:
+    kvm_x86_vendor_exit();
+    return r;
 }
 module_init(vmx_init);
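[Editor's note — not part of the patch. The reworked vmx_init()/vmx_exit() above encode a strict ordering: everything that can fail runs before kvm_init(), because kvm_init() is the step that creates /dev/kvm and exposes the module to userspace, and teardown mirrors that order. A condensed skeleton of the ordering follows, with error unwinding elided; the *_sketch names are illustrative only.]

/* Condensed from vmx_init()/vmx_exit() above; error paths elided. */
static int __init vmx_init_sketch(void)
{
    int r;

    if (!kvm_is_vmx_supported())              /* pure feature probe, no state to undo */
        return -EOPNOTSUPP;

    hv_init_evmcs();                          /* touches only VMX knobs, nothing to unwind */

    r = kvm_x86_vendor_init(&vmx_init_ops);   /* common x86 setup + per-CPU compat checks */
    if (r)
        return r;

    /* ... VMX-only setup that can still fail goes here, unwinding on error ... */

    return kvm_init(sizeof(struct vcpu_vmx),  /* last step: this is what creates /dev/kvm */
                    __alignof__(struct vcpu_vmx), THIS_MODULE);
}

static void __exit vmx_exit_sketch(void)
{
    kvm_exit();                               /* first: no new users can appear */
    kvm_x86_vendor_exit();
    __vmx_exit();                             /* module-local cleanup, mirrors init order */
}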
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 842dc898c972..a5282014616c 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -100,8 +100,8 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
     return value;
 
 do_fail:
-    WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field);
-    pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field);
+    WARN_ONCE(1, KBUILD_MODNAME ": vmread failed: field=%lx\n", field);
+    pr_warn_ratelimited(KBUILD_MODNAME ": vmread failed: field=%lx\n", field);
     return 0;
 
 do_exception:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index da4bbd043a7b..c3ac88036b52 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -15,6 +15,7 @@
  * Amit Shah <amit.shah@qumranet.com>
  * Ben-Ami Yassour <benami@il.ibm.com>
  */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/kvm_host.h>
 
 #include "irq.h"
@@ -128,6 +129,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
 static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
 static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
 
+static DEFINE_MUTEX(vendor_module_lock);
 struct kvm_x86_ops kvm_x86_ops __read_mostly;
 
 #define KVM_X86_OP(func) \
@@ -1480,7 +1482,7 @@ static const u32 emulated_msrs_all[] = {
     HV_X64_MSR_STIMER0_CONFIG, HV_X64_MSR_VP_ASSIST_PAGE,
     HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
-    HV_X64_MSR_TSC_EMULATION_STATUS,
+    HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
     HV_X64_MSR_SYNDBG_OPTIONS,
     HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
     HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
@@ -2086,7 +2088,7 @@ static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
         !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
         return kvm_handle_invalid_op(vcpu);
 
-    pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn);
+    pr_warn_once("%s instruction emulated as NOP!\n", insn);
     return kvm_emulate_as_nop(vcpu);
 }
 
 int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
@@ -2433,7 +2435,8 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
     thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
     thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
     if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
-        pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
+        pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n",
+                 user_tsc_khz, thresh_lo, thresh_hi);
         use_scaling = 1;
     }
     return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
@@ -3821,6 +3824,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
     case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
     case HV_X64_MSR_TSC_EMULATION_CONTROL:
     case HV_X64_MSR_TSC_EMULATION_STATUS:
+    case HV_X64_MSR_TSC_INVARIANT_CONTROL:
         return kvm_hv_set_msr_common(vcpu, msr, data,
                                      msr_info->host_initiated);
     case MSR_IA32_BBL_CR_CTL3:
@@ -4191,6 +4195,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
     case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
     case HV_X64_MSR_TSC_EMULATION_CONTROL:
     case HV_X64_MSR_TSC_EMULATION_STATUS:
+    case HV_X64_MSR_TSC_INVARIANT_CONTROL:
         return kvm_hv_get_msr_common(vcpu,
                                      msr_info->index, &msr_info->data,
                                      msr_info->host_initiated);
@@ -7699,7 +7704,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
     return X86EMUL_CONTINUE;
 
 emul_write:
-    printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
+    pr_warn_once("emulating exchange as write\n");
 
     return emulator_write_emulated(ctxt, addr, new, bytes, exception);
 }
@@ -8260,7 +8265,7 @@ static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
 
     ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
     if (!ctxt) {
-        pr_err("kvm: failed to allocate vcpu's emulator\n");
+        pr_err("failed to allocate vcpu's emulator\n");
         return NULL;
     }
 
@@ -9271,35 +9276,66 @@ static struct notifier_block pvclock_gtod_notifier = {
 };
 #endif
 
-int kvm_arch_init(void *opaque)
+static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
+{
+    memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+
+#define __KVM_X86_OP(func) \
+    static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP(func) \
+    WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
+#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0(func) \
+    static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
+                       (void *)__static_call_return0);
+#include <asm/kvm-x86-ops.h>
+#undef __KVM_X86_OP
+
+    kvm_pmu_ops_update(ops->pmu_ops);
+}
+
+static int kvm_x86_check_processor_compatibility(void)
+{
+    int cpu = smp_processor_id();
+    struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+    /*
+     * Compatibility checks are done when loading KVM and when enabling
+     * hardware, e.g. during CPU hotplug, to ensure all online CPUs are
+     * compatible, i.e. KVM should never perform a compatibility check on
+     * an offline CPU.
+     */
+    WARN_ON(!cpu_online(cpu));
+
+    if (__cr4_reserved_bits(cpu_has, c) !=
+        __cr4_reserved_bits(cpu_has, &boot_cpu_data))
+        return -EIO;
+
+    return static_call(kvm_x86_check_processor_compatibility)();
+}
+
+static void kvm_x86_check_cpu_compat(void *ret)
+{
+    *(int *)ret = kvm_x86_check_processor_compatibility();
+}
+
+static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 {
-    struct kvm_x86_init_ops *ops = opaque;
     u64 host_pat;
-    int r;
+    int r, cpu;
 
     if (kvm_x86_ops.hardware_enable) {
-        pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
+        pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
         return -EEXIST;
     }
 
-    if (!ops->cpu_has_kvm_support()) {
-        pr_err_ratelimited("kvm: no hardware support for '%s'\n",
-                           ops->runtime_ops->name);
-        return -EOPNOTSUPP;
-    }
-    if (ops->disabled_by_bios()) {
-        pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
-                           ops->runtime_ops->name);
-        return -EOPNOTSUPP;
-    }
-
     /*
     * KVM explicitly assumes that the guest has an FPU and
     * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
     * vCPU's FPU state as a fxregs_state struct.
     */
    if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
-        printk(KERN_ERR "kvm: inadequate fpu\n");
+        pr_err("inadequate fpu\n");
         return -EOPNOTSUPP;
     }
 
@@ -9317,19 +9353,19 @@ int kvm_arch_init(void *opaque)
     */
    if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
        (host_pat & GENMASK(2, 0)) != 6) {
-        pr_err("kvm: host PAT[0] is not WB\n");
+        pr_err("host PAT[0] is not WB\n");
         return -EIO;
     }
 
     x86_emulator_cache = kvm_alloc_emulator_cache();
     if (!x86_emulator_cache) {
-        pr_err("kvm: failed to allocate cache for x86 emulator\n");
+        pr_err("failed to allocate cache for x86 emulator\n");
         return -ENOMEM;
     }
 
     user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
     if (!user_return_msrs) {
-        printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
+        pr_err("failed to allocate percpu kvm_user_return_msrs\n");
         r = -ENOMEM;
         goto out_free_x86_emulator_cache;
     }
@@ -9339,13 +9375,37 @@ int kvm_arch_init(void *opaque)
     if (r)
         goto out_free_percpu;
 
-    kvm_timer_init();
-
     if (boot_cpu_has(X86_FEATURE_XSAVE)) {
         host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
         kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
     }
 
+    rdmsrl_safe(MSR_EFER, &host_efer);
+
+    if (boot_cpu_has(X86_FEATURE_XSAVES))
+        rdmsrl(MSR_IA32_XSS, host_xss);
+
+    kvm_init_pmu_capability();
+
+    r = ops->hardware_setup();
+    if (r != 0)
+        goto out_mmu_exit;
+
+    kvm_ops_update(ops);
+
+    for_each_online_cpu(cpu) {
+        smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
+        if (r < 0)
+            goto out_unwind_ops;
+    }
+
+    /*
+     * Point of no return! DO NOT add error paths below this point unless
+     * absolutely necessary, as most operations from this point forward
+     * require unwinding.
+     */
+    kvm_timer_init();
+
     if (pi_inject_timer == -1)
         pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
 #ifdef CONFIG_X86_64
@@ -9355,8 +9415,35 @@ int kvm_arch_init(void *opaque)
         set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
 #endif
 
+    kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
+
+    if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
+        kvm_caps.supported_xss = 0;
+
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
+    cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
+#undef __kvm_cpu_cap_has
+
+    if (kvm_caps.has_tsc_control) {
+        /*
+         * Make sure the user can only configure tsc_khz values that
+         * fit into a signed integer.
+         * A min value is not calculated because it will always
+         * be 1 on all machines.
+         */
+        u64 max = min(0x7fffffffULL,
+                      __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
+        kvm_caps.max_guest_tsc_khz = max;
+    }
+    kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
+    kvm_init_msr_list();
     return 0;
 
+out_unwind_ops:
+    kvm_x86_ops.hardware_enable = NULL;
+    static_call(kvm_x86_hardware_unsetup)();
+out_mmu_exit:
+    kvm_mmu_vendor_module_exit();
 out_free_percpu:
     free_percpu(user_return_msrs);
 out_free_x86_emulator_cache:
@@ -9364,8 +9451,22 @@ out_free_x86_emulator_cache:
     return r;
 }
 
-void kvm_arch_exit(void)
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
+{
+    int r;
+
+    mutex_lock(&vendor_module_lock);
+    r = __kvm_x86_vendor_init(ops);
+    mutex_unlock(&vendor_module_lock);
+
+    return r;
+}
+EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
+
+void kvm_x86_vendor_exit(void)
 {
+    kvm_unregister_perf_callbacks();
+
 #ifdef CONFIG_X86_64
     if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
         clear_hv_tscchange_cb();
@@ -9382,7 +9483,7 @@ void kvm_arch_exit(void)
     irq_work_sync(&pvclock_irq_work);
     cancel_work_sync(&pvclock_gtod_work);
 #endif
-    kvm_x86_ops.hardware_enable = NULL;
+    static_call(kvm_x86_hardware_unsetup)();
     kvm_mmu_vendor_module_exit();
     free_percpu(user_return_msrs);
     kmem_cache_destroy(x86_emulator_cache);
@@ -9390,7 +9491,11 @@ void kvm_arch_exit(void)
     static_key_deferred_flush(&kvm_xen_enabled);
     WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
 #endif
+    mutex_lock(&vendor_module_lock);
+    kvm_x86_ops.hardware_enable = NULL;
+    mutex_unlock(&vendor_module_lock);
 }
+EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
 
 static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
 {
@@ -11531,7 +11636,7 @@ static int sync_regs(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
 {
     if (kvm_check_tsc_unstable() && kvm->created_vcpus)
-        pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
+        pr_warn_once("SMP vm created on host with unstable TSC; "
                      "guest TSC will not be reliable\n");
 
     if (!kvm->arch.max_vcpu_ids)
@@ -11608,7 +11713,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
         goto free_wbinvd_dirty_mask;
 
     if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
-        pr_err("kvm: failed to allocate vcpu's fpu\n");
+        pr_err("failed to allocate vcpu's fpu\n");
         goto free_emulate_ctxt;
     }
 
@@ -11882,6 +11987,11 @@ int kvm_arch_hardware_enable(void)
     bool stable, backwards_tsc = false;
 
     kvm_user_return_msr_cpu_online();
+
+    ret = kvm_x86_check_processor_compatibility();
+    if (ret)
+        return ret;
+
     ret = static_call(kvm_x86_hardware_enable)();
     if (ret != 0)
         return ret;
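[Editor's note — not part of the patch. The hunk above is why a dedicated kvm_arch_check_processor_compat() hook is no longer needed (it is removed below): a CPU that is hot-added after module load was never covered by the for_each_online_cpu() pass in __kvm_x86_vendor_init(), so kvm_arch_hardware_enable() re-runs the same check on that CPU before virtualization is enabled on it. Illustrative flow:]

/*
 * module load:  __kvm_x86_vendor_init()
 *                 -> for_each_online_cpu()
 *                      -> kvm_x86_check_cpu_compat()          (IPI to each CPU)
 *
 * CPU hot-add:  kvm_arch_hardware_enable()                    (on the new CPU)
 *                 -> kvm_x86_check_processor_compatibility()
 *                 -> static_call(kvm_x86_hardware_enable)()
 */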
@@ -11968,88 +12078,6 @@ void kvm_arch_hardware_disable(void)
     drop_user_return_notifiers();
 }
 
-static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
-{
-    memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
-
-#define __KVM_X86_OP(func) \
-    static_call_update(kvm_x86_##func, kvm_x86_ops.func);
-#define KVM_X86_OP(func) \
-    WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
-#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
-#define KVM_X86_OP_OPTIONAL_RET0(func) \
-    static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
-                       (void *)__static_call_return0);
-#include <asm/kvm-x86-ops.h>
-#undef __KVM_X86_OP
-
-    kvm_pmu_ops_update(ops->pmu_ops);
-}
-
-int kvm_arch_hardware_setup(void *opaque)
-{
-    struct kvm_x86_init_ops *ops = opaque;
-    int r;
-
-    rdmsrl_safe(MSR_EFER, &host_efer);
-
-    if (boot_cpu_has(X86_FEATURE_XSAVES))
-        rdmsrl(MSR_IA32_XSS, host_xss);
-
-    kvm_init_pmu_capability();
-
-    r = ops->hardware_setup();
-    if (r != 0)
-        return r;
-
-    kvm_ops_update(ops);
-
-    kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
-
-    if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
-        kvm_caps.supported_xss = 0;
-
-#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
-    cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
-#undef __kvm_cpu_cap_has
-
-    if (kvm_caps.has_tsc_control) {
-        /*
-         * Make sure the user can only configure tsc_khz values that
-         * fit into a signed integer.
-         * A min value is not calculated because it will always
-         * be 1 on all machines.
-         */
-        u64 max = min(0x7fffffffULL,
-                      __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
-        kvm_caps.max_guest_tsc_khz = max;
-    }
-    kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
-    kvm_init_msr_list();
-    return 0;
-}
-
-void kvm_arch_hardware_unsetup(void)
-{
-    kvm_unregister_perf_callbacks();
-
-    static_call(kvm_x86_hardware_unsetup)();
-}
-
-int kvm_arch_check_processor_compat(void *opaque)
-{
-    struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
-    struct kvm_x86_init_ops *ops = opaque;
-
-    WARN_ON(!irqs_disabled());
-
-    if (__cr4_reserved_bits(cpu_has, c) !=
-        __cr4_reserved_bits(cpu_has, &boot_cpu_data))
-        return -EIO;
-
-    return ops->check_processor_compatibility();
-}
-
 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
 {
     return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 8fd41f5deae3..2681e2007e39 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -5,6 +5,7 @@
  *
  * KVM Xen emulation
  */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include "x86.h"
 #include "xen.h"