From 6abe9c1386e5c86f360e4e8fde8eec95eee77aa3 Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Mon, 22 Jun 2020 18:04:41 -0400
Subject: KVM: X86: Move ignore_msrs handling up the stack

MSR accesses can be one of:

  (1) KVM internal access,
  (2) userspace access (e.g., via KVM_SET_MSRS ioctl),
  (3) guest access.

The ignore_msrs option was previously handled by kvm_get_msr_common()
and kvm_set_msr_common(), at the bottom of the MSR access stack.  This
works in most cases, but it can dump unwanted warning messages to dmesg
even when KVM accesses MSRs internally through __kvm_set_msr() or
__kvm_get_msr() (e.g. from kvm_cpuid()).

Ideally we want to trap only cases (2) and (3) above, not (1).  To
achieve this, move the ignore_msrs handling up to the callers of
__kvm_get_msr() and __kvm_set_msr().  A new return value
(KVM_MSR_RET_INVALID == 2) identifies the "MSR missing" event.

Signed-off-by: Peter Xu
Message-Id: <20200622220442.21998-2-peterx@redhat.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 80 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 26 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 88c593f83b28..e7c1567b9501 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -243,6 +243,29 @@ static struct kmem_cache *x86_fpu_cache;
 
 static struct kmem_cache *x86_emulator_cache;
 
+/*
+ * When called, it means the previous get/set msr reached an invalid msr.
+ * Return 0 if we want to ignore/silence this failed msr access, or 1 if we
+ * want to fail the caller.
+ */
+static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
+                                 u64 data, bool write)
+{
+        const char *op = write ? "wrmsr" : "rdmsr";
+
+        if (ignore_msrs) {
+                if (report_ignored_msrs)
+                        vcpu_unimpl(vcpu, "ignored %s: 0x%x data 0x%llx\n",
+                                    op, msr, data);
+                /* Mask the error */
+                return 0;
+        } else {
+                vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n",
+                                       op, msr, data);
+                return 1;
+        }
+}
+
 static struct kmem_cache *kvm_alloc_emulator_cache(void)
 {
         unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
@@ -1516,6 +1539,17 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
         return kvm_x86_ops.set_msr(vcpu, &msr);
 }
 
+static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
+                                     u32 index, u64 data, bool host_initiated)
+{
+        int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
+
+        if (ret == KVM_MSR_RET_INVALID)
+                ret = kvm_msr_ignored_check(vcpu, index, data, true);
+
+        return ret;
+}
+
 /*
  * Read the MSR specified by @index into @data.  Select MSR specific fault
  * checks are bypassed if @host_initiated is %true.
@@ -1537,15 +1571,29 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
         return ret;
 }
 
+static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
+                                     u32 index, u64 *data, bool host_initiated)
+{
+        int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
+
+        if (ret == KVM_MSR_RET_INVALID) {
+                /* Unconditionally clear *data for simplicity */
+                *data = 0;
+                ret = kvm_msr_ignored_check(vcpu, index, 0, false);
+        }
+
+        return ret;
+}
+
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
 {
-        return __kvm_get_msr(vcpu, index, data, false);
+        return kvm_get_msr_ignored_check(vcpu, index, data, false);
 }
 EXPORT_SYMBOL_GPL(kvm_get_msr);
 
 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
 {
-        return __kvm_set_msr(vcpu, index, data, false);
+        return kvm_set_msr_ignored_check(vcpu, index, data, false);
 }
 EXPORT_SYMBOL_GPL(kvm_set_msr);
 
@@ -1665,12 +1713,12 @@ EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
  */
 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 {
-        return __kvm_get_msr(vcpu, index, data, true);
+        return kvm_get_msr_ignored_check(vcpu, index, data, true);
 }
 
 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 {
-        return __kvm_set_msr(vcpu, index, *data, true);
+        return kvm_set_msr_ignored_check(vcpu, index, *data, true);
 }
 
 #ifdef CONFIG_X86_64
@@ -3066,17 +3114,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                         return xen_hvm_config(vcpu, data);
                 if (kvm_pmu_is_valid_msr(vcpu, msr))
                         return kvm_pmu_set_msr(vcpu, msr_info);
-                if (!ignore_msrs) {
-                        vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
-                                               msr, data);
-                        return 1;
-                } else {
-                        if (report_ignored_msrs)
-                                vcpu_unimpl(vcpu,
-                                        "ignored wrmsr: 0x%x data 0x%llx\n",
-                                        msr, data);
-                        break;
-                }
+                return KVM_MSR_RET_INVALID;
         }
         return 0;
 }
@@ -3331,17 +3369,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
         default:
                 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
                         return kvm_pmu_get_msr(vcpu, msr_info);
-                if (!ignore_msrs) {
-                        vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
-                                               msr_info->index);
-                        return 1;
-                } else {
-                        if (report_ignored_msrs)
-                                vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
-                                            msr_info->index);
-                        msr_info->data = 0;
-                }
-                break;
+                return KVM_MSR_RET_INVALID;
         }
         return 0;
 }
-- cgit
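
The intent of the restructuring is easier to see outside the diff. Below is a minimal self-contained C sketch of the pattern: the bottom-of-stack handler only signals "unknown MSR" through a sentinel, and the ignore_msrs policy is applied only in the boundary wrapper, which internal callers bypass. Names and values here are illustrative stand-ins, not kernel code:

    #include <stdio.h>

    /* Illustrative stand-ins for the module parameters and sentinel. */
    #define KVM_MSR_RET_INVALID 2
    static int ignore_msrs = 1;
    static int report_ignored_msrs = 1;

    /* Bottom of the stack: reports an unknown MSR, never filters it. */
    static int set_msr_common(unsigned int msr, unsigned long long data)
    {
            (void)data;
            (void)msr;
            return KVM_MSR_RET_INVALID; /* pretend every MSR is unhandled */
    }

    /* Boundary wrapper: only userspace/guest-initiated accesses come here. */
    static int set_msr_ignored_check(unsigned int msr, unsigned long long data)
    {
            int ret = set_msr_common(msr, data);

            if (ret == KVM_MSR_RET_INVALID) {
                    if (ignore_msrs) {
                            if (report_ignored_msrs)
                                    printf("ignored wrmsr: 0x%x data 0x%llx\n",
                                           msr, data);
                            return 0;   /* mask the error */
                    }
                    return 1;           /* fail the caller */
            }
            return ret;
    }

    int main(void)
    {
            /* A guest-style access goes through the policy... */
            printf("guest write  -> %d\n", set_msr_ignored_check(0xdead, 1));
            /* ...an internal access sees the raw sentinel and stays quiet. */
            printf("internal use -> %d\n", set_msr_common(0xdead, 1));
            return 0;
    }
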
From 12bc2132b15e0a969b3f455d90a5f215ef239eff Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Mon, 22 Jun 2020 18:04:42 -0400
Subject: KVM: X86: Do the same ignore_msrs check for feature msrs

Logically the ignore_msrs and report_ignored_msrs options should also
apply to feature MSRs.  Add them in.

Signed-off-by: Peter Xu
Message-Id: <20200622220442.21998-3-peterx@redhat.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e7c1567b9501..ec9aba133c1c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1412,8 +1412,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
                 rdmsrl_safe(msr->index, &msr->data);
                 break;
         default:
-                if (kvm_x86_ops.get_msr_feature(msr))
-                        return 1;
+                return kvm_x86_ops.get_msr_feature(msr);
         }
         return 0;
 }
@@ -1425,6 +1424,13 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
         msr.index = index;
         r = kvm_get_msr_feature(&msr);
+
+        if (r == KVM_MSR_RET_INVALID) {
+                /* Unconditionally clear the output for simplicity */
+                *data = 0;
+                r = kvm_msr_ignored_check(vcpu, index, 0, false);
+        }
+
         if (r)
                 return r;
-- cgit
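
Feature MSRs are the ones userspace reads with KVM_GET_MSRS on the /dev/kvm (system) fd rather than on a vCPU fd, so the new check also governs that path. A hedged userspace sketch of such a probe, assuming a kernel with KVM_CAP_GET_MSR_FEATURES; MSR_IA32_ARCH_CAPABILITIES (index 0x10a) is used purely as an example:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            if (kvm < 0) {
                    perror("open /dev/kvm");
                    return 1;
            }

            struct {
                    struct kvm_msrs hdr;
                    struct kvm_msr_entry entries[1];
            } req = { .hdr = { .nmsrs = 1 } };
            req.entries[0].index = 0x10a;  /* MSR_IA32_ARCH_CAPABILITIES */

            /* KVM_GET_MSRS returns how many MSRs were read; 1 means success. */
            int n = ioctl(kvm, KVM_GET_MSRS, &req);
            if (n == 1)
                    printf("ARCH_CAPABILITIES = 0x%llx\n",
                           (unsigned long long)req.entries[0].data);
            else
                    printf("feature MSR not reported (n = %d)\n", n);
            return 0;
    }
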
From f5f6145e41d39c7fd04a17c3b2596c7abe933f10 Mon Sep 17 00:00:00 2001
From: Krish Sadhukhan
Date: Fri, 22 May 2020 18:19:51 -0400
Subject: KVM: x86: Move the check for upper 32 reserved bits of DR6 to separate function

Signed-off-by: Krish Sadhukhan
Message-Id: <20200522221954.32131-2-krish.sadhukhan@oracle.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ec9aba133c1c..82f457f0e7e0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1133,7 +1133,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
         case 4:
                 /* fall through */
         case 6:
-                if (val & 0xffffffff00000000ULL)
+                if (!kvm_dr6_valid(val))
                         return -1; /* #GP */
                 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
                 break;
-- cgit
From 1aa561b1a4c0ae2a9a9b9c21a84b5ca66b4775d8 Mon Sep 17 00:00:00 2001
From: Jim Mattson
Date: Wed, 3 Jun 2020 16:56:21 -0700
Subject: kvm: x86: Add "last CPU" to some KVM_EXIT information

More often than not, a failed VM-entry in an x86 production environment
is induced by a defective CPU.  To help identify the bad hardware,
include the ID of the last logical CPU to run a vCPU in the information
provided to userspace on a KVM exit for a failed VM-entry or for KVM
internal errors not associated with emulation.  The presence of this
additional information is indicated by a new capability,
KVM_CAP_LAST_CPU.

Signed-off-by: Jim Mattson
Reviewed-by: Oliver Upton
Reviewed-by: Peter Shier
Message-Id: <20200603235623.245638-5-jmattson@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 82f457f0e7e0..1a0fad1018f9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3510,6 +3510,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
         case KVM_CAP_MSR_PLATFORM_INFO:
         case KVM_CAP_EXCEPTION_PAYLOAD:
         case KVM_CAP_SET_GUEST_DEBUG:
+        case KVM_CAP_LAST_CPU:
                 r = 1;
                 break;
         case KVM_CAP_SYNC_REGS:
-- cgit
From c967118ddb21191178c0e0080fdc41f5d85ca1d1 Mon Sep 17 00:00:00 2001
From: Jim Mattson
Date: Wed, 3 Jun 2020 16:56:23 -0700
Subject: kvm: x86: Set last_vmentry_cpu in vcpu_enter_guest

Since this field is now in kvm_vcpu_arch, clean things up a little by
setting it in vendor-agnostic code: vcpu_enter_guest.  Note that it
must be set after the call to kvm_x86_ops.run(), since it can't be
updated before pre_sev_run().

Suggested-by: Sean Christopherson
Signed-off-by: Jim Mattson
Reviewed-by: Oliver Upton
Reviewed-by: Peter Shier
Message-Id: <20200603235623.245638-7-jmattson@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1a0fad1018f9..bd8690ca7b6b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8583,6 +8583,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
         if (hw_breakpoint_active())
                 hw_breakpoint_restore();
 
+        vcpu->arch.last_vmentry_cpu = vcpu->cpu;
         vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
 
         vcpu->mode = OUTSIDE_GUEST_MODE;
-- cgit
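
With KVM_CAP_LAST_CPU advertised and last_vmentry_cpu filled in by vcpu_enter_guest(), a VMM can log the offending processor on a failed entry. A hedged userspace fragment, assuming UAPI headers new enough to carry the fail_entry.cpu field and a vCPU whose kvm_run area is already mmap'ed by the caller:

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* 'sys_fd' is the open /dev/kvm fd; 'run' is a vCPU's mmap'ed kvm_run. */
    static void report_failed_entry(int sys_fd, struct kvm_run *run)
    {
            if (ioctl(sys_fd, KVM_CHECK_EXTENSION, KVM_CAP_LAST_CPU) != 1) {
                    fprintf(stderr, "kernel predates KVM_CAP_LAST_CPU\n");
                    return;
            }

            if (run->exit_reason == KVM_EXIT_FAIL_ENTRY)
                    fprintf(stderr, "VM-entry failed: reason 0x%llx, last CPU %u\n",
                            (unsigned long long)
                                    run->fail_entry.hardware_entry_failure_reason,
                            run->fail_entry.cpu);
    }
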
From d42e3fae6faedacb2a7b4c984417ed0d9f540401 Mon Sep 17 00:00:00 2001
From: Jim Mattson
Date: Tue, 7 Jul 2020 15:36:30 -0700
Subject: kvm: x86: Read PDPTEs on CR0.CD and CR0.NW changes

According to the SDM, when PAE paging would be in use following a
MOV-to-CR0 that modifies any of CR0.CD, CR0.NW, or CR0.PG, then the
PDPTEs are loaded from the address in CR3.  Previously, KVM only loaded
the PDPTEs when PAE paging would be in use following a MOV-to-CR0 that
modified CR0.PG.

Signed-off-by: Jim Mattson
Reviewed-by: Oliver Upton
Reviewed-by: Peter Shier
Message-Id: <20200707223630.336700-1-jmattson@google.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bd8690ca7b6b..1153ce7d118b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -798,6 +798,7 @@ EXPORT_SYMBOL_GPL(pdptrs_changed);
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
         unsigned long old_cr0 = kvm_read_cr0(vcpu);
+        unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
         unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
 
         cr0 |= X86_CR0_ET;
@@ -815,9 +816,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
                 return 1;
 
-        if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+        if (cr0 & X86_CR0_PG) {
 #ifdef CONFIG_X86_64
-                if ((vcpu->arch.efer & EFER_LME)) {
+                if (!is_paging(vcpu) && (vcpu->arch.efer & EFER_LME)) {
                         int cs_db, cs_l;
 
                         if (!is_pae(vcpu))
@@ -827,8 +828,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
                                 return 1;
                 } else
 #endif
-                if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
-                                                 kvm_read_cr3(vcpu)))
+                if (is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
+                    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
                         return 1;
         }
 
-- cgit
From b899c13277a918d268055ade3e4fd8289314ee84 Mon Sep 17 00:00:00 2001
From: Krish Sadhukhan
Date: Wed, 8 Jul 2020 00:39:55 +0000
Subject: KVM: x86: Create mask for guest CR4 reserved bits in kvm_update_cpuid()

Instead of creating the mask for guest CR4 reserved bits in
kvm_valid_cr4(), do it in kvm_update_cpuid() so that the mask is
computed once per CPUID update and reused, rather than rebuilt on
every kvm_valid_cr4() call.

Suggested-by: Paolo Bonzini
Signed-off-by: Krish Sadhukhan
Message-Id: <1594168797-29444-2-git-send-email-krish.sadhukhan@oracle.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 23 +----------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1153ce7d118b..549b3f7228ac 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -955,33 +955,12 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 }
 EXPORT_SYMBOL_GPL(kvm_set_xcr);
 
-#define __cr4_reserved_bits(__cpu_has, __c)             \
-({                                                      \
-        u64 __reserved_bits = CR4_RESERVED_BITS;        \
-                                                        \
-        if (!__cpu_has(__c, X86_FEATURE_XSAVE))         \
-                __reserved_bits |= X86_CR4_OSXSAVE;     \
-        if (!__cpu_has(__c, X86_FEATURE_SMEP))          \
-                __reserved_bits |= X86_CR4_SMEP;        \
-        if (!__cpu_has(__c, X86_FEATURE_SMAP))          \
-                __reserved_bits |= X86_CR4_SMAP;        \
-        if (!__cpu_has(__c, X86_FEATURE_FSGSBASE))      \
-                __reserved_bits |= X86_CR4_FSGSBASE;    \
-        if (!__cpu_has(__c, X86_FEATURE_PKU))           \
-                __reserved_bits |= X86_CR4_PKE;         \
-        if (!__cpu_has(__c, X86_FEATURE_LA57))          \
-                __reserved_bits |= X86_CR4_LA57;        \
-        if (!__cpu_has(__c, X86_FEATURE_UMIP))          \
-                __reserved_bits |= X86_CR4_UMIP;        \
-        __reserved_bits;                                \
-})
-
 static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
         if (cr4 & cr4_reserved_bits)
                 return -EINVAL;
 
-        if (cr4 & __cr4_reserved_bits(guest_cpuid_has, vcpu))
+        if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
                 return -EINVAL;
 
         return 0;
 }
-- cgit
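
The mask itself now lives in vcpu->arch.cr4_guest_rsvd_bits, filled in by the cpuid.c half of the patch, which this x86.c-only view does not show. A sketch of that computation, reusing the exact feature list of the deleted macro (the function name here is made up; in the kernel the logic sits inside kvm_update_cpuid()):

    /* Sketch: cache guest CR4 reserved bits whenever guest CPUID changes. */
    static void example_update_cr4_guest_rsvd_bits(struct kvm_vcpu *vcpu)
    {
            u64 rsvd = CR4_RESERVED_BITS;

            if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
                    rsvd |= X86_CR4_OSXSAVE;
            if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP))
                    rsvd |= X86_CR4_SMEP;
            if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP))
                    rsvd |= X86_CR4_SMAP;
            if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE))
                    rsvd |= X86_CR4_FSGSBASE;
            if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU))
                    rsvd |= X86_CR4_PKE;
            if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57))
                    rsvd |= X86_CR4_LA57;
            if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP))
                    rsvd |= X86_CR4_UMIP;

            vcpu->arch.cr4_guest_rsvd_bits = rsvd;
    }
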
From 761e4169346553c180bbd4a383aedd72f905bc9a Mon Sep 17 00:00:00 2001
From: Krish Sadhukhan
Date: Wed, 8 Jul 2020 00:39:56 +0000
Subject: KVM: nSVM: Check that MBZ bits in CR3 and CR4 are not set on vmrun of nested guests

According to section "Canonicalization and Consistency Checks" in APM
vol. 2, the following guest state is illegal:

        "Any MBZ bit of CR3 is set."
        "Any MBZ bit of CR4 is set."

Suggested-by: Paolo Bonzini
Signed-off-by: Krish Sadhukhan
Message-Id: <1594168797-29444-3-git-send-email-krish.sadhukhan@oracle.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 549b3f7228ac..475456a14d76 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -955,7 +955,7 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 }
 EXPORT_SYMBOL_GPL(kvm_set_xcr);
 
-static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
         if (cr4 & cr4_reserved_bits)
                 return -EINVAL;
@@ -965,6 +965,7 @@ static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 
         return 0;
 }
+EXPORT_SYMBOL_GPL(kvm_valid_cr4);
 
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-- cgit
From aedbaf4f6afdcf9da0f48f97d7e9d62f4d591e19 Mon Sep 17 00:00:00 2001
From: Xiaoyao Li
Date: Thu, 9 Jul 2020 12:34:23 +0800
Subject: KVM: x86: Extract kvm_update_cpuid_runtime() from kvm_update_cpuid()

Besides being called from kvm_vcpu_ioctl_set_cpuid*(), kvm_update_cpuid()
is called from five other places in x86.c and one in lapic.c.  All six
of those call sites only need the part that updates guest CPUID bits
(OSXSAVE, OSPKE, APIC, KVM_FEATURE_PV_UNHALT, ...) based on the runtime
vCPU state, so extract that part into a separate
kvm_update_cpuid_runtime().

Signed-off-by: Xiaoyao Li
Message-Id: <20200709043426.92712-3-xiaoyao.li@intel.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 475456a14d76..c432a445cbbe 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -940,7 +940,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
         vcpu->arch.xcr0 = xcr0;
 
         if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
-                kvm_update_cpuid(vcpu);
+                kvm_update_cpuid_runtime(vcpu);
         return 0;
 }
@@ -1004,7 +1004,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                 kvm_mmu_reset_context(vcpu);
 
         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
-                kvm_update_cpuid(vcpu);
+                kvm_update_cpuid_runtime(vcpu);
 
         return 0;
 }
@@ -2916,7 +2916,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                         if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
                                 return 1;
                         vcpu->arch.ia32_misc_enable_msr = data;
-                        kvm_update_cpuid(vcpu);
+                        kvm_update_cpuid_runtime(vcpu);
                 } else {
                         vcpu->arch.ia32_misc_enable_msr = data;
                 }
@@ -8170,7 +8170,7 @@ static void enter_smm(struct kvm_vcpu *vcpu)
         kvm_x86_ops.set_efer(vcpu, 0);
 #endif
 
-        kvm_update_cpuid(vcpu);
+        kvm_update_cpuid_runtime(vcpu);
         kvm_mmu_reset_context(vcpu);
 }
@@ -9194,7 +9194,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
                                 (X86_CR4_OSXSAVE | X86_CR4_PKE));
         kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
         if (cpuid_update_needed)
-                kvm_update_cpuid(vcpu);
+                kvm_update_cpuid_runtime(vcpu);
 
         idx = srcu_read_lock(&vcpu->kvm->srcu);
         if (is_pae_paging(vcpu)) {
-- cgit
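
A loose sketch of what the extracted helper does, assuming the cpuid.c helpers of this era (kvm_find_cpuid_entry(), cpuid_entry_change()); this is a paraphrase for orientation, not the kernel's exact code:

    /* Sketch: refresh only the state-dependent CPUID bits. */
    static void example_update_cpuid_runtime(struct kvm_vcpu *vcpu)
    {
            struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);

            if (best) {
                    /* Mirror CR4.OSXSAVE into CPUID.1:ECX[OSXSAVE]. */
                    cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
                                       kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE));
                    /* ...similar runtime updates for OSPKE, APIC, etc. */
            }
    }

The heavier work, re-validating the tables and recomputing derived state such as cr4_guest_rsvd_bits, stays on the set-CPUID ioctl path only.
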
From 841c2be09fe4f495fe5224952a419bd8c7e5b455 Mon Sep 17 00:00:00 2001
From: Maxim Levitsky
Date: Wed, 8 Jul 2020 14:57:31 +0300
Subject: kvm: x86: replace kvm_spec_ctrl_valid_bits with runtime test on the host

To avoid complex and in some cases incorrect logic when computing
kvm_spec_ctrl_valid_bits, just try the guest's given value on the host
processor instead, and if it doesn't #GP, allow the guest to set it.

One such case is when the host CPU supports the STIBP mitigation but
doesn't support IBRS (as is the case with some Zen2 AMD CPUs); here we
were giving the guest a #GP when it tried to use STIBP.

The reason we can do the host test is that the IA32_SPEC_CTRL MSR is
passed through to the guest after the guest sets it to a non-zero
value for the first time (for performance reasons).  As a result, it
is pointless to emulate the #GP condition on this first access
differently from what the host CPU does.

This is based on a patch from Sean Christopherson, who suggested this
idea.

Fixes: 6441fa6178f5 ("KVM: x86: avoid incorrect writes to host MSR_IA32_SPEC_CTRL")
Cc: stable@vger.kernel.org
Suggested-by: Sean Christopherson
Signed-off-by: Maxim Levitsky
Message-Id: <20200708115731.180097-1-mlevitsk@redhat.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c432a445cbbe..7f32169e8449 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10693,28 +10693,32 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
 
-u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu)
+
+int kvm_spec_ctrl_test_value(u64 value)
 {
-        uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD;
+        /*
+         * Test that setting IA32_SPEC_CTRL to the given value
+         * is allowed by the host processor.
+         */
+
+        u64 saved_value;
+        unsigned long flags;
+        int ret = 0;
 
-        /* The STIBP bit doesn't fault even if it's not advertised */
-        if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
-            !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
-                bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
-        if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
-            !boot_cpu_has(X86_FEATURE_AMD_IBRS))
-                bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
+        local_irq_save(flags);
 
-        if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) &&
-            !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
-                bits &= ~SPEC_CTRL_SSBD;
-        if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
-            !boot_cpu_has(X86_FEATURE_AMD_SSBD))
-                bits &= ~SPEC_CTRL_SSBD;
+        if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
+                ret = 1;
+        else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
+                ret = 1;
+        else
+                wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
 
-        return bits;
+        local_irq_restore(flags);
+
+        return ret;
 }
-EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits);
+EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
-- cgit
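
The consumers of the new helper are the vendor ->set_msr() handlers (the vmx.c/svm.c half of this change, not shown in this x86.c-only view). A sketch of the call pattern, paraphrased rather than quoted from the actual hunk:

    /*
     * Sketch of how a vendor ->set_msr() handler consumes the new helper.
     * Names such as example_set_spec_ctrl() are illustrative only.
     */
    static int example_set_spec_ctrl(struct kvm_vcpu *vcpu,
                                     struct msr_data *msr_info)
    {
            if (!msr_info->host_initiated &&
                !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
                    return 1;

            /* If the host itself would #GP on this value, fail the write. */
            if (kvm_spec_ctrl_test_value(msr_info->data))
                    return 1;

            to_vmx(vcpu)->spec_ctrl = msr_info->data;
            return 0;
    }
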
From 87fa7f3e98a1310ef1ac1900e7ee7f9610a038bc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 8 Jul 2020 21:51:54 +0200
Subject: x86/kvm: Move context tracking where it belongs

Context tracking for KVM happens way too early in the vcpu_run() code.
Anything after guest_enter_irqoff() and before guest_exit_irqoff()
cannot use RCU and should also not be instrumented.  The current way
of doing this covers way too much code.  Move it closer to the actual
vmenter/exit code.

Signed-off-by: Thomas Gleixner
Reviewed-by: Alexandre Chartre
Acked-by: Peter Zijlstra
Acked-by: Paolo Bonzini
Message-Id: <20200708195321.724574345@linutronix.de>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7f32169e8449..d7d82b3c0e4c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8522,7 +8522,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
         }
 
         trace_kvm_entry(vcpu->vcpu_id);
-        guest_enter_irqoff();
 
         fpregs_assert_state_consistent();
         if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -8585,7 +8584,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
         local_irq_disable();
         kvm_after_interrupt(vcpu);
 
-        guest_exit_irqoff();
         if (lapic_in_kernel(vcpu)) {
                 s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
                 if (delta != S64_MIN) {
-- cgit
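
The destination of the two deleted calls is the vendor vcpu_enter_exit path in vmx.c/svm.c, outside this file. A sketch of the shape it takes, with a stand-in name for the low-level entry routine:

    /*
     * Sketch: after the move, only the actual VM-enter/exit sits inside
     * the RCU-off, non-instrumentable window.  __vmenter_and_run_guest()
     * is a stand-in for the real assembly entry point.
     */
    static noinstr void example_vcpu_enter_exit(struct kvm_vcpu *vcpu)
    {
            guest_enter_irqoff();           /* stop host context tracking */

            __vmenter_and_run_guest(vcpu);  /* the only code RCU can't see */

            guest_exit_irqoff();            /* resume host context tracking */
    }
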
From 3ebccdf373c21d8697782b7e8d5af0adc9c26e04 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 8 Jul 2020 21:51:57 +0200
Subject: x86/kvm/vmx: Move guest enter/exit into .noinstr.text

Move the functions which are inside the RCU-off region into the
non-instrumentable text section.

Signed-off-by: Thomas Gleixner
Reviewed-by: Alexandre Chartre
Acked-by: Peter Zijlstra
Acked-by: Paolo Bonzini
Message-Id: <20200708195322.037311579@linutronix.de>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d7d82b3c0e4c..e27d3db7e43f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -402,7 +402,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 }
 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 
-asmlinkage __visible void kvm_spurious_fault(void)
+asmlinkage __visible noinstr void kvm_spurious_fault(void)
 {
         /* Fault while not rebooting. We want the trace. */
         BUG_ON(!kvm_rebooting);
-- cgit
From d574c539c3c4d8a3f9e07c8a0785954d22a63dcb Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov
Date: Fri, 10 Jul 2020 17:25:59 +0200
Subject: KVM: x86: move MSR_IA32_PERF_CAPABILITIES emulation to common x86 code

state_test/smm_test selftests are failing on AMD with:
"Unexpected result from KVM_GET_MSRS, r: 51 (failed MSR was 0x345)"

MSR_IA32_PERF_CAPABILITIES is an emulated MSR on Intel, but it is not
known to the AMD code; we can move the emulation to common x86 code.
For AMD, we basically just allow the host to read and write zero to
the MSR.

Fixes: 27461da31089 ("KVM: x86/pmu: Support full width counting")
Suggested-by: Jim Mattson
Suggested-by: Paolo Bonzini
Signed-off-by: Vitaly Kuznetsov
Message-Id: <20200710152559.1645827-1-vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e27d3db7e43f..69a2e9c981e9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2857,6 +2857,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                         return 1;
                 vcpu->arch.arch_capabilities = data;
                 break;
+        case MSR_IA32_PERF_CAPABILITIES: {
+                struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
+
+                if (!msr_info->host_initiated)
+                        return 1;
+                if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
+                        return 1;
+                if (data & ~msr_ent.data)
+                        return 1;
+
+                vcpu->arch.perf_capabilities = data;
+
+                return 0;
+        }
         case MSR_EFER:
                 return set_efer(vcpu, msr_info);
         case MSR_K7_HWCR:
@@ -3197,6 +3211,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                         return 1;
                 msr_info->data = vcpu->arch.arch_capabilities;
                 break;
+        case MSR_IA32_PERF_CAPABILITIES:
+                if (!msr_info->host_initiated &&
+                    !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+                        return 1;
+                msr_info->data = vcpu->arch.perf_capabilities;
+                break;
         case MSR_IA32_POWER_CTL:
                 msr_info->data = vcpu->arch.msr_ia32_power_ctl;
                 break;
-- cgit
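
With the emulation in common code, the failing selftest sequence reduces to a host-initiated KVM_SET_MSRS of MSR 0x345 with value zero, which now succeeds on AMD as well. A minimal userspace sketch of that operation (error handling trimmed; the function name is made up):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    #define MSR_IA32_PERF_CAPABILITIES 0x345

    /* 'vcpu_fd' is an open KVM vCPU fd.  Returns 0 on success. */
    static int clear_perf_capabilities(int vcpu_fd)
    {
            struct {
                    struct kvm_msrs hdr;
                    struct kvm_msr_entry entries[1];
            } req;

            memset(&req, 0, sizeof(req));
            req.hdr.nmsrs = 1;
            req.entries[0].index = MSR_IA32_PERF_CAPABILITIES;
            req.entries[0].data = 0;  /* always accepted after this commit */

            /* KVM_SET_MSRS returns the number of MSRs set; 1 means success. */
            return ioctl(vcpu_fd, KVM_SET_MSRS, &req) == 1 ? 0 : -1;
    }
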
From 897861479c0640ed93ec82db78f8d839df32c4ac Mon Sep 17 00:00:00 2001
From: Mohammed Gamal
Date: Fri, 10 Jul 2020 17:48:03 +0200
Subject: KVM: x86: Add helper functions for illegal GPA checking and page fault injection

This patch adds two helper functions that will be used to support
virtualizing MAXPHYADDR in both kvm-intel.ko and kvm.ko.

kvm_fixup_and_inject_pf_error() injects a page fault for a
user-specified GVA, while kvm_mmu_is_illegal_gpa() checks whether a
GPA exceeds the vCPU's address limits.

Signed-off-by: Mohammed Gamal
Signed-off-by: Paolo Bonzini
Message-Id: <20200710154811.418214-2-mgamal@redhat.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 69a2e9c981e9..0867a626b226 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10738,6 +10738,27 @@ int kvm_spec_ctrl_test_value(u64 value)
 }
 EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
 
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
+{
+        struct x86_exception fault;
+
+        if (!(error_code & PFERR_PRESENT_MASK) ||
+            vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, error_code, &fault) != UNMAPPED_GVA) {
+                /*
+                 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
+                 * tables probably do not match the TLB.  Just proceed
+                 * with the error code that the processor gave.
+                 */
+                fault.vector = PF_VECTOR;
+                fault.error_code_valid = true;
+                fault.error_code = error_code;
+                fault.nested_page_fault = false;
+                fault.address = gva;
+        }
+        vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
+}
+EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
-- cgit
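
The second helper named in the subject, kvm_mmu_is_illegal_gpa(), lands in mmu.h and therefore does not show up in this x86.c-only view. It boils down to a one-line range check along these lines (a sketch, not the verbatim header hunk):

    /* Sketch: does the GPA exceed the vCPU's guest MAXPHYADDR? */
    static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
    {
            return gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu));
    }
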
From 6986982fef86ae71a27d6309739a0c38b562a5c0 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Fri, 10 Jul 2020 17:48:06 +0200
Subject: KVM: x86: rename update_bp_intercept to update_exception_bitmap

We would like to introduce a callback to update the #PF intercept when
CPUID changes.  Just repurpose update_bp_intercept, since VMX already
implements it with update_exception_bitmap rather than a bespoke
function.

While at it, remove an unnecessary assignment in the SVM version, which
is already done in the caller (kvm_arch_vcpu_ioctl_set_guest_debug) and
has nothing to do with the exception bitmap.

Reviewed-by: Jim Mattson
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0867a626b226..35abe69aad28 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9316,7 +9316,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
          */
         kvm_set_rflags(vcpu, rflags);
 
-        kvm_x86_ops.update_bp_intercept(vcpu);
+        kvm_x86_ops.update_exception_bitmap(vcpu);
 
         r = 0;
-- cgit
From 3edd68399dc155b80335244c8c2673eaa652931a Mon Sep 17 00:00:00 2001
From: Mohammed Gamal
Date: Fri, 10 Jul 2020 17:48:11 +0200
Subject: KVM: x86: Add a capability for GUEST_MAXPHYADDR < HOST_MAXPHYADDR support

This patch adds a new capability, KVM_CAP_SMALLER_MAXPHYADDR, which
allows userspace to query whether the underlying architecture would
support GUEST_MAXPHYADDR < HOST_MAXPHYADDR and act accordingly (e.g.
qemu can decide whether it should warn for -cpu ..,phys-bits=X).

The complications in this patch are due to unexpected (but documented)
behaviour we see with NPF vmexit handling in AMD processors.  If SVM is
modified to add guest physical address checks in the NPF and guest #PF
paths, we see the following error multiple times in the 'access' test
in kvm-unit-tests:

        test pte.p pte.36 pde.p: FAIL: pte 2000021 expected 2000001
        Dump mapping:
        address: 0x123400000000
        ------L4: 24c3027
        ------L3: 24c4027
        ------L2: 24c5021
        ------L1: 1002000021

This is because the PTE's accessed bit is set by the CPU hardware
before the NPF vmexit.  This is handled completely by hardware and
cannot be fixed in software.

Therefore, availability of the new capability depends on a boolean
variable, allow_smaller_maxphyaddr, which is set individually by the
VMX and SVM init routines.  On VMX it's always set to true; on SVM
it's only set to true when NPT is not enabled.

CC: Tom Lendacky
CC: Babu Moger
Signed-off-by: Mohammed Gamal
Message-Id: <20200710154811.418214-10-mgamal@redhat.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 35abe69aad28..95ef62922869 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -187,6 +187,9 @@ static struct kvm_shared_msrs __percpu *shared_msrs;
 u64 __read_mostly host_efer;
 EXPORT_SYMBOL_GPL(host_efer);
 
+bool __read_mostly allow_smaller_maxphyaddr;
+EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+
 static u64 __read_mostly host_xss;
 u64 __read_mostly supported_xss;
 EXPORT_SYMBOL_GPL(supported_xss);
@@ -3574,6 +3577,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
         case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
                 r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
                 break;
+        case KVM_CAP_SMALLER_MAXPHYADDR:
+                r = (int) allow_smaller_maxphyaddr;
+                break;
         default:
                 break;
         }
-- cgit
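
From userspace the new capability is probed like any other, via KVM_CHECK_EXTENSION on the system fd. A small sketch, assuming UAPI headers new enough to define KVM_CAP_SMALLER_MAXPHYADDR:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            if (kvm < 0) {
                    perror("open /dev/kvm");
                    return 1;
            }

            /* Returns 1 when guest MAXPHYADDR < host MAXPHYADDR is supported. */
            int r = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_SMALLER_MAXPHYADDR);
            printf("smaller guest MAXPHYADDR %ssupported\n", r == 1 ? "" : "not ");
            return 0;
    }
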
From d468d94b7bafa7a2dd9bc72e5f7868469be3f7c4 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Wed, 15 Jul 2020 20:41:20 -0700
Subject: KVM: x86: Dynamically calculate TDP level from max level and MAXPHYADDR

Calculate the desired TDP level on the fly using the max TDP level and
MAXPHYADDR instead of doing the same when CPUID is updated.  This
avoids the hidden dependency on cpuid_maxphyaddr() in
vmx_get_tdp_level() and also standardizes the "use 5-level paging iff
MAXPHYADDR > 48" behavior across x86.

Suggested-by: Paolo Bonzini
Signed-off-by: Sean Christopherson
Message-Id: <20200716034122.5998-8-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 95ef62922869..41f43bb716c1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9520,7 +9520,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
         fx_init(vcpu);
 
         vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
-        vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu);
+        vcpu->arch.max_tdp_level = kvm_x86_ops.get_max_tdp_level();
 
         vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
-- cgit
From 83013059bdc5486dcbcd33b5c4abf8431766e06c Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Wed, 15 Jul 2020 20:41:22 -0700
Subject: KVM: x86: Specify max TDP level via kvm_configure_mmu()

Capture the max TDP level during kvm_configure_mmu() instead of using a
kvm_x86_ops hook to do it at every vCPU creation.

Signed-off-by: Sean Christopherson
Message-Id: <20200716034122.5998-10-sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/kvm/x86.c')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 41f43bb716c1..dc4370394ab8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9520,7 +9520,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
         fx_init(vcpu);
 
         vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
-        vcpu->arch.max_tdp_level = kvm_x86_ops.get_max_tdp_level();
 
         vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
-- cgit
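
Together the two patches leave the effective TDP level to be computed on demand from the captured max level and the guest's MAXPHYADDR. A sketch of that calculation, paraphrasing the mmu.c side of the series (here max_tdp_level stands for the value captured by kvm_configure_mmu()):

    /* Sketch: choose the TDP page-table depth when the MMU is set up. */
    static int example_get_tdp_level(struct kvm_vcpu *vcpu)
    {
            /* Use 5-level TDP iff the guest may use physical addresses > 48 bits. */
            if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
                    return 4;

            return max_tdp_level;
    }
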