From 3f16a5c318392cbb5a0c7a3d19dff8c8ef3c38ee Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 26 Jun 2019 14:16:13 +0200 Subject: KVM: x86: degrade WARN to pr_warn_ratelimited This warning can be triggered easily by userspace, so it should certainly not cause a panic if panic_on_warn is set. Reported-by: syzbot+c03f30b4f4c46bdf8575@syzkaller.appspotmail.com Suggested-by: Alexander Potapenko Acked-by: Alexander Potapenko Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9857992d4e58..fafd81d2c9ea 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1554,7 +1554,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) vcpu->arch.tsc_always_catchup = 1; return 0; } else { - WARN(1, "user requested TSC rate below hardware speed\n"); + pr_warn_ratelimited("user requested TSC rate below hardware speed\n"); return -1; } } @@ -1564,8 +1564,8 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) user_tsc_khz, tsc_khz); if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { - WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", - user_tsc_khz); + pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", + user_tsc_khz); return -1; } -- cgit From 65b712f1560abdd9ebec005e9bd17c21ecacc849 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Tue, 25 Jun 2019 14:26:42 +0300 Subject: KVM: nVMX: Allow restore nested-state to enable eVMCS when vCPU in SMM As comment in code specifies, SMM temporarily disables VMX so we cannot be in guest mode, nor can VMLAUNCH/VMRESUME be pending. However, code currently assumes that these are the only flags that can be set on kvm_state->flags. This is not true as KVM_STATE_NESTED_EVMCS can also be set on this field to signal that eVMCS should be enabled. Therefore, fix code to check for guest-mode and pending VMLAUNCH/VMRESUME explicitly. 
Reviewed-by: Joao Martins Signed-off-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 5f9c1a200201..adbf4fc77ad8 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5373,7 +5373,10 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags * must be zero. */ - if (is_smm(vcpu) ? kvm_state->flags : kvm_state->hdr.vmx.smm.flags) + if (is_smm(vcpu) ? + (kvm_state->flags & + (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) + : kvm_state->hdr.vmx.smm.flags) return -EINVAL; if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && -- cgit From 323d73a8ecad22bf3284f11112a7cce576ade6af Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Wed, 26 Jun 2019 16:09:27 +0300 Subject: KVM: nVMX: Change KVM_STATE_NESTED_EVMCS to signal vmcs12 is copied from eVMCS Currently KVM_STATE_NESTED_EVMCS is used to signal that eVMCS capability is enabled on vCPU. As indicated by vmx->nested.enlightened_vmcs_enabled. This is quite bizarre as userspace VMM should make sure to expose same vCPU with same CPUID values in both source and destination. In case vCPU is exposed with eVMCS support on CPUID, it is also expected to enable KVM_CAP_HYPERV_ENLIGHTENED_VMCS capability. Therefore, KVM_STATE_NESTED_EVMCS is redundant. KVM_STATE_NESTED_EVMCS is currently used on restore path (vmx_set_nested_state()) only to enable eVMCS capability in KVM and to signal need_vmcs12_sync such that on next VMEntry to guest nested_sync_from_vmcs12() will be called to sync vmcs12 content into eVMCS in guest memory. However, because restore nested-state is rare enough, we could have just modified vmx_set_nested_state() to always signal need_vmcs12_sync. From all the above, it seems that we could have just removed the usage of KVM_STATE_NESTED_EVMCS. 
However, in order to preserve backwards migration compatibility, we cannot do that. (vmx_get_nested_state() needs to signal the flag when migrating from a new kernel to an old kernel). Returning KVM_STATE_NESTED_EVMCS when the vCPU merely has eVMCS enabled has the bad side-effect of the userspace VMM having to send nested-state from source to destination as part of the migration stream, even if the guest has never used eVMCS because it does not even run a nested hypervisor workload. This requires the destination userspace VMM and KVM to support setting nested-state, which makes it more difficult to migrate from a new host to an older host. To avoid this, change KVM_STATE_NESTED_EVMCS to signal that eVMCS is not only enabled but also active, i.e. the guest has made some eVMCS active via an enlightened VMEntry, i.e. vmcs12 is copied from eVMCS and therefore should be restored into the eVMCS resident in memory (by copy_vmcs12_to_enlightened()). Reviewed-by: Vitaly Kuznetsov Reviewed-by: Maran Wilson Reviewed-by: Krish Sadhukhan Signed-off-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 25 ++++++++++++++++--------- tools/testing/selftests/kvm/x86_64/evmcs_test.c | 1 + 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index adbf4fc77ad8..46af3a5e9209 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5240,9 +5240,6 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, vmx = to_vmx(vcpu); vmcs12 = get_vmcs12(vcpu); - if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled) - kvm_state.flags |= KVM_STATE_NESTED_EVMCS; - if (nested_vmx_allowed(vcpu) && (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; @@ -5251,6 +5248,9 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (vmx_has_valid_vmcs12(vcpu)) { kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); + if (vmx->nested.hv_evmcs) + kvm_state.flags |= KVM_STATE_NESTED_EVMCS; + if 
(is_guest_mode(vcpu) && nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != -1ull) @@ -5350,6 +5350,15 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) return -EINVAL; + /* + * KVM_STATE_NESTED_EVMCS used to signal that KVM should + * enable eVMCS capability on vCPU. However, since then + * code was changed such that flag signals vmcs12 should + * be copied into eVMCS in guest memory. + * + * To preserve backwards compatability, allow user + * to set this flag even when there is no VMXON region. + */ if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) return -EINVAL; } else { @@ -5358,7 +5367,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) return -EINVAL; - } + } if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) @@ -5383,13 +5392,11 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; - vmx_leave_nested(vcpu); - if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { - if (!nested_vmx_allowed(vcpu)) + if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && + (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) return -EINVAL; - nested_enable_evmcs(vcpu, NULL); - } + vmx_leave_nested(vcpu); if (kvm_state->hdr.vmx.vmxon_pa == -1ull) return 0; diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c index b38260e29775..241919ef1eac 100644 --- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c +++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c @@ -146,6 +146,7 @@ int main(int argc, char *argv[]) kvm_vm_restart(vm, O_RDWR); vm_vcpu_add(vm, VCPU_ID, 0, 0); vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + vcpu_ioctl(vm, VCPU_ID, KVM_ENABLE_CAP, &enable_evmcs_cap); vcpu_load_state(vm, VCPU_ID, state); run = 
vcpu_state(vm, VCPU_ID); free(state); -- cgit From bb34e690e9340bc155ebed5a3d75fc63ff69e082 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 2 Jul 2019 17:25:02 +0800 Subject: KVM: LAPIC: Fix pending interrupt in IRR blocked by software disable LAPIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thomas reported that: | Background: | | In preparation of supporting IPI shorthands I changed the CPU offline | code to software disable the local APIC instead of just masking it. | That's done by clearing the APIC_SPIV_APIC_ENABLED bit in the APIC_SPIV | register. | | Failure: | | When the CPU comes back online the startup code triggers occasionally | the warning in apic_pending_intr_clear(). That complains that the IRRs | are not empty. | | The offending vector is the local APIC timer vector whose IRR bit is set | and stays set. | | It took me quite some time to reproduce the issue locally, but now I can | see what happens. | | It requires apicv_enabled=0, i.e. full apic emulation. With apicv_enabled=1 | (and hardware support) it behaves correctly. | | Here is the series of events: | | Guest CPU | | goes down | | native_cpu_disable() | | apic_soft_disable(); | | play_dead() | | .... | | startup() | | if (apic_enabled()) | apic_pending_intr_clear() <- Not taken | | enable APIC | | apic_pending_intr_clear() <- Triggers warning because IRR is stale | | When this happens then the deadline timer or the regular APIC timer - | happens with both, has fired shortly before the APIC is disabled, but the | interrupt was not serviced because the guest CPU was in an interrupt | disabled region at that point. | | The state of the timer vector ISR/IRR bits: | | ISR IRR | before apic_soft_disable() 0 1 | after apic_soft_disable() 0 1 | | On startup 0 1 | | Now one would assume that the IRR is cleared after the INIT reset, but this | happens only on CPU0. | | Why? 
| | Because our CPU0 hotplug is just for testing to make sure nothing breaks | and goes through an NMI wakeup vehicle because INIT would send it through | the bootstrap code which is not really working if that CPU was not | physically unplugged. | | Now looking at a real world APIC the situation in that case is: | | ISR IRR | before apic_soft_disable() 0 1 | after apic_soft_disable() 0 1 | | On startup 0 0 | | Why? | | Once the dying CPU reenables interrupts the pending interrupt gets | delivered as a spurious interrupt and then the state is clear. | | While that CPU0 hotplug test case is surely an esoteric issue, the APIC | emulation is still wrong. Even if the play_dead() code would not enable | interrupts then the pending IRR bit would turn into an ISR .. interrupt | when the APIC is reenabled on startup. From SDM 10.4.7.2 Local APIC State After It Has Been Software Disabled * Pending interrupts in the IRR and ISR registers are held and require masking or handling by the CPU. In Thomas's testing, hardware cpu will not respect soft disable LAPIC when IRR has already been set or APICv posted-interrupt is in flight, so we can skip soft disable APIC checking when clearing IRR and setting ISR, continue to respect soft disable APIC when attempting to set IRR. 
Reported-by: Rong Chen Reported-by: Feng Tang Reported-by: Thomas Gleixner Tested-by: Thomas Gleixner Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Thomas Gleixner Cc: Rong Chen Cc: Feng Tang Cc: stable@vger.kernel.org Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a21c440ff356..4dabc318adb8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2339,7 +2339,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u32 ppr; - if (!apic_enabled(apic)) + if (!kvm_apic_hw_enabled(apic)) return -1; __apic_update_ppr(apic, &ppr); -- cgit From e644fa18e2ffc8895ca30dade503ae10128573a6 Mon Sep 17 00:00:00 2001 From: Zhang Lei Date: Wed, 3 Jul 2019 18:42:50 +0100 Subject: KVM: arm64/sve: Fix vq_present() macro to yield a bool The original implementation of vq_present() relied on aggressive inlining in order for the compiler to know that the code is correct, due to some const-casting issues. This was causing sparse and clang to complain, while GCC compiled cleanly. Commit 0c529ff789bc addressed this problem, but since vq_present() is no longer a function, there is now no implicit casting of the returned value to the return type (bool). In set_sve_vls(), this uncast bit value is compared against a bool, and so may spuriously compare as unequal when both are nonzero. As a result, KVM may reject valid SVE vector length configurations as invalid, and vice versa. Fix it by forcing the returned value to a bool. 
Signed-off-by: Zhang Lei Fixes: 0c529ff789bc ("KVM: arm64: Implement vq_present() as a macro") Signed-off-by: Dave Martin [commit message rewrite] Cc: Viresh Kumar Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/guest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index c2afa7982047..dfd626447482 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -208,7 +208,7 @@ out: #define vq_word(vq) (((vq) - SVE_VQ_MIN) / 64) #define vq_mask(vq) ((u64)1 << ((vq) - SVE_VQ_MIN) % 64) -#define vq_present(vqs, vq) ((vqs)[vq_word(vq)] & vq_mask(vq)) +#define vq_present(vqs, vq) (!!((vqs)[vq_word(vq)] & vq_mask(vq))) static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { -- cgit