From d33294541889b023068522270cd4153ddd8e4635 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 19 Mar 2020 13:41:06 -0400 Subject: KVM: x86: remove bogus user-triggerable WARN_ON The WARN_ON is essentially comparing a user-provided value with 0. It is trivial to trigger it just by passing garbage to KVM_SET_CLOCK. Guests can break if you do so, but the same applies to every KVM_SET_* ioctl. So, if it hurts when you do like this, just do not do it. Reported-by: syzbot+00be5da1d75f1cc95f6b@syzkaller.appspotmail.com Fixes: 9446e6fce0ab ("KVM: x86: fix WARN_ON check of an unsigned less than zero") Cc: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3156e25b0774..d65ff2008cf1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2444,7 +2444,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; - WARN_ON((s64)vcpu->hv_clock.system_time < 0); /* If the host uses TSC clocksource, then it is stable */ pvclock_flags = 0; -- cgit From 2da1ed62d55c6cbebbdee924f6af4e87bb6666e5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 20 Mar 2020 13:34:50 -0400 Subject: KVM: SVM: document KVM_MEM_ENCRYPT_OP, let userspace detect if SEV is available Userspace has no way to query if SEV has been disabled with the sev module parameter of kvm-amd.ko. Actually it has one, but it is a hack: do ioctl(KVM_MEM_ENCRYPT_OP, NULL) and check if it returns EFAULT. Make it a little nicer by returning zero for SEV enabled and NULL argument, and while at it document the ioctl arguments. Cc: Brijesh Singh Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/amd-memory-encryption.rst | 25 ++++++++++++++++++++++++ arch/x86/kvm/svm.c | 3 +++ 2 files changed, 28 insertions(+) diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst index d18c97b4e140..c3129b9ba5cb 100644 --- a/Documentation/virt/kvm/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/amd-memory-encryption.rst @@ -53,6 +53,29 @@ key management interface to perform common hypervisor activities such as encrypting bootstrap code, snapshot, migrating and debugging the guest. For more information, see the SEV Key Management spec [api-spec]_ +The main ioctl to access SEV is KVM_MEM_ENCRYPT_OP. If the argument +to KVM_MEM_ENCRYPT_OP is NULL, the ioctl returns 0 if SEV is enabled +and ``ENOTTY` if it is disabled (on some older versions of Linux, +the ioctl runs normally even with a NULL argument, and therefore will +likely return ``EFAULT``). If non-NULL, the argument to KVM_MEM_ENCRYPT_OP +must be a struct kvm_sev_cmd:: + + struct kvm_sev_cmd { + __u32 id; + __u64 data; + __u32 error; + __u32 sev_fd; + }; + + +The ``id`` field contains the subcommand, and the ``data`` field points to +another struct containing arguments specific to command. The ``sev_fd`` +should point to a file descriptor that is opened on the ``/dev/sev`` +device, if needed (see individual commands). + +On output, ``error`` is zero on success, or an error code. Error codes +are defined in ```. + KVM implements the following commands to support common lifecycle events of SEV guests, such as launching, running, snapshotting, migrating and decommissioning. @@ -90,6 +113,8 @@ Returns: 0 on success, -negative on error On success, the 'handle' field contains a new handle and on error, a negative value. +KVM_SEV_LAUNCH_START requires the ``sev_fd`` field to be valid. + For more details, see SEV spec Section 6.2. 3. KVM_SEV_LAUNCH_UPDATE_DATA diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 91000501756e..f0aa9ff9666f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -7158,6 +7158,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) if (!svm_sev_enabled()) return -ENOTTY; + if (!argp) + return 0; + if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd))) return -EFAULT; -- cgit From 2e2409afe5f0c284c7dfe5504058e8d115806a7d Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 20 Mar 2020 11:07:07 -0500 Subject: KVM: SVM: Issue WBINVD after deactivating an SEV guest Currently, CLFLUSH is used to flush SEV guest memory before the guest is terminated (or a memory hotplug region is removed). However, CLFLUSH is not enough to ensure that SEV guest tagged data is flushed from the cache. With 33af3a7ef9e6 ("KVM: SVM: Reduce WBINVD/DF_FLUSH invocations"), the original WBINVD was removed. This then exposed crashes at random times because of a cache flush race with a page that had both a hypervisor and a guest tag in the cache. Restore the WBINVD when destroying an SEV guest and add a WBINVD to the svm_unregister_enc_region() function to ensure hotplug memory is flushed when removed. The DF_FLUSH can still be avoided at this point. Fixes: 33af3a7ef9e6 ("KVM: SVM: Reduce WBINVD/DF_FLUSH invocations") Signed-off-by: Tom Lendacky Message-Id: Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f0aa9ff9666f..50d1ebafe0b3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1933,14 +1933,6 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) static void __unregister_enc_region_locked(struct kvm *kvm, struct enc_region *region) { - /* - * The guest may change the memory encryption attribute from C=0 -> C=1 - * or vice versa for this memory range. Lets make sure caches are - * flushed to ensure that guest data gets written into memory with - * correct C-bit. - */ - sev_clflush_pages(region->pages, region->npages); - sev_unpin_memory(kvm, region->pages, region->npages); list_del(®ion->list); kfree(region); @@ -1970,6 +1962,13 @@ static void sev_vm_destroy(struct kvm *kvm) mutex_lock(&kvm->lock); + /* + * Ensure that all guest tagged cache entries are flushed before + * releasing the pages back to the system for use. CLFLUSH will + * not do this, so issue a WBINVD. + */ + wbinvd_on_all_cpus(); + /* * if userspace was terminated before unregistering the memory regions * then lets unpin all the registered memory. @@ -7288,6 +7287,13 @@ static int svm_unregister_enc_region(struct kvm *kvm, goto failed; } + /* + * Ensure that all guest tagged cache entries are flushed before + * releasing the pages back to the system for use. CLFLUSH will + * not do this, so issue a WBINVD. + */ + wbinvd_on_all_cpus(); + __unregister_enc_region_locked(kvm, region); mutex_unlock(&kvm->lock); -- cgit From edec6e015a02003c2af0ce82c54ea016b5a9e3f0 Mon Sep 17 00:00:00 2001 From: He Zhe Date: Fri, 20 Mar 2020 15:06:07 +0800 Subject: KVM: LAPIC: Mark hrtimer for period or oneshot mode to expire in hard interrupt context apic->lapic_timer.timer was initialized with HRTIMER_MODE_ABS_HARD but started later with HRTIMER_MODE_ABS, which may cause the following warning in PREEMPT_RT kernel. WARNING: CPU: 1 PID: 2957 at kernel/time/hrtimer.c:1129 hrtimer_start_range_ns+0x348/0x3f0 CPU: 1 PID: 2957 Comm: qemu-system-x86 Not tainted 5.4.23-rt11 #1 Hardware name: Supermicro SYS-E300-9A-8C/A2SDi-8C-HLN4F, BIOS 1.1a 09/18/2018 RIP: 0010:hrtimer_start_range_ns+0x348/0x3f0 Code: 4d b8 0f 94 c1 0f b6 c9 e8 35 f1 ff ff 4c 8b 45 b0 e9 3b fd ff ff e8 d7 3f fa ff 48 98 4c 03 34 c5 a0 26 bf 93 e9 a1 fd ff ff <0f> 0b e9 fd fc ff ff 65 8b 05 fa b7 90 6d 89 c0 48 0f a3 05 60 91 RSP: 0018:ffffbc60026ffaf8 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff9d81657d4110 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000006cc7987bcf RDI: ffff9d81657d4110 RBP: ffffbc60026ffb58 R08: 0000000000000001 R09: 0000000000000010 R10: 0000000000000000 R11: 0000000000000000 R12: 0000006cc7987bcf R13: 0000000000000000 R14: 0000006cc7987bcf R15: ffffbc60026d6a00 FS: 00007f401daed700(0000) GS:ffff9d81ffa40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000ffffffff CR3: 0000000fa7574000 CR4: 00000000003426e0 Call Trace: ? kvm_release_pfn_clean+0x22/0x60 [kvm] start_sw_timer+0x85/0x230 [kvm] ? vmx_vmexit+0x1b/0x30 [kvm_intel] kvm_lapic_switch_to_sw_timer+0x72/0x80 [kvm] vmx_pre_block+0x1cb/0x260 [kvm_intel] ? vmx_vmexit+0xf/0x30 [kvm_intel] ? vmx_vmexit+0x1b/0x30 [kvm_intel] ? vmx_vmexit+0xf/0x30 [kvm_intel] ? vmx_vmexit+0x1b/0x30 [kvm_intel] ? vmx_vmexit+0xf/0x30 [kvm_intel] ? vmx_vmexit+0x1b/0x30 [kvm_intel] ? vmx_vmexit+0xf/0x30 [kvm_intel] ? vmx_vmexit+0xf/0x30 [kvm_intel] ? vmx_vmexit+0x1b/0x30 [kvm_intel] ? vmx_vmexit+0xf/0x30 [kvm_intel] ? vmx_vmexit+0x1b/0x30 [kvm_intel] ? vmx_vmexit+0xf/0x30 [kvm_intel] ? vmx_vmexit+0x1b/0x30 [kvm_intel] ? vmx_vmexit+0xf/0x30 [kvm_intel] ? vmx_vmexit+0x1b/0x30 [kvm_intel] ? vmx_vmexit+0xf/0x30 [kvm_intel] ? vmx_sync_pir_to_irr+0x9e/0x100 [kvm_intel] ? kvm_apic_has_interrupt+0x46/0x80 [kvm] kvm_arch_vcpu_ioctl_run+0x85b/0x1fa0 [kvm] ? _raw_spin_unlock_irqrestore+0x18/0x50 ? _copy_to_user+0x2c/0x30 kvm_vcpu_ioctl+0x235/0x660 [kvm] ? rt_spin_unlock+0x2c/0x50 do_vfs_ioctl+0x3e4/0x650 ? __fget+0x7a/0xa0 ksys_ioctl+0x67/0x90 __x64_sys_ioctl+0x1a/0x20 do_syscall_64+0x4d/0x120 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f4027cc54a7 Code: 00 00 90 48 8b 05 e9 59 0c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b9 59 0c 00 f7 d8 64 89 01 48 RSP: 002b:00007f401dae9858 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00005558bd029690 RCX: 00007f4027cc54a7 RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 000000000000000d RBP: 00007f4028b72000 R08: 00005558bc829ad0 R09: 00000000ffffffff R10: 00005558bcf90ca0 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 00005558bce1c840 --[ end trace 0000000000000002 ]-- Signed-off-by: He Zhe Message-Id: <1584687967-332859-1-git-send-email-zhe.he@windriver.com> Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e3099c642fec..929511e01960 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1715,7 +1715,7 @@ static void start_sw_period(struct kvm_lapic *apic) hrtimer_start(&apic->lapic_timer.timer, apic->lapic_timer.target_expiration, - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_HARD); } bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) -- cgit From 428b8f1d9f92f838b73997adc10046d3c6e05790 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 23 Mar 2020 12:12:43 -0700 Subject: KVM: VMX: don't allow memory operands for inline asm that modifies SP THUNK_TARGET defines [thunk_target] as having "rm" input constraints when CONFIG_RETPOLINE is not set, which isn't constrained enough for this specific case. For inline assembly that modifies the stack pointer before using this input, the underspecification of constraints is dangerous, and results in an indirect call to a previously pushed flags register. In this case `entry`'s stack slot is good enough to satisfy the "m" constraint in "rm", but the inline assembly in handle_external_interrupt_irqoff() modifies the stack pointer via push+pushf before using this input, which in this case results in calling what was the previous state of the flags register, rather than `entry`. Be more specific in the constraints by requiring `entry` be in a register, and not a memory operand. Reported-by: Dmitry Vyukov Reported-by: syzbot+3f29ca2efb056a761e38@syzkaller.appspotmail.com Debugged-by: Alexander Potapenko Debugged-by: Paolo Bonzini Debugged-by: Sean Christopherson Signed-off-by: Nick Desaulniers Message-Id: <20200323191243.30002-1-ndesaulniers@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 26f8f31563e9..079d9fbf278e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6287,7 +6287,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) #endif ASM_CALL_CONSTRAINT : - THUNK_TARGET(entry), + [thunk_target]"r"(entry), [ss]"i"(__KERNEL_DS), [cs]"i"(__KERNEL_CS) ); -- cgit From 94be4b85d89520329f766c7d69a4d74bcc87f5b5 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 24 Mar 2020 14:32:10 +0800 Subject: KVM: LAPIC: Also cancel preemption timer when disarm LAPIC timer The timer is disarmed when switching between TSC deadline and other modes, we should set everything to disarmed state, however, LAPIC timer can be emulated by preemption timer, it still works if vmx->hv_deadline_timer is not -1. This patch also cancels preemption timer when disarm LAPIC timer. Signed-off-by: Wanpeng Li Message-Id: <1585031530-19823-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 929511e01960..7356a56e6282 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1445,6 +1445,8 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic) } } +static void cancel_hv_timer(struct kvm_lapic *apic); + static void apic_update_lvtt(struct kvm_lapic *apic) { u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & @@ -1454,6 +1456,10 @@ static void apic_update_lvtt(struct kvm_lapic *apic) if (apic_lvtt_tscdeadline(apic) != (timer_mode == APIC_LVT_TIMER_TSCDEADLINE)) { hrtimer_cancel(&apic->lapic_timer.timer); + preempt_disable(); + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_timer(apic); + preempt_enable(); kvm_lapic_set_reg(apic, APIC_TMICT, 0); apic->lapic_timer.period = 0; apic->lapic_timer.tscdeadline = 0; -- cgit From e1be9ac8e6014a9b0a216aebae7250f9863e9fc3 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 26 Mar 2020 10:20:01 +0800 Subject: KVM: X86: Narrow down the IPI fastpath to single target IPI The original single target IPI fastpath patch forgot to filter the ICR destination shorthand field. Multicast IPI is not suitable for this feature since wakeup the multiple sleeping vCPUs will extend the interrupt disabled time, it especially worse in the over-subscribe and VM has a little bit more vCPUs scenario. Let's narrow it down to single target IPI. Two VMs, each is 76 vCPUs, one running 'ebizzy -M', the other running cyclictest on all vCPUs, w/ this patch, the avg score of cyclictest can improve more than 5%. (pv tlb, pv ipi, pv sched yield are disabled during testing to avoid the disturb). Signed-off-by: Wanpeng Li Message-Id: <1585189202-1708-3-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d65ff2008cf1..cf95c36cb4f4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1554,7 +1554,10 @@ EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); */ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) { - if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic) && + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) + return 1; + + if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) { -- cgit