From e080e538e69753e6934be0d23144cb1a9a550589 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 13 Feb 2020 10:53:25 +0800 Subject: KVM: x86: eliminate some unreachable code This code is unreachable; remove it. Signed-off-by: Miaohe Lin Reviewed-by: Krish Sadhukhan Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 359fcd395132..de7f3e3f277c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3077,7 +3077,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); - break; case MSR_IA32_TSCDEADLINE: msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; @@ -3160,7 +3159,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); - break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current * silicon. It is however accessed by winxp in very narrow @@ -8484,7 +8482,6 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) break; default: return -EINTR; - break; } return 1; } -- cgit From 92daa48b34d784748b575ae424def4ea7f024b2f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 15:03:08 -0800 Subject: KVM: x86: Add EMULTYPE_PF when emulation is triggered by a page fault Add a new emulation type flag to explicitly mark emulation related to a page fault. Move the propagation of the GPA into the emulator from the page fault handler into x86_emulate_instruction, using EMULTYPE_PF as an indicator that cr2 is valid. Similarly, don't propagate cr2 into the exception.address when it's *not* valid. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index de7f3e3f277c..1ce7bbc075e4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6492,10 +6492,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, gpa_t gpa = cr2_or_gpa; kvm_pfn_t pfn; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu))) + if (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) return false; if (!vcpu->arch.mmu->direct_map) { @@ -6583,10 +6584,11 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, */ vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu))) + if (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) return false; if (x86_page_table_writing_insn(ctxt)) @@ -6839,8 +6841,19 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } restart: - /* Save the faulting GPA (cr2) in the address field */ - ctxt->exception.address = cr2_or_gpa; + if (emulation_type & EMULTYPE_PF) { + /* Save the faulting GPA (cr2) in the address field */ + ctxt->exception.address = cr2_or_gpa; + + /* With shadow page tables, cr2 contains a GVA or nGPA. 
*/ + if (vcpu->arch.mmu->direct_map) { + vcpu->arch.gpa_available = true; + vcpu->arch.gpa_val = cr2_or_gpa; + } + } else { + /* Sanitize the address out of an abundance of paranoia. */ + ctxt->exception.address = 0; + } r = x86_emulate_insn(ctxt); -- cgit From 744e699c7e9913a9b1f825f189ab547c4da0d182 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 15:03:09 -0800 Subject: KVM: x86: Move gpa_val and gpa_available into the emulator context Move the GPA tracking into the emulator context now that the context is guaranteed to be initialized via __init_emulate_ctxt() prior to dereferencing gpa_{available,val}, i.e. now that seeing a stale gpa_available will also trigger a WARN due to an invalid context. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1ce7bbc075e4..4f026f877fc1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5745,10 +5745,9 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, * operation using rep will only have the initial GPA from the NPF * occurred. */ - if (vcpu->arch.gpa_available && - emulator_can_use_gpa(ctxt) && - (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) { - gpa = vcpu->arch.gpa_val; + if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) && + (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) { + gpa = ctxt->gpa_val; ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write); } else { ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -6417,6 +6416,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + ctxt->gpa_available = false; ctxt->eflags = kvm_get_rflags(vcpu); ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; @@ -6847,8 +6847,8 @@ restart: /* With shadow page tables, cr2 contains a GVA or nGPA. */ if (vcpu->arch.mmu->direct_map) { - vcpu->arch.gpa_available = true; - vcpu->arch.gpa_val = cr2_or_gpa; + ctxt->gpa_available = true; + ctxt->gpa_val = cr2_or_gpa; } } else { /* Sanitize the address out of an abundance of paranoia. */ @@ -8454,7 +8454,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); - vcpu->arch.gpa_available = false; r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath); return r; -- cgit From edd4fa37baa6ee8e44dc65523b27bd6fe44c94de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 13:07:15 -0800 Subject: KVM: x86: Allocate new rmap and large page tracking when moving memslot Reallocate an rmap array and recalculate large page compatibility when moving an existing memslot to correctly handle the alignment properties of the new memslot. The number of rmap entries required at each level is dependent on the alignment of the memslot's base gfn with respect to that level, e.g. moving a large-page aligned memslot so that it becomes unaligned will increase the number of rmap entries needed at the now unaligned level. Not updating the rmap array is the most obvious bug, as KVM accesses garbage data beyond the end of the rmap. KVM interprets the bad data as pointers, leading to non-canonical #GPs, unexpected #PFs, etc... 
general protection fault: 0000 [#1] SMP CPU: 0 PID: 1909 Comm: move_memory_reg Not tainted 5.4.0-rc7+ #139 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:rmap_get_first+0x37/0x50 [kvm] Code: <48> 8b 3b 48 85 ff 74 ec e8 6c f4 ff ff 85 c0 74 e3 48 89 d8 5b c3 RSP: 0018:ffffc9000021bbc8 EFLAGS: 00010246 RAX: ffff00617461642e RBX: ffff00617461642e RCX: 0000000000000012 RDX: ffff88827400f568 RSI: ffffc9000021bbe0 RDI: ffff88827400f570 RBP: 0010000000000000 R08: ffffc9000021bd00 R09: ffffc9000021bda8 R10: ffffc9000021bc48 R11: 0000000000000000 R12: 0030000000000000 R13: 0000000000000000 R14: ffff88827427d700 R15: ffffc9000021bce8 FS: 00007f7eda014700(0000) GS:ffff888277a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7ed9216ff8 CR3: 0000000274391003 CR4: 0000000000162eb0 Call Trace: kvm_mmu_slot_set_dirty+0xa1/0x150 [kvm] __kvm_set_memory_region.part.64+0x559/0x960 [kvm] kvm_set_memory_region+0x45/0x60 [kvm] kvm_vm_ioctl+0x30f/0x920 [kvm] do_vfs_ioctl+0xa1/0x620 ksys_ioctl+0x66/0x70 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x4c/0x170 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f7ed9911f47 Code: <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 21 6f 2c 00 f7 d8 64 89 01 48 RSP: 002b:00007ffc00937498 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000001ab0010 RCX: 00007f7ed9911f47 RDX: 0000000001ab1350 RSI: 000000004020ae46 RDI: 0000000000000004 RBP: 000000000000000a R08: 0000000000000000 R09: 00007f7ed9214700 R10: 00007f7ed92149d0 R11: 0000000000000246 R12: 00000000bffff000 R13: 0000000000000003 R14: 00007f7ed9215000 R15: 0000000000000000 Modules linked in: kvm_intel kvm irqbypass ---[ end trace 0c5f570b3358ca89 ]--- The disallow_lpage tracking is more subtle. Failure to update results in KVM creating large pages when it shouldn't, either due to stale data or again due to indexing beyond the end of the metadata arrays, which can lead to memory corruption and/or leaking data to guest/userspace. Note, the arrays for the old memslot are freed by the unconditional call to kvm_free_memslot() in __kvm_set_memory_region(). Fixes: 05da45583de9b ("KVM: MMU: large page support") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Reviewed-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4f026f877fc1..6387917d08ec 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9878,6 +9878,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, { int i; + /* + * Clear out the previous array pointers for the KVM_MR_MOVE case. The + * old arrays will be freed by __kvm_set_memory_region() if installing + * the new memslot is successful. 
+ */ + memset(&slot->arch, 0, sizeof(slot->arch)); + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { struct kvm_lpage_info *linfo; unsigned long ugfn; @@ -9959,6 +9966,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { + if (change == KVM_MR_MOVE) + return kvm_arch_create_memslot(kvm, memslot, + mem->memory_size >> PAGE_SHIFT); + return 0; } -- cgit From 0dab98b7ade66598cab3b59931995ce91bd61258 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 13:07:19 -0800 Subject: KVM: x86: Allocate memslot resources during prepare_memory_region() Allocate the various metadata structures associated with a new memslot during kvm_arch_prepare_memory_region(), which paves the way for removing kvm_arch_create_memslot() altogether. Moving x86's memory allocation only changes the order of kernel memory allocations between x86 and common KVM code. Reviewed-by: Peter Xu Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6387917d08ec..94ea7b914fc7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9875,6 +9875,12 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + +static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, + unsigned long npages) { int i; @@ -9966,10 +9972,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { - if (change == KVM_MR_MOVE) - return kvm_arch_create_memslot(kvm, memslot, - mem->memory_size >> PAGE_SHIFT); - + if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) + return kvm_alloc_memslot_metadata(memslot, + mem->memory_size >> PAGE_SHIFT); return 0; } -- cgit From 414de7abbf809f046511269797d9f2310b88e036 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 13:07:20 -0800 Subject: KVM: Drop kvm_arch_create_memslot() Remove kvm_arch_create_memslot() now that all arch implementations are effectively nops. Removing kvm_arch_create_memslot() eliminates the possibility for arch specific code to allocate memory prior to setting a memslot, which sets the stage for simplifying kvm_free_memslot(). 
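For context, the per-level sizing that kvm_alloc_memslot_metadata() performs, the reason a KVM_MR_MOVE needs a fresh allocation at all, is driven by how the slot's base gfn aligns to each large-page level. A minimal sketch, assuming the gfn_to_index() helper from this era of asm/kvm_host.h (not quoted verbatim from these patches):

	/* gfns spanned by one entry at @level: 1 << KVM_HPAGE_GFN_SHIFT(level) */
	static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
	{
		return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
		       (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
	}

	/* entries needed at @level for a slot of @npages pages at @base_gfn */
	lpages = gfn_to_index(base_gfn + npages - 1, base_gfn, level) + 1;

Because lpages depends on the alignment of base_gfn, moving a slot can change the required array sizes, which is exactly what the earlier KVM_MR_MOVE fix accounts for.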
Cc: Janosch Frank Acked-by: Christian Borntraeger Reviewed-by: Peter Xu Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 94ea7b914fc7..c23b08f48079 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9873,12 +9873,6 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, kvm_page_track_free_memslot(free, dont); } -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) -{ - return 0; -} - static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, unsigned long npages) { -- cgit From 9d4c197c0e94c372ceffd2ffc53a23518f301ed9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 13:07:24 -0800 Subject: KVM: Drop "const" attribute from old memslot in commit_memory_region() Drop the "const" attribute from @old in kvm_arch_commit_memory_region() to allow arch specific code to free arch specific resources in the old memslot without having to cast away the attribute. Freeing resources in kvm_arch_commit_memory_region() paves the way for simplifying kvm_free_memslot() by eliminating the last usage of its @dont param. Reviewed-by: Peter Xu Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c23b08f48079..a3c92c5077d0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10024,7 +10024,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, + struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { -- cgit From 21198846de1c348304280436caf3a5dc936d5c65 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 13:07:25 -0800 Subject: KVM: x86: Free arrays for old memslot when moving memslot's base gfn Explicitly free the metadata arrays (stored in slot->arch) in the old memslot structure when moving the memslot's base gfn is committed. This eliminates x86's dependency on kvm_free_memslot() being called when a memslot move is committed, and paves the way for removing the funky code in kvm_free_memslot() that conditionally frees structures based on its @dont param. Reviewed-by: Peter Xu Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a3c92c5077d0..6068208917dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10066,6 +10066,10 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, */ if (change != KVM_MR_DELETE) kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new); + + /* Free the arrays associated with the old memslot. 
*/ + if (change == KVM_MR_MOVE) + kvm_arch_free_memslot(kvm, old, NULL); } void kvm_arch_flush_shadow_all(struct kvm *kvm) -- cgit From e96c81ee89d80e1a0fe50a0e9be40c1b77e14aaa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 13:07:27 -0800 Subject: KVM: Simplify kvm_free_memslot() and all its descendents Now that all callers of kvm_free_memslot() pass NULL for @dont, remove the param from the top-level routine and all arch implementations. No functional change intended. Tested-by: Christoffer Dall Reviewed-by: Peter Xu Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6068208917dd..88885e3147c4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9850,27 +9850,22 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_hv_destroy_vm(kvm); } -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { - kvfree(free->arch.rmap[i]); - free->arch.rmap[i] = NULL; - } + kvfree(slot->arch.rmap[i]); + slot->arch.rmap[i] = NULL; + if (i == 0) continue; - if (!dont || free->arch.lpage_info[i - 1] != - dont->arch.lpage_info[i - 1]) { - kvfree(free->arch.lpage_info[i - 1]); - free->arch.lpage_info[i - 1] = NULL; - } + kvfree(slot->arch.lpage_info[i - 1]); + slot->arch.lpage_info[i - 1] = NULL; } - kvm_page_track_free_memslot(free, dont); + kvm_page_track_free_memslot(slot); } static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, @@ -10069,7 +10064,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, /* Free the arrays associated with the old memslot. */ if (change == KVM_MR_MOVE) - kvm_arch_free_memslot(kvm, old, NULL); + kvm_arch_free_memslot(kvm, old); } void kvm_arch_flush_shadow_all(struct kvm *kvm) -- cgit From 0dff084607bd555d6f74db2af8406a9da9f0fc3a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 13:07:29 -0800 Subject: KVM: Provide common implementation for generic dirty log functions Move the implementations of KVM_GET_DIRTY_LOG and KVM_CLEAR_DIRTY_LOG for CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT into common KVM code. The arch-specific implementations are extremely similar, differing only in whether the dirty log needs to be sync'd from hardware (x86) and how the TLBs are flushed. Add new arch hooks to handle sync and TLB flush; the sync will also be used for non-generic dirty log support in a future patch (s390). The ulterior motive for providing a common implementation is to eliminate the dependency between arch and common code with respect to the memslot referenced by the dirty log, i.e. to make it obvious in the code that the validity of the memslot is guaranteed, as a future patch will rework memslot handling such that id_to_memslot() can return NULL. 
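As a rough sketch (assumptions: the memslot has already been resolved from log->slot, and error handling is elided; this is not the verbatim common code), the generic helper that results from this change is shaped like:

	mutex_lock(&kvm->slots_lock);

	kvm_arch_sync_dirty_log(kvm, memslot);

	r = kvm_get_dirty_log_protect(kvm, log, &flush);
	if (flush)
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);

	mutex_unlock(&kvm->slots_lock);

i.e. the only arch-specific pieces left are the two hooks shown in the x86 diff below.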
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 61 ++++-------------------------------------------------- 1 file changed, 4 insertions(+), 57 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 88885e3147c4..27b97e546980 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4759,77 +4759,24 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, return 0; } -/** - * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot - * @kvm: kvm instance - * @log: slot id and address to which we copy the log - * - * Steps 1-4 below provide general overview of dirty page logging. See - * kvm_get_dirty_log_protect() function description for additional details. - * - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we - * always flush the TLB (step 4) even if previous step failed and the dirty - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API - * does not preclude user space subsequent dirty log read. Flushing TLB ensures - * writes will be marked dirty for next log read. - * - * 1. Take a snapshot of the bit and clear it if needed. - * 2. Write protect the corresponding page. - * 3. Copy the snapshot to the userspace. - * 4. Flush TLB's if needed. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - /* * Flush potentially hardware-cached dirty pages to dirty_bitmap. */ if (kvm_x86_ops->flush_log_dirty) kvm_x86_ops->flush_log_dirty(kvm); - - r = kvm_get_dirty_log_protect(kvm, log, &flush); - - /* - * All the TLBs can be flushed out of mmu lock, see the comments in - * kvm_mmu_slot_remove_write_access(). - */ - lockdep_assert_held(&kvm->slots_lock); - if (flush) - kvm_flush_remote_tlbs(kvm); - - mutex_unlock(&kvm->slots_lock); - return r; } -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot) { - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - - /* - * Flush potentially hardware-cached dirty pages to dirty_bitmap. - */ - if (kvm_x86_ops->flush_log_dirty) - kvm_x86_ops->flush_log_dirty(kvm); - - r = kvm_clear_dirty_log_protect(kvm, log, &flush); - /* * All the TLBs can be flushed out of mmu lock, see the comments in * kvm_mmu_slot_remove_write_access(). */ lockdep_assert_held(&kvm->slots_lock); - if (flush) - kvm_flush_remote_tlbs(kvm); - - mutex_unlock(&kvm->slots_lock); - return r; + kvm_flush_remote_tlbs(kvm); } int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, -- cgit From 0577d1abe704c315bb5cdfc71f4ca7b9b5358f59 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 13:07:31 -0800 Subject: KVM: Terminate memslot walks via used_slots Refactor memslot handling to treat the number of used slots as the de facto size of the memslot array, e.g. return NULL from id_to_memslot() when an invalid index is provided instead of relying on npages==0 to detect an invalid memslot. Rework the sorting and walking of memslots in advance of dynamically sizing memslots to aid bisection and debug, e.g. with luck, a bug in the refactoring will bisect here and/or hit a WARN instead of randomly corrupting memory. Alternatively, a global null/invalid memslot could be returned, i.e. 
so callers of id_to_memslot() don't have to explicitly check for a NULL memslot, but that approach runs the risk of introducing difficult-to-debug issues, e.g. if the global null slot is modified. Constifying the return from id_to_memslot() to combat such issues is possible, but would require a massive refactoring of arch-specific code and would still be susceptible to casting shenanigans. Add function comments to update_memslots() and search_memslots() to explicitly (and loudly) state how memslots are sorted. Opportunistically stuff @hva with a non-canonical value when deleting a private memslot on x86 to detect bogus usage of the freed slot. No functional change intended. Tested-by: Christoffer Dall Tested-by: Marc Zyngier Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 27b97e546980..13ac4d0f9794 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9715,9 +9715,9 @@ void kvm_arch_sync_events(struct kvm *kvm) int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; - unsigned long hva; + unsigned long hva, uninitialized_var(old_npages); struct kvm_memslots *slots = kvm_memslots(kvm); - struct kvm_memory_slot *slot, old; + struct kvm_memory_slot *slot; /* Called with kvm->slots_lock held. */ if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) @@ -9725,7 +9725,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) slot = id_to_memslot(slots, id); if (size) { - if (slot->npages) + if (slot && slot->npages) return -EEXIST; /* @@ -9737,13 +9737,14 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) if (IS_ERR((void *)hva)) return PTR_ERR((void *)hva); } else { - if (!slot->npages) + if (!slot || !slot->npages) return 0; - hva = 0; + /* Stuff a non-canonical value to catch use-after-delete. */ + hva = 0xdeadull << 48; + old_npages = slot->npages; } - old = *slot; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { struct kvm_userspace_memory_region m; @@ -9758,7 +9759,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) } if (!size) - vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE); + vm_munmap(hva, old_npages * PAGE_SIZE); return 0; } -- cgit From b3594ffbf932c8e8b23201cdc2c173708a4472dc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 13:07:34 -0800 Subject: KVM: x86/mmu: Move kvm_arch_flush_remote_tlbs_memslot() to mmu.c Move kvm_arch_flush_remote_tlbs_memslot() from x86.c to mmu.c in preparation for calling kvm_flush_remote_tlbs_with_address() instead of kvm_flush_remote_tlbs(). The with_address() variant is statically defined in mmu.c, arguably kvm_arch_flush_remote_tlbs_memslot() belongs in mmu.c anyway, and defining kvm_arch_flush_remote_tlbs_memslot() in mmu.c will allow the compiler to inline said function when a future patch consolidates open coded variants of the function. No functional change intended. 
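To illustrate where this is headed, a sketch of the mmu.c variant once it is converted to the ranged flush (illustrative only; the conversion itself happens in a later patch):

	void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
						struct kvm_memory_slot *memslot)
	{
		/* Flushing outside of mmu_lock is still fine, per the
		 * comments in kvm_mmu_slot_remove_write_access(). */
		lockdep_assert_held(&kvm->slots_lock);
		kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
						   memslot->npages);
	}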
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 13ac4d0f9794..4b4749768f3d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4768,17 +4768,6 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) kvm_x86_ops->flush_log_dirty(kvm); } -void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, - struct kvm_memory_slot *memslot) -{ - /* - * All the TLBs can be flushed out of mmu lock, see the comments in - * kvm_mmu_slot_remove_write_access(). - */ - lockdep_assert_held(&kvm->slots_lock); - kvm_flush_remote_tlbs(kvm); -} - int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, bool line_status) { -- cgit From 562b6b089d64724278de61114da658fb0a516250 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sun, 26 Jan 2020 16:41:13 -0800 Subject: KVM: x86: Consolidate VM allocation and free for VMX and SVM Move the VM allocation and free code to common x86 as the logic is more or less identical across SVM and VMX. Note, although hyperv.hv_pa_pg is part of the common kvm->arch, it's (currently) only allocated by VMX VMs. But, since kfree() plays nice when passed a NULL pointer, the superfluous call for SVM is harmless and avoids future churn if SVM gains support for HyperV's direct TLB flush. Signed-off-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov [Make vm_size a field instead of a function. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4b4749768f3d..ddd1d296bd20 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9622,6 +9622,13 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->sched_in(vcpu, cpu); } +void kvm_arch_free_vm(struct kvm *kvm) +{ + kfree(kvm->arch.hyperv.hv_pa_pg); + vfree(kvm); +} + + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { if (type) -- cgit From 3c9bd4006bfc2dccda1823db61b3f470ef91cfaa Mon Sep 17 00:00:00 2001 From: Jay Zhou Date: Thu, 27 Feb 2020 09:32:27 +0800 Subject: KVM: x86: enable dirty log gradually in small chunks It could take kvm->mmu_lock for an extended period of time when enabling dirty log for the first time. The main cost is to clear all the D-bits of last level SPTEs. This situation can benefit from manual dirty log protect as well, which can reduce the mmu_lock time taken. The sequence is like this: 1. Initialize all the bits of the dirty bitmap to 1 when enabling dirty log for the first time 2. Only write protect the huge pages 3. KVM_GET_DIRTY_LOG returns the dirty bitmap info 4. 
KVM_CLEAR_DIRTY_LOG will clear the D-bit of each of the leaf-level SPTEs gradually, in small chunks Under the Intel(R) Xeon(R) Gold 6152 CPU @ 2.10GHz environment, I did some tests with a 128G windows VM and counted the time taken by memory_global_dirty_log_start; here are the numbers:

	VM Size    Before    After optimization
	128G       460ms     10ms

Signed-off-by: Jay Zhou Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ddd1d296bd20..864d0aded0b8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9916,7 +9916,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, { /* Still write protect RO slot */ if (new->flags & KVM_MEM_READONLY) { - kvm_mmu_slot_remove_write_access(kvm, new); + kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL); return; } @@ -9951,10 +9951,23 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * See the comments in fast_page_fault(). */ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { - if (kvm_x86_ops->slot_enable_log_dirty) + if (kvm_x86_ops->slot_enable_log_dirty) { kvm_x86_ops->slot_enable_log_dirty(kvm, new); - else - kvm_mmu_slot_remove_write_access(kvm, new); + } else { + int level = + kvm_dirty_log_manual_protect_and_init_set(kvm) ? + PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL; + + /* + * If we're with initial-all-set, we don't need + * to write protect any small page because + * they're reported as dirty already. However + * we still need to write-protect huge pages + * so that the page split can happen lazily on + * the first write to the huge page. + */ + kvm_mmu_slot_remove_write_access(kvm, new, level); + } } else { if (kvm_x86_ops->slot_disable_log_dirty) kvm_x86_ops->slot_disable_log_dirty(kvm, new); -- cgit From 4abaffce4d25aa41392d2e81835592726d757857 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 26 Feb 2020 10:41:02 +0800 Subject: KVM: LAPIC: Recalculate apic map in batch In the vCPU reset and set APIC_BASE MSR path, the apic map will be recalculated several times, each recalculation consuming 10+ us (observed by ftrace in my non-overcommit environment) due to the expensive memory allocation, mutex, RCU, etc. operations. This patch optimizes it by recalculating the apic map in batch; I hope this can benefit serverless scenarios that frequently create/destroy VMs. Before patch: kvm_lapic_reset ~27us After patch: kvm_lapic_reset ~14us Observed by ftrace, an improvement of ~48%. Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 864d0aded0b8..c5762c031b0c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -350,6 +350,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } kvm_lapic_set_base(vcpu, msr_info->data); + kvm_recalculate_apic_map(vcpu->kvm); return 0; } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -- cgit From b34de572a863b5a453dece431eac0da59b5aec0a Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 28 Feb 2020 11:18:41 +0800 Subject: KVM: X86: trigger kvmclock sync request just once on VM creation During vCPU creation, KVM queues a kvmclock sync worker to the global workqueue before each vCPU's creation completes. 
The workqueue subsystem guarantees that already-queued work is not queued again; however, we can make the logic clearer by having just one leader trigger the kvmclock sync request, which also saves the cacheline bouncing caused by test_and_set_bit. Signed-off-by: Wanpeng Li Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c5762c031b0c..78e34c968ab5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9335,11 +9335,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) mutex_unlock(&vcpu->mutex); - if (!kvmclock_periodic_sync) - return; - - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); + if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0) + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) -- cgit From a1c77abb8d93381e25a8d2df3a917388244ba776 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 22:27:35 -0800 Subject: KVM: nVMX: Properly handle userspace interrupt window request Return true for vmx_interrupt_allowed() if the vCPU is in L2 and L1 has external interrupt exiting enabled. IRQs are never blocked in hardware if the CPU is in the guest (L2 from L1's perspective) when IRQs trigger VM-Exit. The new check percolates up to kvm_vcpu_ready_for_interrupt_injection() and thus vcpu_run(), and so KVM will exit to userspace if userspace has requested an interrupt window (to inject an IRQ into L1). Remove the @external_intr param from vmx_check_nested_events(), which is actually an indicator that userspace wants an interrupt window, e.g. it's named @req_int_win further up the stack. Injecting a VM-Exit into L1 to try and bounce out to L0 userspace is all kinds of broken and is no longer necessary. Remove the hack in nested_vmx_vmexit() that attempted to work around the breakage in vmx_check_nested_events() by only filling interrupt info if there's an actual interrupt pending. The hack actually made things worse because it caused KVM to _never_ fill interrupt info when the LAPIC resides in userspace (kvm_cpu_has_interrupt() queries interrupt.injected, which is always cleared by prepare_vmcs12() before reaching the hack in nested_vmx_vmexit()). Fixes: 6550c4df7e50 ("KVM: nVMX: Fix interrupt window request with "Acknowledge interrupt on exit"") Cc: stable@vger.kernel.org Cc: Liran Alon Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 78e34c968ab5..b15092c9593d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7579,7 +7579,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } -static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) +static int inject_pending_event(struct kvm_vcpu *vcpu) { int r; @@ -7615,7 +7615,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) * from L2 to L1. 
*/ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { - r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + r = kvm_x86_ops->check_nested_events(vcpu); if (r != 0) return r; } @@ -7677,7 +7677,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) * KVM_REQ_EVENT only on certain events and not unconditionally? */ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { - r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + r = kvm_x86_ops->check_nested_events(vcpu); if (r != 0) return r; } @@ -8210,7 +8210,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - if (inject_pending_event(vcpu, req_int_win) != 0) + if (inject_pending_event(vcpu) != 0) req_immediate_exit = true; else { /* Enable SMI/NMI/IRQ window open exits if needed. @@ -8438,7 +8438,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) - kvm_x86_ops->check_nested_events(vcpu, false); + kvm_x86_ops->check_nested_events(vcpu); return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); -- cgit From d8dd54e06348c43b97e5c0d488e5ee4e004bfb6f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 18:02:39 -0800 Subject: KVM: x86/mmu: Rename kvm_mmu->get_cr3() to ->get_guest_pgd() Rename kvm_mmu->get_cr3() to call out that it is retrieving a guest value, as opposed to kvm_mmu->set_cr3(), which sets a host value, and to note that it will return something other than CR3 when nested EPT is in use. Hopefully the new name will also make it more obvious that L1's nested_cr3 is returned in SVM's nested NPT case. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b15092c9593d..ba4d476b79ad 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10165,7 +10165,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) return; if (!vcpu->arch.mmu->direct_map && - work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu)) + work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu)) return; kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); -- cgit From abbed4fa94f69d2046c6f7c12f6ecabf195c553e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 4 Mar 2020 16:24:22 -0800 Subject: KVM: x86: Fix warning due to implicit truncation on 32-bit KVM Explicitly cast the integer literal to an unsigned long when stuffing a non-canonical value into the host virtual address during private memslot deletion. The explicit cast fixes a warning that gets promoted to an error when running with KVM's newfangled -Werror setting. 
arch/x86/kvm/x86.c:9739:9: error: large integer implicitly truncated to unsigned type [-Werror=overflow] Fixes: 0577d1abe704 ("KVM: Terminate memslot walks via used_slots") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ba4d476b79ad..fa03f31ab33c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9735,8 +9735,12 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) if (!slot || !slot->npages) return 0; - /* Stuff a non-canonical value to catch use-after-delete. */ - hva = 0xdeadull << 48; + /* + * Stuff a non-canonical value to catch use-after-delete. This + * ends up being 0 on 32-bit KVM, but there's no better + * alternative. + */ + hva = (unsigned long)(0xdeadull << 48); old_npages = slot->npages; } -- cgit From 2e3bb4d886c72c0d5336a957ed66882e08f5c14b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 15:29:41 -0800 Subject: KVM: x86: Refactor I/O emulation helpers to provide vcpu-only variant Add variants of the I/O helpers that take a vCPU instead of an emulation context. This will eventually allow KVM to limit use of the emulation context to the full emulation path. Signed-off-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa03f31ab33c..fbf68c305556 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5904,11 +5904,9 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, return 0; } -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, void *val, - unsigned int count) +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int ret; if (vcpu->arch.pio.count) @@ -5928,17 +5926,30 @@ data_avail: return 0; } -static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, - const void *val, unsigned int count) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count); +} + +static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, + unsigned short port, const void *val, + unsigned int count) +{ memcpy(vcpu->arch.pio_data, val, size * count); trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } +static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, + const void *val, unsigned int count) +{ + return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count); +} + static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { return kvm_x86_ops->get_segment_base(vcpu, seg); @@ -6891,8 +6902,8 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { unsigned long val = kvm_rax_read(vcpu); - int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, - size, port, &val, 1); + int ret = emulator_pio_out(vcpu, size, port, &val, 
1); + + if (ret) return ret; @@ -6928,11 +6939,10 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0; /* - * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform + * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform * the copy and tracing */ - emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size, - vcpu->arch.pio.port, &val, 1); + emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1); kvm_rax_write(vcpu, val); return kvm_skip_emulated_instruction(vcpu); @@ -6947,8 +6957,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, /* For size less than 4 we merge, else we zero extend */ val = (size < 4) ? kvm_rax_read(vcpu) : 0; - ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port, - &val, 1); + ret = emulator_pio_in(vcpu, size, port, &val, 1); if (ret) { kvm_rax_write(vcpu, val); return ret; -- cgit From 21f1b8f29ea5b2301af7f2cc41a20b7b87a22bec Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 15:29:42 -0800 Subject: KVM: x86: Explicitly pass an exception struct to check_intercept Explicitly pass an exception struct when checking for intercept from the emulator, which eliminates the last reference to arch.emulate_ctxt in vendor-specific code. Signed-off-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fbf68c305556..762a68200b46 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6212,7 +6212,8 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); + return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage, + &ctxt->exception); } static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, -- cgit From c9b8b07cded58c55ad2bf67e68b9bfae96092293 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 15:29:48 -0800 Subject: KVM: x86: Dynamically allocate per-vCPU emulation context Allocate the emulation context instead of embedding it in struct kvm_vcpu_arch. Dynamic allocation provides several benefits: - Shrinks the size of x86 vCPUs by ~2.5k bytes, dropping them back below the PAGE_ALLOC_COSTLY_ORDER threshold. - Allows for dropping the include of kvm_emulate.h from asm/kvm_host.h and moving kvm_emulate.h into KVM's private directory. - Allows reducing KVM's attack surface by shrinking the amount of vCPU data that is exposed to usercopy. - Allows a future patch to disable the emulator entirely, which may or may not be a realistic endeavor. Mark the entire struct as valid for usercopy to maintain existing behavior with respect to hardened usercopy. Future patches can shrink the usercopy range to cover only what is necessary. 
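For context, the hardened usercopy slab API relied on here (declared in include/linux/slab.h) whitelists only the [useroffset, useroffset + usersize) window of each object for copies to/from userspace:

	struct kmem_cache *kmem_cache_create_usercopy(const char *name,
				unsigned int size, unsigned int align,
				slab_flags_t flags,
				unsigned int useroffset, unsigned int usersize,
				void (*ctor)(void *));

Whitelisting the whole struct, as done below, is the conservative starting point; shrinking usersize later tightens the whitelist.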
Signed-off-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 65 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 11 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 762a68200b46..4465975f034b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -81,7 +81,7 @@ u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); #define emul_to_vcpu(ctxt) \ - container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) + ((struct kvm_vcpu *)(ctxt)->vcpu) /* EFER defaults: * - enable syscall per default because its emulated by KVM @@ -230,6 +230,19 @@ u64 __read_mostly host_xcr0; struct kmem_cache *x86_fpu_cache; EXPORT_SYMBOL_GPL(x86_fpu_cache); +static struct kmem_cache *x86_emulator_cache; + +static struct kmem_cache *kvm_alloc_emulator_cache(void) +{ + return kmem_cache_create_usercopy("x86_emulator", + sizeof(struct x86_emulate_ctxt), + __alignof__(struct x86_emulate_ctxt), + SLAB_ACCOUNT, + 0, + sizeof(struct x86_emulate_ctxt), + NULL); +} + static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) @@ -5673,7 +5686,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; /* * If the exit was due to a NPF we may already have a GPA. @@ -6346,7 +6359,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) static bool inject_emulated_exception(struct kvm_vcpu *vcpu) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; if (ctxt->exception.vector == PF_VECTOR) return kvm_propagate_fault(vcpu, &ctxt->exception); @@ -6358,9 +6371,26 @@ static bool inject_emulated_exception(struct kvm_vcpu *vcpu) return false; } +static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt; + + ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT); + if (!ctxt) { + pr_err("kvm: failed to allocate vcpu's emulator\n"); + return NULL; + } + + ctxt->vcpu = vcpu; + ctxt->ops = &emulate_ops; + vcpu->arch.emulate_ctxt = ctxt; + + return ctxt; +} + static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int cs_db, cs_l; kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); @@ -6385,7 +6415,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); @@ -6700,7 +6730,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len) { int r; - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; @@ -7296,10 +7326,16 @@ int kvm_arch_init(void *opaque) goto out; } + x86_emulator_cache = kvm_alloc_emulator_cache(); + if (!x86_emulator_cache) { + pr_err("kvm: failed 
to allocate cache for x86 emulator\n"); + goto out_free_x86_fpu_cache; + } + shared_msrs = alloc_percpu(struct kvm_shared_msrs); if (!shared_msrs) { printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); - goto out_free_x86_fpu_cache; + goto out_free_x86_emulator_cache; } r = kvm_mmu_module_init(); @@ -7332,6 +7368,8 @@ int kvm_arch_init(void *opaque) out_free_percpu: free_percpu(shared_msrs); +out_free_x86_emulator_cache: + kmem_cache_destroy(x86_emulator_cache); out_free_x86_fpu_cache: kmem_cache_destroy(x86_fpu_cache); out: @@ -8709,7 +8747,7 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) * that usually, but some bad designed PV devices (vmware * backdoor interface) need this to work */ - emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt); + emulator_writeback_register_cache(vcpu->arch.emulate_ctxt); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; } regs->rax = kvm_rax_read(vcpu); @@ -8895,7 +8933,7 @@ out: int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); @@ -9227,7 +9265,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) struct page *page; int r; - vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else @@ -9265,11 +9302,14 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) GFP_KERNEL_ACCOUNT)) goto fail_free_mce_banks; + if (!alloc_emulate_ctxt(vcpu)) + goto free_wbinvd_dirty_mask; + vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu->arch.user_fpu) { pr_err("kvm: failed to allocate userspace's fpu\n"); - goto free_wbinvd_dirty_mask; + goto free_emulate_ctxt; } vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, @@ -9311,6 +9351,8 @@ free_guest_fpu: kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); free_user_fpu: kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); +free_emulate_ctxt: + kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_wbinvd_dirty_mask: free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: @@ -9361,6 +9403,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_free(vcpu); + kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); -- cgit From 2f728d66e8a7d89d7cb141bf0acb30c61ae7ded5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 15:29:49 -0800 Subject: KVM: x86: Move kvm_emulate.h into KVM's private directory Now that the emulation context is dynamically allocated and not embedded in struct kvm_vcpu, move its header, kvm_emulate.h, out of the public asm directory and into KVM's private x86 directory. 
Signed-off-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4465975f034b..f55899055467 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -22,6 +22,7 @@ #include "i8254.h" #include "tss.h" #include "kvm_cache_regs.h" +#include "kvm_emulate.h" #include "x86.h" #include "cpuid.h" #include "pmu.h" -- cgit From 06add254c7f3b7f6fdfe04eb028aaabe5b27a734 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 15:29:50 -0800 Subject: KVM: x86: Shrink the usercopy region of the emulation context Shuffle a few operand structs to the end of struct x86_emulate_ctxt and update the cache creation to whitelist only the region of the emulation context that is expected to be copied to/from user memory, e.g. the instruction operands, registers, and fetch/io/mem caches. Signed-off-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f55899055467..a69f7bf020d9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -235,13 +235,13 @@ static struct kmem_cache *x86_emulator_cache; static struct kmem_cache *kvm_alloc_emulator_cache(void) { - return kmem_cache_create_usercopy("x86_emulator", - sizeof(struct x86_emulate_ctxt), + unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src); + unsigned int size = sizeof(struct x86_emulate_ctxt); + + return kmem_cache_create_usercopy("x86_emulator", size, __alignof__(struct x86_emulate_ctxt), - SLAB_ACCOUNT, - 0, - sizeof(struct x86_emulate_ctxt), - NULL); + SLAB_ACCOUNT, useroffset, + size - useroffset, NULL); } static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); -- cgit From cfc481810c903a5f74e5c7bf50ca8e28318dbc44 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:23 -0800 Subject: KVM: x86: Calculate the supported xcr0 mask at load time Add a new global variable, supported_xcr0, to track which xcr0 bits can be exposed to the guest instead of calculating the mask on every call. The supported bits are constant for a given instance of KVM. This paves the way toward eliminating the ->mpx_supported() call in kvm_mpx_supported(), e.g. eliminates multiple retpolines in VMX's nested VM-Enter path, and eventually toward eliminating ->mpx_supported() altogether. No functional change intended. 
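To illustrate the simplification this enables, a plausible sketch of the eventual helper (hedged; not necessarily its exact final upstream form):

	static inline bool kvm_mpx_supported(void)
	{
		/* MPX requires both BNDREGS and BNDCSR xfeatures. */
		return (supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
			== (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
	}

i.e. a single read of a __read_mostly global instead of a retpolined kvm_x86_ops->mpx_supported() call.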
Reviewed-by: Xiaoyao Li Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a69f7bf020d9..849957f3afb2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -181,6 +181,11 @@ struct kvm_shared_msrs { static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; static struct kvm_shared_msrs __percpu *shared_msrs; +#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ + | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ + | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ + | XFEATURE_MASK_PKRU) + static u64 __read_mostly host_xss; struct kvm_stats_debugfs_item debugfs_entries[] = { @@ -227,6 +232,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { }; u64 __read_mostly host_xcr0; +u64 __read_mostly supported_xcr0; +EXPORT_SYMBOL_GPL(supported_xcr0); struct kmem_cache *x86_fpu_cache; EXPORT_SYMBOL_GPL(x86_fpu_cache); @@ -4114,8 +4121,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility * with old userspace. */ - if (xstate_bv & ~kvm_supported_xcr0() || - mxcsr & ~mxcsr_feature_mask) + if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask) return -EINVAL; load_xsave(vcpu, (u8 *)guest_xsave->region); } else { @@ -7352,8 +7358,10 @@ int kvm_arch_init(void *opaque) perf_register_guest_info_callbacks(&kvm_guest_cbs); - if (boot_cpu_has(X86_FEATURE_XSAVE)) + if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; + } kvm_lapic_init(); if (pi_inject_timer == -1) -- cgit From c10398b6d0ddb9c8234890828ab83341e11f9840 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:46 -0800 Subject: KVM: x86: Use KVM cpu caps to mark CR4.LA57 as not-reserved Add accessor(s) for KVM cpu caps and use said accessor to detect hardware support for LA57 instead of manually querying CPUID. Note, the explicit conversion to bool via '!!' in kvm_cpu_cap_has() is technically unnecessary, but it gives people a warm fuzzy feeling. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 849957f3afb2..389bc80f684c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -925,7 +925,7 @@ static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) { u64 reserved_bits = __cr4_reserved_bits(cpu_has, c); - if (cpuid_ecx(0x7) & feature_bit(LA57)) + if (kvm_cpu_cap_has(X86_FEATURE_LA57)) reserved_bits &= ~X86_CR4_LA57; if (kvm_x86_ops->umip_emulated()) -- cgit From 90d2f60f41f73b90768554e5a30b1cfedd167731 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:47 -0800 Subject: KVM: x86: Use KVM cpu caps to track UMIP emulation Set UMIP in kvm_cpu_caps when it is emulated by VMX, even though the bit will effectively be dropped by do_host_cpuid(). This allows checking for UMIP emulation via kvm_cpu_caps instead of a dedicated kvm_x86_ops callback. No functional change intended. 
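For reference, the accessor shape being relied on here, sketched with a simplified word/bit lookup (the real kvm_cpu_caps indexing in cpuid.h is slightly more involved):

	extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;

	static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature)
	{
		return kvm_cpu_caps[x86_feature / 32] & BIT(x86_feature % 32);
	}

	static __always_inline bool kvm_cpu_cap_has(unsigned int x86_feature)
	{
		/* the explicit '!!' bool conversion mentioned above */
		return !!kvm_cpu_cap_get(x86_feature);
	}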
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 389bc80f684c..51a49a6ed070 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -928,7 +928,7 @@ static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) if (kvm_cpu_cap_has(X86_FEATURE_LA57)) reserved_bits &= ~X86_CR4_LA57; - if (kvm_x86_ops->umip_emulated()) + if (kvm_cpu_cap_has(X86_FEATURE_UMIP)) reserved_bits &= ~X86_CR4_UMIP; return reserved_bits; -- cgit From 139085101f8500b09c681b1e52c3839df681a0d2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:57 -0800 Subject: KVM: x86: Use KVM cpu caps to detect MSR_TSC_AUX virt support Check for MSR_TSC_AUX virtualization via kvm_cpu_cap_has() and drop ->rdtscp_supported(). Note, vmx_rdtscp_supported() needs to hang around a tiny bit longer due to other usage in VMX code. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 51a49a6ed070..fd0889f2f37f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5215,7 +5215,7 @@ static void kvm_init_msr_list(void) continue; break; case MSR_TSC_AUX: - if (!kvm_x86_ops->rdtscp_supported()) + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) continue; break; case MSR_IA32_RTIT_CTL: -- cgit From 7b874c26a62487acaf2e7e179715991c70db25db Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:56:59 -0800 Subject: KVM: x86: Check for Intel PT MSR virtualization using KVM cpu caps Use kvm_cpu_cap_has() to check for Intel PT when processing the list of virtualized MSRs to pave the way toward removing ->pt_supported(). No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fd0889f2f37f..f3fac68f0612 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5220,23 +5220,23 @@ static void kvm_init_msr_list(void) break; case MSR_IA32_RTIT_CTL: case MSR_IA32_RTIT_STATUS: - if (!kvm_x86_ops->pt_supported()) + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) continue; break; case MSR_IA32_RTIT_CR3_MATCH: - if (!kvm_x86_ops->pt_supported() || + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) continue; break; case MSR_IA32_RTIT_OUTPUT_BASE: case MSR_IA32_RTIT_OUTPUT_MASK: - if (!kvm_x86_ops->pt_supported() || + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) continue; break; case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: { - if (!kvm_x86_ops->pt_supported() || + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; -- cgit From a1bead2abaa162e5e67ad258a06c9d71dddad00d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:57:00 -0800 Subject: KVM: VMX: Directly query Intel PT mode when refreshing PMUs Use vmx_pt_mode_is_host_guest() in intel_pmu_refresh() instead of bouncing through kvm_x86_ops->pt_supported, and remove ->pt_supported() as the PMU code was the last remaining user. Opportunistically clean up the wording of a comment that referenced kvm_x86_ops->pt_supported(). No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f3fac68f0612..5be4961d49dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2820,10 +2820,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) return 1; /* - * We do support PT if kvm_x86_ops->pt_supported(), but we do - * not support IA32_XSS[bit 8]. Guests will have to use - * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT - * MSRs. + * KVM supports exposing PT to the guest, but does not support + * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than + * XSAVES/XRSTORS to save/restore PT MSRs. */ if (data != 0) return 1; -- cgit From 600087b6146764999949b4a12ce5f7627602c33a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:57:05 -0800 Subject: KVM: Drop largepages_enabled and its accessor/mutator Drop largepages_enabled, kvm_largepages_enabled() and kvm_disable_largepages() now that all users are gone. Note, largepages_enabled was an x86-only flag that got left in common KVM code when KVM gained support for multiple architectures. No functional change intended. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5be4961d49dd..935cd40cbae2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9918,11 +9918,9 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, ugfn = slot->userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each - * other, or if explicitly asked to, disable large page - * support for this slot + * other, disable large page support for this slot. */ - if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || - !kvm_largepages_enabled()) { + if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) { unsigned long j; for (j = 0; j < lpages; ++j) -- cgit From 91661989d17ccec17bca199e7cb1f463ba4e5b78 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 2 Mar 2020 15:57:06 -0800 Subject: KVM: x86: Move VMX's host_efer to common x86 code Move host_efer to common x86 code and use it for CPUID's is_efer_nx() to avoid constantly re-reading the MSR. No functional change intended. 
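The caching idiom applied here, as a standalone sketch (EFER.NX is bit 11 per the SDM; the helper name is invented for illustration):

#include <stdbool.h>
#include <stdint.h>

#define EFER_NX (1ULL << 11)       /* No-Execute Enable */

static uint64_t host_efer;         /* filled from one RDMSR at hardware setup */

static bool is_efer_nx_sketch(void)
{
	/* Hot path: test the cached copy, no RDMSR per query. */
	return host_efer & EFER_NX;
}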
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 935cd40cbae2..d2f1b4746903 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -186,6 +186,9 @@ static struct kvm_shared_msrs __percpu *shared_msrs; | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ | XFEATURE_MASK_PKRU) +u64 __read_mostly host_efer; +EXPORT_SYMBOL_GPL(host_efer); + static u64 __read_mostly host_xss; struct kvm_stats_debugfs_item debugfs_entries[] = { @@ -9612,6 +9615,8 @@ int kvm_arch_hardware_setup(void) { int r; + rdmsrl_safe(MSR_EFER, &host_efer); + r = kvm_x86_ops->hardware_setup(); if (r != 0) return r; -- cgit From 408e9a318f57ba8be82ba01e98cc271b97392187 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 5 Mar 2020 16:11:56 +0100 Subject: KVM: CPUID: add support for supervisor states Current CPUID 0xd enumeration code does not support supervisor states, because KVM only supports setting IA32_XSS to zero. Change it instead to use a new variable supported_xss, to be set from the hardware_setup callback which is in charge of CPU capabilities. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d2f1b4746903..96e897d38a63 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -190,6 +190,8 @@ u64 __read_mostly host_efer; EXPORT_SYMBOL_GPL(host_efer); static u64 __read_mostly host_xss; +u64 __read_mostly supported_xss; +EXPORT_SYMBOL_GPL(supported_xss); struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, @@ -2827,7 +2829,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than * XSAVES/XRSTORS to save/restore PT MSRs. */ - if (data != 0) + if (data & ~supported_xss) return 1; vcpu->arch.ia32_xss = data; break; @@ -9617,10 +9619,16 @@ int kvm_arch_hardware_setup(void) rdmsrl_safe(MSR_EFER, &host_efer); + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + r = kvm_x86_ops->hardware_setup(); if (r != 0) return r; + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) + supported_xss = 0; + cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); if (kvm_has_tsc_control) { @@ -9637,9 +9645,6 @@ int kvm_arch_hardware_setup(void) kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; } - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - kvm_init_msr_list(); return 0; } -- cgit From 23493d0a1731205a4bccaa0e283da642ca6f6e29 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 4 Mar 2020 17:34:33 -0800 Subject: KVM x86: Extend AMD specific guest behavior to Hygon virtual CPUs Extend guest_cpuid_is_amd() to cover Hygon virtual CPUs and rename it accordingly. Hygon CPUs use an AMD-based core and so have the same basic behavior as AMD CPUs. 
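The rename works because the check is ultimately a CPUID vendor-string comparison; a sketch of the idea follows. The hex values are the real "AuthenticAMD"/"HygonGenuine" signatures spelled across EBX/EDX/ECX, but the helper names, and the assumption that raw leaf-0 register values are at hand, are illustrative.

#include <stdbool.h>
#include <stdint.h>

static bool is_vendor_amd(uint32_t ebx, uint32_t ecx, uint32_t edx)
{
	return ebx == 0x68747541 &&	/* "Auth" */
	       edx == 0x69746e65 &&	/* "enti" */
	       ecx == 0x444d4163;	/* "cAMD" */
}

static bool is_vendor_hygon(uint32_t ebx, uint32_t ecx, uint32_t edx)
{
	return ebx == 0x6f677948 &&	/* "Hygo" */
	       edx == 0x6e65476e &&	/* "nGen" */
	       ecx == 0x656e6975;	/* "uine" */
}

static bool guest_vendor_is_amd_or_hygon(uint32_t ebx, uint32_t ecx, uint32_t edx)
{
	return is_vendor_amd(ebx, ecx, edx) || is_vendor_hygon(ebx, ecx, edx);
}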
Fixes: b8f4abb652146 ("x86/kvm: Add Hygon Dhyana support to KVM") Cc: Pu Wen Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 96e897d38a63..cda0b787b2e3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2550,7 +2550,7 @@ static void kvmclock_sync_fn(struct work_struct *work) static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? */ - if (guest_cpuid_is_amd(vcpu)) + if (guest_cpuid_is_amd_or_hygon(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; -- cgit From f91af5176cce77bb0d3292e46665c30af0792dcd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 4 Mar 2020 17:34:37 -0800 Subject: KVM: x86: Refactor kvm_cpuid() param that controls out-of-range logic Invert and rename the kvm_cpuid() param that controls out-of-range logic to better reflect the semantics of the affected callers, i.e. callers that bypass the out-of-range logic do so because they are looking up an exact guest CPUID entry, e.g. to query the maxphyaddr. Similarly, rename kvm_cpuid()'s internal "found" to "exact" to clarify that it tracks whether or not the exact requested leaf was found, as opposed to any usable leaf being found. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cda0b787b2e3..62cf170b31d4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6241,9 +6241,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, } static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit) + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, + bool exact_only) { - return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit); + return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only); } static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt) -- cgit From 727a7e27cf88a261c5a0f14f4f9ee4d767352766 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 5 Mar 2020 03:52:50 -0500 Subject: KVM: x86: rename set_cr3 callback and related flags to load_mmu_pgd The set_cr3 callback is not setting the guest CR3, it is setting the root of the guest page tables, either shadow or two-dimensional. To make this clearer as well as to indicate that the MMU calls it via kvm_mmu_load_cr3, rename it to load_mmu_pgd. 
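For orientation, a simplified picture of the renamed hook; the struct and signature below are illustrative, not the exact kvm_x86_ops layout:

struct kvm_vcpu;	/* opaque here */

struct kvm_x86_ops_sketch {
	/*
	 * Installs the root of the guest's page tables (a shadow root
	 * or a TDP/EPT/NPT root), which is not necessarily guest CR3,
	 * hence "load_mmu_pgd" rather than "set_cr3".
	 */
	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd);
};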
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 62cf170b31d4..c67324138e17 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8186,8 +8186,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); - if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu)) - kvm_mmu_load_cr3(vcpu); + if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) + kvm_mmu_load_pgd(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb(vcpu, true); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { -- cgit From ab56f8e62dafe4c9bec9fc236937c9884bd9966d Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 12 Mar 2020 05:39:28 -0500 Subject: kvm: svm: Introduce GA Log tracepoint for AVIC The GA Log tracepoint is useful when debugging AVIC performance issues, as it can be used with perf to count the number of times the IOMMU AVIC injects interrupts through the slow path instead of injecting them directly to the target vCPU. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c67324138e17..a7cb85231330 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10562,4 +10562,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); -- cgit From 8e205a6b2a06764a4c2bfc9e1a6a8a8e7920faf8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 14 Mar 2020 12:29:23 +0100 Subject: KVM: X86: correct meaningless kvm_apicv_activated() check After test_and_set_bit() for kvm->arch.apicv_inhibit_reasons, kvm_apicv_activated() is guaranteed to return false, because apicv_inhibit_reasons is known to be nonzero at that point. What the code wants to do is check whether APICv was *already* active and, if so, skip the costly request; we can do this using cmpxchg.
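For contrast with the diff below, the same retry pattern sketched in portable C11 atomics (the kernel uses cmpxchg() and the open-coded loop visible in the patch; the names here are invented):

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic unsigned long inhibit_reasons;

/* Returns true iff the overall activation state actually flipped. */
static bool update_inhibit_bit(unsigned long bit, bool activate)
{
	unsigned long old = atomic_load(&inhibit_reasons), new;

	do {
		new = activate ? (old & ~(1UL << bit)) : (old | (1UL << bit));
		if (new == old)
			break;	/* bit already in the desired state */
	} while (!atomic_compare_exchange_weak(&inhibit_reasons, &old, new));

	/* APICv is active iff the mask is zero: compare "was zero" vs "is zero". */
	return !!old != !!new;
}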
Reported-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a7cb85231330..e54c6ad628a8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8049,19 +8049,26 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); */ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { + unsigned long old, new, expected; + if (!kvm_x86_ops->check_apicv_inhibit_reasons || !kvm_x86_ops->check_apicv_inhibit_reasons(bit)) return; - if (activate) { - if (!test_and_clear_bit(bit, &kvm->arch.apicv_inhibit_reasons) || - !kvm_apicv_activated(kvm)) - return; - } else { - if (test_and_set_bit(bit, &kvm->arch.apicv_inhibit_reasons) || - kvm_apicv_activated(kvm)) - return; - } + old = READ_ONCE(kvm->arch.apicv_inhibit_reasons); + do { + expected = new = old; + if (activate) + __clear_bit(bit, &new); + else + __set_bit(bit, &new); + if (new == old) + break; + old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new); + } while (old != expected); + + if (!!old == !!new) + return; trace_kvm_apicv_update_request(activate, bit); if (kvm_x86_ops->pre_update_apicv_exec_ctrl) -- cgit From cf6c26ec7bf5386706cd6522708766eb6522995e Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Sat, 29 Feb 2020 10:52:12 +0800 Subject: KVM: x86: Code style cleanup in kvm_arch_dev_ioctl() In kvm_arch_dev_ioctl(), the brackets of case KVM_X86_GET_MCE_CAP_SUPPORTED accidentally encapsulate case KVM_GET_MSR_FEATURE_INDEX_LIST and case KVM_GET_MSRS. It doesn't affect functionality, but it is misleading. Remove the unnecessary brackets and opportunistically add a "break" in the default path. Suggested-by: Sean Christopherson Signed-off-by: Xiaoyao Li Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e54c6ad628a8..a4e62d767dd6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3488,7 +3488,7 @@ long kvm_arch_dev_ioctl(struct file *filp, r = 0; break; } - case KVM_X86_GET_MCE_CAP_SUPPORTED: { + case KVM_X86_GET_MCE_CAP_SUPPORTED: r = -EFAULT; if (copy_to_user(argp, &kvm_mce_cap_supported, sizeof(kvm_mce_cap_supported))) @@ -3520,9 +3520,9 @@ long kvm_arch_dev_ioctl(struct file *filp, case KVM_GET_MSRS: r = msr_io(NULL, argp, do_get_msr_feature, 1); break; - } default: r = -EINVAL; + break; } out: return r; -- cgit From 8a1038de11a5536c76054061837b11648bec5b46 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 26 Mar 2020 10:20:00 +0800 Subject: KVM: X86: Delay read msr data iff writes ICR MSR Delay reading the MSR data until we have identified that the guest is writing the ICR MSR, to avoid penalizing all other MSR writes.
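The deferred work is essentially the EDX:EAX composition below (a sketch; the real kvm_read_edx_eax() additionally goes through KVM's register cache, which is the cost being avoided):

#include <stdint.h>

/* WRMSR passes its 64-bit payload split across EDX:EAX. */
static inline uint64_t wrmsr_payload(uint32_t eax, uint32_t edx)
{
	return ((uint64_t)edx << 32) | eax;
}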
Signed-off-by: Wanpeng Li Message-Id: <1585189202-1708-2-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a4e62d767dd6..62d614586402 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1595,11 +1595,12 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u32 msr = kvm_rcx_read(vcpu); - u64 data = kvm_read_edx_eax(vcpu); + u64 data; int ret = 0; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): + data = kvm_read_edx_eax(vcpu); ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); break; default: -- cgit From d5361678e63c8a5e72d75cee6d15b840c44306f2 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 26 Mar 2020 10:20:02 +0800 Subject: KVM: X86: Micro-optimize IPI fastpath delay This patch optimizes the virtual IPI fastpath emulation sequence (old on the left, new on the right):

    write ICR2                      send virtual IPI
    read ICR2                       write ICR2
    send virtual IPI        ==>     write ICR
    write ICR

We can observe ~0.67% performance improvement for IPI microbenchmark (https://lore.kernel.org/kvm/20171219085010.4081-1-ynorov@caviumnetworks.com/) on Skylake server. Signed-off-by: Wanpeng Li Message-Id: <1585189202-1708-4-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 62d614586402..6fa014ccd253 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1585,8 +1585,12 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) { + data &= ~(1 << 12); + kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32)); kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); - return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data); + kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data); + trace_kvm_apic_write(APIC_ICR, (u32)data); + return 0; } return 1; -- cgit From b990408537388e9174b642ad36cdef6c47c64d3a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 21 Mar 2020 13:25:55 -0700 Subject: KVM: Pass kvm_init()'s opaque param to additional arch funcs Pass @opaque to kvm_arch_hardware_setup() and kvm_arch_check_processor_compat() to allow architecture specific code to reference @opaque without having to stash it away in a temporary global variable. This will enable x86 to separate its vendor specific callback ops, which are passed via @opaque, into "init" and "runtime" ops without having to stash away the "init" ops. No functional change intended.
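Under this scheme a vendor module hands its init ops to kvm_init() as the opaque cookie, and common code passes the same pointer back into the arch hooks. An illustrative call from the VMX side, with the shape this series establishes (treat the exact names as assumptions):

static int __init vmx_init_sketch(void)
{
	/* &vmx_init_ops travels through kvm_init() into the arch hooks. */
	return kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
			__alignof__(struct vcpu_vmx), THIS_MODULE);
}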
Reviewed-by: Cornelia Huck Tested-by: Cornelia Huck #s390 Acked-by: Marc Zyngier Signed-off-by: Sean Christopherson Message-Id: <20200321202603.19355-2-sean.j.christopherson@intel.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1b6d9ac9533c..a656fba5bd60 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9626,7 +9626,7 @@ void kvm_arch_hardware_disable(void) drop_user_return_notifiers(); } -int kvm_arch_hardware_setup(void) +int kvm_arch_hardware_setup(void *opaque) { int r; @@ -9667,7 +9667,7 @@ void kvm_arch_hardware_unsetup(void) kvm_x86_ops->hardware_unsetup(); } -int kvm_arch_check_processor_compat(void) +int kvm_arch_check_processor_compat(void *opaque) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); -- cgit From d008dfdb0e7012ddff5bd6c0d2abd3b8ec6e77f5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 21 Mar 2020 13:25:56 -0700 Subject: KVM: x86: Move init-only kvm_x86_ops to separate struct Move the kvm_x86_ops functions that are used only within the scope of kvm_init() into a separate struct, kvm_x86_init_ops. In addition to identifying the init-only functions without resorting to code comments, this also sets the stage for waiting until after ->hardware_setup() to set kvm_x86_ops. Setting kvm_x86_ops after ->hardware_setup() is desirable as many of the hooks are not usable until ->hardware_setup() completes. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200321202603.19355-3-sean.j.christopherson@intel.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a656fba5bd60..283ef4d919f8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7303,8 +7303,8 @@ static struct notifier_block pvclock_gtod_notifier = { int kvm_arch_init(void *opaque) { + struct kvm_x86_init_ops *ops = opaque; int r; - struct kvm_x86_ops *ops = opaque; if (kvm_x86_ops) { printk(KERN_ERR "kvm: already loaded the other module\n"); @@ -7359,7 +7359,7 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; - kvm_x86_ops = ops; + kvm_x86_ops = ops->runtime_ops; kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, @@ -9628,6 +9628,7 @@ void kvm_arch_hardware_disable(void) int kvm_arch_hardware_setup(void *opaque) { + struct kvm_x86_init_ops *ops = opaque; int r; rdmsrl_safe(MSR_EFER, &host_efer); @@ -9635,7 +9636,7 @@ int kvm_arch_hardware_setup(void *opaque) if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); - r = kvm_x86_ops->hardware_setup(); + r = ops->hardware_setup(); if (r != 0) return r; @@ -9670,13 +9671,14 @@ void kvm_arch_hardware_unsetup(void) int kvm_arch_check_processor_compat(void *opaque) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + struct kvm_x86_init_ops *ops = opaque; WARN_ON(!irqs_disabled()); if (kvm_host_cr4_reserved_bits(c) != cr4_reserved_bits) return -EIO; - return kvm_x86_ops->check_processor_compatibility(); + return ops->check_processor_compatibility(); } bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) -- cgit From 69c6f69aa3064ab6cc8426661f125ea75ffe899c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 21 Mar 2020 13:25:59 -0700 Subject: KVM: x86: Set
kvm_x86_ops only after ->hardware_setup() completes Set kvm_x86_ops with the vendor's ops only after ->hardware_setup() completes to "prevent" using kvm_x86_ops before they are ready, i.e. to generate a null pointer fault instead of silently consuming unconfigured state. An alternative implementation would be to have ->hardware_setup() return the vendor's ops, but that would require non-trivial refactoring, and would arguably result in less readable code, e.g. ->hardware_setup() would need to use ERR_PTR() in multiple locations, and each vendor's declaration of the runtime ops would be less obvious. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200321202603.19355-6-sean.j.christopherson@intel.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 283ef4d919f8..23b6c2e38d9e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7359,8 +7359,6 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; - kvm_x86_ops = ops->runtime_ops; - kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, PT_PRESENT_MASK, 0, sme_me_mask); @@ -9640,6 +9638,8 @@ int kvm_arch_hardware_setup(void *opaque) if (r != 0) return r; + kvm_x86_ops = ops->runtime_ops; + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) supported_xss = 0; -- cgit From afaf0b2f9b801c6eb2278b52d49e6a7d7b659cf1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 21 Mar 2020 13:26:00 -0700 Subject: KVM: x86: Copy kvm_x86_ops by value to eliminate layer of indirection Replace the kvm_x86_ops pointer in common x86 with an instance of the struct to save one pointer dereference when invoking functions. Copy the struct by value to set the ops during kvm_init(). Arbitrarily use kvm_x86_ops.hardware_enable to track whether or not the ops have been initialized, i.e. a vendor KVM module has been loaded. 
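Why the copy saves an indirection, in miniature (standalone sketch; names invented):

struct ops_sketch {
	int (*hardware_enable)(void);
};

static struct ops_sketch *ops_ptr;	/* old: load the pointer, then the target */
static struct ops_sketch ops_val;	/* new: target loads from a fixed address */

static int call_via_pointer(void)
{
	return ops_ptr->hardware_enable();	/* two dependent loads per call */
}

static int call_via_value(void)
{
	return ops_val.hardware_enable();	/* one load, from &ops_val directly */
}

This also explains the sentinel mentioned above: with a struct instance there is no NULL pointer to test, so a designated member (hardware_enable) being non-NULL stands in for "ops installed".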
Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-Id: <20200321202603.19355-7-sean.j.christopherson@intel.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 356 ++++++++++++++++++++++++++--------------------------- 1 file changed, 178 insertions(+), 178 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 23b6c2e38d9e..f055a79f93b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -110,7 +110,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); -struct kvm_x86_ops *kvm_x86_ops __read_mostly; +struct kvm_x86_ops kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); static bool __read_mostly ignore_msrs = 0; @@ -646,7 +646,7 @@ EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); */ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) { - if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) + if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl) return true; kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; @@ -787,7 +787,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!is_pae(vcpu)) return 1; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); if (cs_l) return 1; } else @@ -800,7 +800,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) return 1; - kvm_x86_ops->set_cr0(vcpu, cr0); + kvm_x86_ops.set_cr0(vcpu, cr0); if ((cr0 ^ old_cr0) & X86_CR0_PG) { kvm_clear_async_pf_completion_queue(vcpu); @@ -896,7 +896,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - if (kvm_x86_ops->get_cpl(vcpu) != 0 || + if (kvm_x86_ops.get_cpl(vcpu) != 0 || __kvm_set_xcr(vcpu, index, xcr)) { kvm_inject_gp(vcpu, 0); return 1; @@ -977,7 +977,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } - if (kvm_x86_ops->set_cr4(vcpu, cr4)) + if (kvm_x86_ops.set_cr4(vcpu, cr4)) return 1; if (((cr4 ^ old_cr4) & pdptr_bits) || @@ -1061,7 +1061,7 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) static void kvm_update_dr6(struct kvm_vcpu *vcpu) { if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6); + kvm_x86_ops.set_dr6(vcpu, vcpu->arch.dr6); } static void kvm_update_dr7(struct kvm_vcpu *vcpu) @@ -1072,7 +1072,7 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) dr7 = vcpu->arch.guest_debug_dr7; else dr7 = vcpu->arch.dr7; - kvm_x86_ops->set_dr7(vcpu, dr7); + kvm_x86_ops.set_dr7(vcpu, dr7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; @@ -1142,7 +1142,7 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) *val = vcpu->arch.dr6; else - *val = kvm_x86_ops->get_dr6(vcpu); + *val = kvm_x86_ops.get_dr6(vcpu); break; case 5: /* fall through */ @@ -1377,7 +1377,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) rdmsrl_safe(msr->index, &msr->data); break; default: - if (kvm_x86_ops->get_msr_feature(msr)) + if (kvm_x86_ops.get_msr_feature(msr)) return 1; } return 0; @@ -1445,7 +1445,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; - 
kvm_x86_ops->set_efer(vcpu, efer); + kvm_x86_ops.set_efer(vcpu, efer); /* Update reserved bits */ if ((efer ^ old_efer) & EFER_NX) @@ -1501,7 +1501,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, msr.index = index; msr.host_initiated = host_initiated; - return kvm_x86_ops->set_msr(vcpu, &msr); + return kvm_x86_ops.set_msr(vcpu, &msr); } /* @@ -1519,7 +1519,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, msr.index = index; msr.host_initiated = host_initiated; - ret = kvm_x86_ops->get_msr(vcpu, &msr); + ret = kvm_x86_ops.get_msr(vcpu, &msr); if (!ret) *data = msr.data; return ret; @@ -1905,7 +1905,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) { - u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); + u64 curr_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } @@ -1947,7 +1947,7 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); + u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); return tsc_offset + kvm_scale_tsc(vcpu, host_tsc); } @@ -1955,7 +1955,7 @@ EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { - vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset); + vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset); } static inline bool kvm_check_tsc_unstable(void) @@ -2079,7 +2079,7 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc); static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { - u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); + u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); } @@ -2677,7 +2677,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) { ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa); + kvm_x86_ops.tlb_flush(vcpu, invalidate_gpa); } static void record_steal_time(struct kvm_vcpu *vcpu) @@ -3394,10 +3394,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * fringe case that is not enabled except via specific settings * of the module parameters. */ - r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE); + r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE); break; case KVM_CAP_VAPIC: - r = !kvm_x86_ops->cpu_has_accelerated_tpr(); + r = !kvm_x86_ops.cpu_has_accelerated_tpr(); break; case KVM_CAP_NR_VCPUS: r = KVM_SOFT_MAX_VCPUS; @@ -3424,14 +3424,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_X2APIC_API_VALID_FLAGS; break; case KVM_CAP_NESTED_STATE: - r = kvm_x86_ops->get_nested_state ? - kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0; + r = kvm_x86_ops.get_nested_state ? 
+ kvm_x86_ops.get_nested_state(NULL, NULL, 0) : 0; break; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: - r = kvm_x86_ops->enable_direct_tlbflush != NULL; + r = kvm_x86_ops.enable_direct_tlbflush != NULL; break; case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - r = kvm_x86_ops->nested_enable_evmcs != NULL; + r = kvm_x86_ops.nested_enable_evmcs != NULL; break; default: break; @@ -3547,14 +3547,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Address WBINVD may be executed by guest */ if (need_emulate_wbinvd(vcpu)) { - if (kvm_x86_ops->has_wbinvd_exit()) + if (kvm_x86_ops.has_wbinvd_exit()) cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); else if (vcpu->cpu != -1 && vcpu->cpu != cpu) smp_call_function_single(vcpu->cpu, wbinvd_ipi, NULL, 1); } - kvm_x86_ops->vcpu_load(vcpu, cpu); + kvm_x86_ops.vcpu_load(vcpu, cpu); /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { @@ -3621,7 +3621,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) int idx; if (vcpu->preempted) - vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu); + vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu); /* * Disable page faults because we're in atomic context here. @@ -3640,7 +3640,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvm_steal_time_set_preempted(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); pagefault_enable(); - kvm_x86_ops->vcpu_put(vcpu); + kvm_x86_ops.vcpu_put(vcpu); vcpu->arch.last_host_tsc = rdtsc(); /* * If userspace has set any breakpoints or watchpoints, dr6 is restored @@ -3654,7 +3654,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { if (vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); + kvm_x86_ops.sync_pir_to_irr(vcpu); return kvm_apic_get_state(vcpu, s); } @@ -3762,7 +3762,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; - kvm_x86_ops->setup_mce(vcpu); + kvm_x86_ops.setup_mce(vcpu); out: return r; } @@ -3866,11 +3866,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; events->interrupt.soft = 0; - events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); + events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending != 0; - events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); + events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu); events->nmi.pad = 0; events->sipi_vector = 0; /* never valid when reporting to user space */ @@ -3937,13 +3937,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) - kvm_x86_ops->set_interrupt_shadow(vcpu, + kvm_x86_ops.set_interrupt_shadow(vcpu, events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) vcpu->arch.nmi_pending = events->nmi.pending; - kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); + kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && lapic_in_kernel(vcpu)) @@ -4217,9 +4217,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_hv_activate_synic(vcpu, cap->cap == 
KVM_CAP_HYPERV_SYNIC2); case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - if (!kvm_x86_ops->nested_enable_evmcs) + if (!kvm_x86_ops.nested_enable_evmcs) return -ENOTTY; - r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version); + r = kvm_x86_ops.nested_enable_evmcs(vcpu, &vmcs_version); if (!r) { user_ptr = (void __user *)(uintptr_t)cap->args[0]; if (copy_to_user(user_ptr, &vmcs_version, @@ -4228,10 +4228,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, } return r; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: - if (!kvm_x86_ops->enable_direct_tlbflush) + if (!kvm_x86_ops.enable_direct_tlbflush) return -ENOTTY; - return kvm_x86_ops->enable_direct_tlbflush(vcpu); + return kvm_x86_ops.enable_direct_tlbflush(vcpu); default: return -EINVAL; @@ -4534,7 +4534,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_data_size; r = -EINVAL; - if (!kvm_x86_ops->get_nested_state) + if (!kvm_x86_ops.get_nested_state) break; BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); @@ -4542,7 +4542,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (get_user(user_data_size, &user_kvm_nested_state->size)) break; - r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, + r = kvm_x86_ops.get_nested_state(vcpu, user_kvm_nested_state, user_data_size); if (r < 0) break; @@ -4564,7 +4564,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, int idx; r = -EINVAL; - if (!kvm_x86_ops->set_nested_state) + if (!kvm_x86_ops.set_nested_state) break; r = -EFAULT; @@ -4586,7 +4586,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + r = kvm_x86_ops.set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } @@ -4630,14 +4630,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) if (addr > (unsigned int)(-3 * PAGE_SIZE)) return -EINVAL; - ret = kvm_x86_ops->set_tss_addr(kvm, addr); + ret = kvm_x86_ops.set_tss_addr(kvm, addr); return ret; } static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { - return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr); + return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr); } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, @@ -4794,8 +4794,8 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) /* * Flush potentially hardware-cached dirty pages to dirty_bitmap. 
*/ - if (kvm_x86_ops->flush_log_dirty) - kvm_x86_ops->flush_log_dirty(kvm); + if (kvm_x86_ops.flush_log_dirty) + kvm_x86_ops.flush_log_dirty(kvm); } int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, @@ -5148,8 +5148,8 @@ set_identity_unlock: } case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; - if (kvm_x86_ops->mem_enc_op) - r = kvm_x86_ops->mem_enc_op(kvm, argp); + if (kvm_x86_ops.mem_enc_op) + r = kvm_x86_ops.mem_enc_op(kvm, argp); break; } case KVM_MEMORY_ENCRYPT_REG_REGION: { @@ -5160,8 +5160,8 @@ set_identity_unlock: goto out; r = -ENOTTY; - if (kvm_x86_ops->mem_enc_reg_region) - r = kvm_x86_ops->mem_enc_reg_region(kvm, ®ion); + if (kvm_x86_ops.mem_enc_reg_region) + r = kvm_x86_ops.mem_enc_reg_region(kvm, ®ion); break; } case KVM_MEMORY_ENCRYPT_UNREG_REGION: { @@ -5172,8 +5172,8 @@ set_identity_unlock: goto out; r = -ENOTTY; - if (kvm_x86_ops->mem_enc_unreg_region) - r = kvm_x86_ops->mem_enc_unreg_region(kvm, ®ion); + if (kvm_x86_ops.mem_enc_unreg_region) + r = kvm_x86_ops.mem_enc_unreg_region(kvm, ®ion); break; } case KVM_HYPERV_EVENTFD: { @@ -5268,7 +5268,7 @@ static void kvm_init_msr_list(void) } for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { - if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i])) + if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i])) continue; emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; @@ -5331,13 +5331,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) static void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - kvm_x86_ops->set_segment(vcpu, var, seg); + kvm_x86_ops.set_segment(vcpu, var, seg); } void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - kvm_x86_ops->get_segment(vcpu, var, seg); + kvm_x86_ops.get_segment(vcpu, var, seg); } gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, @@ -5357,14 +5357,14 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } @@ -5372,7 +5372,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } @@ -5421,7 +5421,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; unsigned offset; int ret; @@ -5446,7 +5446,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; /* * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED @@ -5467,7 +5467,7 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = 0; - if (!system && kvm_x86_ops->get_cpl(vcpu) == 3) + if (!system && kvm_x86_ops.get_cpl(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); @@ -5520,7 +5520,7 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = PFERR_WRITE_MASK; - if (!system && kvm_x86_ops->get_cpl(vcpu) == 3) + if (!system && kvm_x86_ops.get_cpl(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, @@ -5583,7 +5583,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) { - u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) + u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); /* @@ -5981,7 +5981,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { - return kvm_x86_ops->get_segment_base(vcpu, seg); + return kvm_x86_ops.get_segment_base(vcpu, seg); } static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) @@ -5994,7 +5994,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; - if (kvm_x86_ops->has_wbinvd_exit()) { + if (kvm_x86_ops.has_wbinvd_exit()) { int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); @@ -6099,27 +6099,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { - return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); + return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt)); } static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt); } static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt); } static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt); } static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt); } static unsigned long emulator_get_cached_segment_base( @@ -6241,7 +6241,7 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage, + return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage, &ctxt->exception); } @@ -6279,7 +6279,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon static 
void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) { - kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); + kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked); } static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) @@ -6295,7 +6295,7 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, const char *smstate) { - return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smstate); + return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate); } static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) @@ -6357,7 +6357,7 @@ static const struct x86_emulate_ops emulate_ops = { static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); + u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu); /* * an sti; sti; sequence only disable interrupts for the first * instruction. So, if the last instruction, be it emulated or @@ -6368,7 +6368,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) if (int_shadow & mask) mask = 0; if (unlikely(int_shadow || mask)) { - kvm_x86_ops->set_interrupt_shadow(vcpu, mask); + kvm_x86_ops.set_interrupt_shadow(vcpu, mask); if (!mask) kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -6410,7 +6410,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int cs_db, cs_l; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); ctxt->gpa_available = false; ctxt->eflags = kvm_get_rflags(vcpu); @@ -6471,7 +6471,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) kvm_queue_exception(vcpu, UD_VECTOR); - if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { + if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -6652,10 +6652,10 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + unsigned long rflags = kvm_x86_ops.get_rflags(vcpu); int r; - r = kvm_x86_ops->skip_emulated_instruction(vcpu); + r = kvm_x86_ops.skip_emulated_instruction(vcpu); if (unlikely(!r)) return 0; @@ -6890,7 +6890,7 @@ restart: r = 1; if (writeback) { - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + unsigned long rflags = kvm_x86_ops.get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; if (!ctxt->have_exception || @@ -6898,8 +6898,8 @@ restart: kvm_rip_write(vcpu, ctxt->eip); if (r && ctxt->tf) r = kvm_vcpu_do_singlestep(vcpu); - if (kvm_x86_ops->update_emulated_instruction) - kvm_x86_ops->update_emulated_instruction(vcpu); + if (kvm_x86_ops.update_emulated_instruction) + kvm_x86_ops.update_emulated_instruction(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -7226,7 +7226,7 @@ static int kvm_is_user_mode(void) int user_mode = 3; if (__this_cpu_read(current_vcpu)) - user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu)); + user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu)); return user_mode != 0; } @@ -7306,7 +7306,7 @@ int kvm_arch_init(void *opaque) struct kvm_x86_init_ops *ops = opaque; int r; - if (kvm_x86_ops) { + if (kvm_x86_ops.hardware_enable) { 
printk(KERN_ERR "kvm: already loaded the other module\n"); r = -EEXIST; goto out; @@ -7409,7 +7409,7 @@ void kvm_arch_exit(void) #ifdef CONFIG_X86_64 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); #endif - kvm_x86_ops = NULL; + kvm_x86_ops.hardware_enable = NULL; kvm_mmu_module_exit(); free_percpu(shared_msrs); kmem_cache_destroy(x86_fpu_cache); @@ -7547,7 +7547,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a3 &= 0xFFFFFFFF; } - if (kvm_x86_ops->get_cpl(vcpu) != 0) { + if (kvm_x86_ops.get_cpl(vcpu) != 0) { ret = -KVM_EPERM; goto out; } @@ -7593,7 +7593,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); - kvm_x86_ops->patch_hypercall(vcpu, instruction); + kvm_x86_ops.patch_hypercall(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, &ctxt->exception); @@ -7622,7 +7622,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) { int max_irr, tpr; - if (!kvm_x86_ops->update_cr8_intercept) + if (!kvm_x86_ops.update_cr8_intercept) return; if (!lapic_in_kernel(vcpu)) @@ -7641,7 +7641,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) tpr = kvm_lapic_get_cr8(vcpu); - kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); + kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr); } static int inject_pending_event(struct kvm_vcpu *vcpu) @@ -7651,7 +7651,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) /* try to reinject previous events if any */ if (vcpu->arch.exception.injected) - kvm_x86_ops->queue_exception(vcpu); + kvm_x86_ops.queue_exception(vcpu); /* * Do not inject an NMI or interrupt if there is a pending * exception. Exceptions and interrupts are recognized at @@ -7668,9 +7668,9 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) */ else if (!vcpu->arch.exception.pending) { if (vcpu->arch.nmi_injected) - kvm_x86_ops->set_nmi(vcpu); + kvm_x86_ops.set_nmi(vcpu); else if (vcpu->arch.interrupt.injected) - kvm_x86_ops->set_irq(vcpu); + kvm_x86_ops.set_irq(vcpu); } /* @@ -7679,8 +7679,8 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) * from L2 to L1 due to pending L1 events which require exit * from L2 to L1. */ - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { - r = kvm_x86_ops->check_nested_events(vcpu); + if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) { + r = kvm_x86_ops.check_nested_events(vcpu); if (r != 0) return r; } @@ -7717,7 +7717,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) } } - kvm_x86_ops->queue_exception(vcpu); + kvm_x86_ops.queue_exception(vcpu); } /* Don't consider new event if we re-injected an event */ @@ -7725,14 +7725,14 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) return 0; if (vcpu->arch.smi_pending && !is_smm(vcpu) && - kvm_x86_ops->smi_allowed(vcpu)) { + kvm_x86_ops.smi_allowed(vcpu)) { vcpu->arch.smi_pending = false; ++vcpu->arch.smi_count; enter_smm(vcpu); - } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { + } else if (vcpu->arch.nmi_pending && kvm_x86_ops.nmi_allowed(vcpu)) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; - kvm_x86_ops->set_nmi(vcpu); + kvm_x86_ops.set_nmi(vcpu); } else if (kvm_cpu_has_injectable_intr(vcpu)) { /* * Because interrupts can be injected asynchronously, we are @@ -7741,15 +7741,15 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) * proposal and current concerns. Perhaps we should be setting * KVM_REQ_EVENT only on certain events and not unconditionally? 
*/ - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { - r = kvm_x86_ops->check_nested_events(vcpu); + if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) { + r = kvm_x86_ops.check_nested_events(vcpu); if (r != 0) return r; } - if (kvm_x86_ops->interrupt_allowed(vcpu)) { + if (kvm_x86_ops.interrupt_allowed(vcpu)) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - kvm_x86_ops->set_irq(vcpu); + kvm_x86_ops.set_irq(vcpu); } } @@ -7765,7 +7765,7 @@ static void process_nmi(struct kvm_vcpu *vcpu) * If an NMI is already in progress, limit further NMIs to just one. * Otherwise, allow two (and we'll inject the first one immediately). */ - if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) + if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) limit = 1; vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); @@ -7855,11 +7855,11 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7f7c, seg.limit); put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); - kvm_x86_ops->get_gdt(vcpu, &dt); + kvm_x86_ops.get_gdt(vcpu, &dt); put_smstate(u32, buf, 0x7f74, dt.address); put_smstate(u32, buf, 0x7f70, dt.size); - kvm_x86_ops->get_idt(vcpu, &dt); + kvm_x86_ops.get_idt(vcpu, &dt); put_smstate(u32, buf, 0x7f58, dt.address); put_smstate(u32, buf, 0x7f54, dt.size); @@ -7909,7 +7909,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7e94, seg.limit); put_smstate(u64, buf, 0x7e98, seg.base); - kvm_x86_ops->get_idt(vcpu, &dt); + kvm_x86_ops.get_idt(vcpu, &dt); put_smstate(u32, buf, 0x7e84, dt.size); put_smstate(u64, buf, 0x7e88, dt.address); @@ -7919,7 +7919,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7e74, seg.limit); put_smstate(u64, buf, 0x7e78, seg.base); - kvm_x86_ops->get_gdt(vcpu, &dt); + kvm_x86_ops.get_gdt(vcpu, &dt); put_smstate(u32, buf, 0x7e64, dt.size); put_smstate(u64, buf, 0x7e68, dt.address); @@ -7949,28 +7949,28 @@ static void enter_smm(struct kvm_vcpu *vcpu) * vCPU state (e.g. leave guest mode) after we've saved the state into * the SMM state-save area. */ - kvm_x86_ops->pre_enter_smm(vcpu, buf); + kvm_x86_ops.pre_enter_smm(vcpu, buf); vcpu->arch.hflags |= HF_SMM_MASK; kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); - if (kvm_x86_ops->get_nmi_mask(vcpu)) + if (kvm_x86_ops.get_nmi_mask(vcpu)) vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else - kvm_x86_ops->set_nmi_mask(vcpu, true); + kvm_x86_ops.set_nmi_mask(vcpu, true); kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0x8000); cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); - kvm_x86_ops->set_cr0(vcpu, cr0); + kvm_x86_ops.set_cr0(vcpu, cr0); vcpu->arch.cr0 = cr0; - kvm_x86_ops->set_cr4(vcpu, 0); + kvm_x86_ops.set_cr4(vcpu, 0); /* Undocumented: IDT limit is set to zero on entry to SMM. 
 */
 	dt.address = dt.size = 0;
-	kvm_x86_ops->set_idt(vcpu, &dt);
+	kvm_x86_ops.set_idt(vcpu, &dt);

 	__kvm_set_dr(vcpu, 7, DR7_FIXED_1);
@@ -8001,7 +8001,7 @@ static void enter_smm(struct kvm_vcpu *vcpu)

 #ifdef CONFIG_X86_64
 	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
-		kvm_x86_ops->set_efer(vcpu, 0);
+		kvm_x86_ops.set_efer(vcpu, 0);
 #endif

 	kvm_update_cpuid(vcpu);
@@ -8039,7 +8039,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)

 	vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
 	kvm_apic_update_apicv(vcpu);
-	kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
+	kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);

@@ -8054,8 +8054,8 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
 {
 	unsigned long old, new, expected;

-	if (!kvm_x86_ops->check_apicv_inhibit_reasons ||
-	    !kvm_x86_ops->check_apicv_inhibit_reasons(bit))
+	if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
+	    !kvm_x86_ops.check_apicv_inhibit_reasons(bit))
 		return;

 	old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
@@ -8074,8 +8074,8 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
 		return;

 	trace_kvm_apicv_update_request(activate, bit);
-	if (kvm_x86_ops->pre_update_apicv_exec_ctrl)
-		kvm_x86_ops->pre_update_apicv_exec_ctrl(kvm, activate);
+	if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
+		kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
 	kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
 }
 EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
@@ -8091,7 +8091,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 		kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
 	else {
 		if (vcpu->arch.apicv_active)
-			kvm_x86_ops->sync_pir_to_irr(vcpu);
+			kvm_x86_ops.sync_pir_to_irr(vcpu);
 		if (ioapic_in_kernel(vcpu->kvm))
 			kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
 	}
@@ -8111,7 +8111,7 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)

 	bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
 		  vcpu_to_synic(vcpu)->vec_bitmap, 256);
-	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+	kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
 }

 int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
@@ -8138,13 +8138,13 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 	if (!lapic_in_kernel(vcpu))
 		return;

-	if (!kvm_x86_ops->set_apic_access_page_addr)
+	if (!kvm_x86_ops.set_apic_access_page_addr)
 		return;

 	page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
 	if (is_error_page(page))
 		return;
-	kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
+	kvm_x86_ops.set_apic_access_page_addr(vcpu, page_to_phys(page));

 	/*
 	 * Do not pin apic access page in memory, the MMU notifier
@@ -8176,7 +8176,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)

 	if (kvm_request_pending(vcpu)) {
 		if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
-			if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) {
+			if (unlikely(!kvm_x86_ops.get_vmcs12_pages(vcpu))) {
 				r = 0;
 				goto out;
 			}
@@ -8300,12 +8300,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		 *    SMI.
 		 */
 		if (vcpu->arch.smi_pending && !is_smm(vcpu))
-			if (!kvm_x86_ops->enable_smi_window(vcpu))
+			if (!kvm_x86_ops.enable_smi_window(vcpu))
 				req_immediate_exit = true;
 		if (vcpu->arch.nmi_pending)
-			kvm_x86_ops->enable_nmi_window(vcpu);
+			kvm_x86_ops.enable_nmi_window(vcpu);
 		if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
-			kvm_x86_ops->enable_irq_window(vcpu);
+			kvm_x86_ops.enable_irq_window(vcpu);
 		WARN_ON(vcpu->arch.exception.pending);
 	}

@@ -8322,7 +8322,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)

 	preempt_disable();

-	kvm_x86_ops->prepare_guest_switch(vcpu);
+	kvm_x86_ops.prepare_guest_switch(vcpu);

 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
@@ -8353,7 +8353,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	 * notified with kvm_vcpu_kick.
 	 */
 	if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
-		kvm_x86_ops->sync_pir_to_irr(vcpu);
+		kvm_x86_ops.sync_pir_to_irr(vcpu);

 	if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
 	    || need_resched() || signal_pending(current)) {
@@ -8368,7 +8368,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)

 	if (req_immediate_exit) {
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
-		kvm_x86_ops->request_immediate_exit(vcpu);
+		kvm_x86_ops.request_immediate_exit(vcpu);
 	}

 	trace_kvm_entry(vcpu->vcpu_id);
@@ -8388,7 +8388,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
 	}

-	kvm_x86_ops->run(vcpu);
+	kvm_x86_ops.run(vcpu);

 	/*
 	 * Do this here before restoring debug registers on the host.  And
@@ -8398,7 +8398,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	 */
 	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
 		WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
-		kvm_x86_ops->sync_dirty_debug_regs(vcpu);
+		kvm_x86_ops.sync_dirty_debug_regs(vcpu);
 		kvm_update_dr0123(vcpu);
 		kvm_update_dr6(vcpu);
 		kvm_update_dr7(vcpu);
@@ -8420,7 +8420,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();

-	kvm_x86_ops->handle_exit_irqoff(vcpu, &exit_fastpath);
+	kvm_x86_ops.handle_exit_irqoff(vcpu, &exit_fastpath);

 	/*
 	 * Consume any pending interrupts, including the possible source of
@@ -8463,11 +8463,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.apic_attention)
 		kvm_lapic_sync_from_vapic(vcpu);

-	r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath);
+	r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
 	return r;

 cancel_injection:
-	kvm_x86_ops->cancel_injection(vcpu);
+	kvm_x86_ops.cancel_injection(vcpu);
 	if (unlikely(vcpu->arch.apic_attention))
 		kvm_lapic_sync_from_vapic(vcpu);
 out:
@@ -8477,13 +8477,13 @@ out:
 static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 {
 	if (!kvm_arch_vcpu_runnable(vcpu) &&
-	    (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
+	    (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) {
 		srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 		kvm_vcpu_block(vcpu);
 		vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);

-		if (kvm_x86_ops->post_block)
-			kvm_x86_ops->post_block(vcpu);
+		if (kvm_x86_ops.post_block)
+			kvm_x86_ops.post_block(vcpu);

 		if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
 			return 1;
@@ -8509,8 +8509,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)

 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 {
-	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
-		kvm_x86_ops->check_nested_events(vcpu);
+	if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events)
+		kvm_x86_ops.check_nested_events(vcpu);

 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 		!vcpu->arch.apf.halted);
@@ -8666,7 +8666,7 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)

 	kvm_save_current_fpu(vcpu->arch.user_fpu);

-	/* PKRU is separately restored in kvm_x86_ops->run. */
+	/* PKRU is separately restored in kvm_x86_ops.run. */
 	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
 				~XFEATURE_MASK_PKRU);

@@ -8869,10 +8869,10 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
 	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);

-	kvm_x86_ops->get_idt(vcpu, &dt);
+	kvm_x86_ops.get_idt(vcpu, &dt);
 	sregs->idt.limit = dt.size;
 	sregs->idt.base = dt.address;
-	kvm_x86_ops->get_gdt(vcpu, &dt);
+	kvm_x86_ops.get_gdt(vcpu, &dt);
 	sregs->gdt.limit = dt.size;
 	sregs->gdt.base = dt.address;

@@ -9019,10 +9019,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)

 	dt.size = sregs->idt.limit;
 	dt.address = sregs->idt.base;
-	kvm_x86_ops->set_idt(vcpu, &dt);
+	kvm_x86_ops.set_idt(vcpu, &dt);
 	dt.size = sregs->gdt.limit;
 	dt.address = sregs->gdt.base;
-	kvm_x86_ops->set_gdt(vcpu, &dt);
+	kvm_x86_ops.set_gdt(vcpu, &dt);

 	vcpu->arch.cr2 = sregs->cr2;
 	mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
@@ -9032,16 +9032,16 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	kvm_set_cr8(vcpu, sregs->cr8);

 	mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
-	kvm_x86_ops->set_efer(vcpu, sregs->efer);
+	kvm_x86_ops.set_efer(vcpu, sregs->efer);

 	mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
-	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+	kvm_x86_ops.set_cr0(vcpu, sregs->cr0);
 	vcpu->arch.cr0 = sregs->cr0;

 	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
 	cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
 				(X86_CR4_OSXSAVE | X86_CR4_PKE));
-	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+	kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
 	if (cpuid_update_needed)
 		kvm_update_cpuid(vcpu);

@@ -9147,7 +9147,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 	 */
 	kvm_set_rflags(vcpu, rflags);

-	kvm_x86_ops->update_bp_intercept(vcpu);
+	kvm_x86_ops.update_bp_intercept(vcpu);

 	r = 0;

@@ -9358,7 +9358,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)

 	kvm_hv_vcpu_init(vcpu);

-	r = kvm_x86_ops->vcpu_create(vcpu);
+	r = kvm_x86_ops.vcpu_create(vcpu);
 	if (r)
 		goto free_guest_fpu;

@@ -9425,7 +9425,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)

 	kvmclock_reset(vcpu);

-	kvm_x86_ops->vcpu_free(vcpu);
+	kvm_x86_ops.vcpu_free(vcpu);

 	kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
 	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
@@ -9513,7 +9513,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)

 	vcpu->arch.ia32_xss = 0;

-	kvm_x86_ops->vcpu_reset(vcpu, init_event);
+	kvm_x86_ops.vcpu_reset(vcpu, init_event);
 }

 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -9538,7 +9538,7 @@ int kvm_arch_hardware_enable(void)
 	bool stable, backwards_tsc = false;

 	kvm_shared_msr_cpu_online();
-	ret = kvm_x86_ops->hardware_enable();
+	ret = kvm_x86_ops.hardware_enable();
 	if (ret != 0)
 		return ret;

@@ -9620,7 +9620,7 @@ int kvm_arch_hardware_enable(void)

 void kvm_arch_hardware_disable(void)
 {
-	kvm_x86_ops->hardware_disable();
+	kvm_x86_ops.hardware_disable();
 	drop_user_return_notifiers();
 }

@@ -9638,7 +9638,7 @@ int kvm_arch_hardware_setup(void *opaque)
 	if (r != 0)
 		return r;

-	kvm_x86_ops = ops->runtime_ops;
+	memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));

 	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
 		supported_xss = 0;
@@ -9665,7 +9665,7 @@ int kvm_arch_hardware_setup(void *opaque)

 void kvm_arch_hardware_unsetup(void)
 {
-	kvm_x86_ops->hardware_unsetup();
+	kvm_x86_ops.hardware_unsetup();
 }

 int kvm_arch_check_processor_compat(void *opaque)
@@ -9704,7 +9704,7 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 		pmu->need_cleanup = true;
 		kvm_make_request(KVM_REQ_PMU, vcpu);
 	}
-	kvm_x86_ops->sched_in(vcpu, cpu);
+	kvm_x86_ops.sched_in(vcpu, cpu);
 }

 void kvm_arch_free_vm(struct kvm *kvm)
@@ -9748,7 +9748,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	kvm_page_track_init(kvm);
 	kvm_mmu_init_vm(kvm);

-	return kvm_x86_ops->vm_init(kvm);
+	return kvm_x86_ops.vm_init(kvm);
 }

 int kvm_arch_post_init_vm(struct kvm *kvm)
@@ -9871,8 +9871,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 		__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
 		mutex_unlock(&kvm->slots_lock);
 	}
-	if (kvm_x86_ops->vm_destroy)
-		kvm_x86_ops->vm_destroy(kvm);
+	if (kvm_x86_ops.vm_destroy)
+		kvm_x86_ops.vm_destroy(kvm);
 	kvm_pic_destroy(kvm);
 	kvm_ioapic_destroy(kvm);
 	kvm_free_vcpus(kvm);
@@ -10010,7 +10010,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 	/*
 	 * Call kvm_x86_ops dirty logging hooks when they are valid.
 	 *
-	 * kvm_x86_ops->slot_disable_log_dirty is called when:
+	 * kvm_x86_ops.slot_disable_log_dirty is called when:
 	 *
 	 *  - KVM_MR_CREATE with dirty logging is disabled
 	 *  - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
@@ -10022,7 +10022,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 	 * any additional overhead from PML when guest is running with dirty
 	 * logging disabled for memory slots.
 	 *
-	 * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
+	 * kvm_x86_ops.slot_enable_log_dirty is called when switching new slot
 	 * to dirty logging mode.
 	 *
 	 * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
@@ -10038,8 +10038,8 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 	 * See the comments in fast_page_fault().
 	 */
 	if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
-		if (kvm_x86_ops->slot_enable_log_dirty) {
-			kvm_x86_ops->slot_enable_log_dirty(kvm, new);
+		if (kvm_x86_ops.slot_enable_log_dirty) {
+			kvm_x86_ops.slot_enable_log_dirty(kvm, new);
 		} else {
 			int level =
 				kvm_dirty_log_manual_protect_and_init_set(kvm) ?
@@ -10056,8 +10056,8 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 			kvm_mmu_slot_remove_write_access(kvm, new, level);
 		}
 	} else {
-		if (kvm_x86_ops->slot_disable_log_dirty)
-			kvm_x86_ops->slot_disable_log_dirty(kvm, new);
+		if (kvm_x86_ops.slot_disable_log_dirty)
+			kvm_x86_ops.slot_disable_log_dirty(kvm, new);
 	}
 }

@@ -10125,8 +10125,8 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
 {
 	return (is_guest_mode(vcpu) &&
-			kvm_x86_ops->guest_apic_has_interrupt &&
-			kvm_x86_ops->guest_apic_has_interrupt(vcpu));
+			kvm_x86_ops.guest_apic_has_interrupt &&
+			kvm_x86_ops.guest_apic_has_interrupt(vcpu));
 }

 static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
@@ -10145,7 +10145,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)

 	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
 	    (vcpu->arch.nmi_pending &&
-	     kvm_x86_ops->nmi_allowed(vcpu)))
+	     kvm_x86_ops.nmi_allowed(vcpu)))
 		return true;

 	if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
@@ -10178,7 +10178,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
 		 kvm_test_request(KVM_REQ_EVENT, vcpu))
 		return true;

-	if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu))
+	if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))
 		return true;

 	return false;
@@ -10196,7 +10196,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)

 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-	return kvm_x86_ops->interrupt_allowed(vcpu);
+	return kvm_x86_ops.interrupt_allowed(vcpu);
 }

 unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
@@ -10218,7 +10218,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
 {
 	unsigned long rflags;

-	rflags = kvm_x86_ops->get_rflags(vcpu);
+	rflags = kvm_x86_ops.get_rflags(vcpu);
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		rflags &= ~X86_EFLAGS_TF;
 	return rflags;
@@ -10230,7 +10230,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
 	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
 		rflags |= X86_EFLAGS_TF;
-	kvm_x86_ops->set_rflags(vcpu, rflags);
+	kvm_x86_ops.set_rflags(vcpu, rflags);
 }

 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
@@ -10341,7 +10341,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
 	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
 	    (vcpu->arch.apf.send_user_only &&
-	     kvm_x86_ops->get_cpl(vcpu) == 0))
+	     kvm_x86_ops.get_cpl(vcpu) == 0))
 		return false;

 	return true;
@@ -10361,7 +10361,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 	 * If interrupts are off we cannot even use an artificial
 	 * halt state.
 	 */
-	return kvm_x86_ops->interrupt_allowed(vcpu);
+	return kvm_x86_ops.interrupt_allowed(vcpu);
 }

 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
@@ -10490,7 +10490,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,

 	irqfd->producer = prod;

-	return kvm_x86_ops->update_pi_irte(irqfd->kvm,
+	return kvm_x86_ops.update_pi_irte(irqfd->kvm,
 					   prod->irq, irqfd->gsi, 1);
 }

@@ -10510,7 +10510,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 	 * when the irq is masked/disabled or the consumer side (KVM
 	 * int this case doesn't want to receive the interrupts.
 	 */
-	ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+	ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
 	if (ret)
 		printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
 		       " fails: %d\n", irqfd->consumer.token, ret);
@@ -10519,7 +10519,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
 				   uint32_t guest_irq, bool set)
 {
-	return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
+	return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set);
 }

 bool kvm_vector_hashing_enabled(void)
-- 
cgit
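The patch above converts kvm_x86_ops from a pointer into a by-value struct: kvm_arch_hardware_setup() memcpy()s the vendor's runtime_ops into the global instance, every call site trades "->" for ".", and optional hooks keep their NULL checks. A minimal standalone sketch of that pattern follows; it is illustrative only, and every name in it (demo_ops, svm_runtime_ops, demo_hardware_setup, fake_hardware_enable) is hypothetical rather than taken from the kernel tree:

	#include <stdio.h>
	#include <string.h>

	struct demo_ops {
		int  (*hardware_enable)(void);
		void (*optional_hook)(int arg);	/* may be left NULL, like post_block */
	};

	/* Global instance filled once at setup; callers then use '.', not '->'. */
	static struct demo_ops demo_ops;

	static int fake_hardware_enable(void)
	{
		return 0;
	}

	static const struct demo_ops svm_runtime_ops = {
		.hardware_enable = fake_hardware_enable,
		/* .optional_hook intentionally left unset */
	};

	static int demo_hardware_setup(const struct demo_ops *runtime_ops)
	{
		/* The by-value copy that replaces 'demo_ops = runtime_ops;'. */
		memcpy(&demo_ops, runtime_ops, sizeof(demo_ops));
		return demo_ops.hardware_enable();
	}

	int main(void)
	{
		int ret = demo_hardware_setup(&svm_runtime_ops);

		/* Optional hooks are still NULL-checked; only the syntax changes. */
		if (demo_ops.optional_hook)
			demo_ops.optional_hook(ret);

		printf("hardware_enable() returned %d\n", ret);
		return 0;
	}

The payoff is that each callback invocation loads its function pointer directly from the global struct instead of first dereferencing a pointer to locate the ops table, removing one dependent load per call.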