Diffstat (limited to 'virt/kvm/kvm_main.c')
-rw-r--r-- | virt/kvm/kvm_main.c | 1704 |
1 file changed, 802 insertions, 902 deletions
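For orientation before the hunks: the bulk of this diff replaces the old gfn_to_pfn()/kvm_release_pfn_*() family with kvm_follow_pfn()-based helpers such as __kvm_faultin_pfn(). The fragment below is a minimal caller sketch, not taken from the patch; example_fault() and the elided mapping step are hypothetical placeholders, and only the signatures visible in the hunks below are assumed.

/*
 * Minimal caller sketch, NOT part of this patch: it only illustrates how an
 * architecture fault path is expected to consume the reworked API, using the
 * signatures added in the hunks below.  example_fault() and the elided
 * "install the translation" step are hypothetical placeholders.
 */
static int example_fault(struct kvm_vcpu *vcpu,
			 const struct kvm_memory_slot *slot,
			 gfn_t gfn, bool write)
{
	struct page *refcounted_page;
	bool writable;
	kvm_pfn_t pfn;

	/* FOLL_WRITE asks for a writable mapping; writable reports what was granted. */
	pfn = __kvm_faultin_pfn(slot, gfn, write ? FOLL_WRITE : 0,
				&writable, &refcounted_page);
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	/* ... install the translation in the secondary MMU under mmu_lock ... */

	/*
	 * Drop the reference once the mapping is installed; both release
	 * helpers below tolerate a NULL page for non-refcounted pfns.
	 */
	if (write && writable)
		kvm_release_page_dirty(refcounted_page);
	else
		kvm_release_page_clean(refcounted_page);

	return 0;
}

Compared with the removed gfn_to_pfn()/kvm_release_pfn_*() path, the caller now receives the backing struct page directly, so kvm_pfn_to_refcounted_page() (deleted below) is no longer needed.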
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fb49c2a60200..eec82775c5bf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1,9 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. + * Kernel-based Virtual Machine (KVM) Hypervisor * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. @@ -74,6 +71,7 @@ #define ITOA_MAX_LEN 12 MODULE_AUTHOR("Qumranet"); +MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor"); MODULE_LICENSE("GPL"); /* Architectures should define their poll value according to the halt latency */ @@ -91,12 +89,19 @@ unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ module_param(halt_poll_ns_grow_start, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start); -/* Default resets per-vcpu halt_poll_ns . */ -unsigned int halt_poll_ns_shrink; +/* Default halves per-vcpu halt_poll_ns. */ +unsigned int halt_poll_ns_shrink = 2; module_param(halt_poll_ns_shrink, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); /* + * Allow direct access (from KVM or the CPU) without MMU notifier protection + * to unpinned pages. + */ +static bool allow_unsafe_mappings; +module_param(allow_unsafe_mappings, bool, 0444); + +/* * Ordering of locks: * * kvm->lock --> kvm->slots_lock --> kvm->irq_lock @@ -110,8 +115,7 @@ static struct kmem_cache *kvm_vcpu_cache; static __read_mostly struct preempt_ops kvm_preempt_ops; static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu); -struct dentry *kvm_debugfs_dir; -EXPORT_SYMBOL_GPL(kvm_debugfs_dir); +static struct dentry *kvm_debugfs_dir; static const struct file_operations stat_fops_per_vm; @@ -139,8 +143,6 @@ static int kvm_no_compat_open(struct inode *inode, struct file *file) #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ .open = kvm_no_compat_open #endif -static int hardware_enable_all(void); -static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); @@ -156,52 +158,6 @@ __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { } -bool kvm_is_zone_device_page(struct page *page) -{ - /* - * The metadata used by is_zone_device_page() to determine whether or - * not a page is ZONE_DEVICE is guaranteed to be valid if and only if - * the device has been pinned, e.g. by get_user_pages(). WARN if the - * page_count() is zero to help detect bad usage of this helper. - */ - if (WARN_ON_ONCE(!page_count(page))) - return false; - - return is_zone_device_page(page); -} - -/* - * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted - * page, NULL otherwise. Note, the list of refcounted PG_reserved page types - * is likely incomplete, it has been compiled purely through people wanting to - * back guest with a certain type of memory and encountering issues. - */ -struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn) -{ - struct page *page; - - if (!pfn_valid(pfn)) - return NULL; - - page = pfn_to_page(pfn); - if (!PageReserved(page)) - return page; - - /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */ - if (is_zero_pfn(pfn)) - return page; - - /* - * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting - * perspective they are "normal" pages, albeit with slightly different - * usage rules. 
- */ - if (kvm_is_zone_device_page(page)) - return page; - - return NULL; -} - /* * Switches to specified vcpu, until a matching vcpu_put() */ @@ -311,8 +267,7 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, return called; } -bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, - struct kvm_vcpu *except) +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) { struct kvm_vcpu *vcpu; struct cpumask *cpus; @@ -325,22 +280,14 @@ bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask); cpumask_clear(cpus); - kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu == except) - continue; + kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_vcpu_request(vcpu, req, cpus, me); - } called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); put_cpu(); return called; } - -bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) -{ - return kvm_make_all_cpus_request_except(kvm, req, NULL); -} EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request); void kvm_flush_remote_tlbs(struct kvm *kvm) @@ -401,12 +348,17 @@ static void kvm_flush_shadow_all(struct kvm *kvm) static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, gfp_t gfp_flags) { + void *page; + gfp_flags |= mc->gfp_zero; if (mc->kmem_cache) return kmem_cache_alloc(mc->kmem_cache, gfp_flags); - else - return (void *)__get_free_page(gfp_flags); + + page = (void *)__get_free_page(gfp_flags); + if (page && mc->init_value) + memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64)); + return page; } int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min) @@ -421,6 +373,13 @@ int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, if (WARN_ON_ONCE(!capacity)) return -EIO; + /* + * Custom init values can be used only for page allocations, + * and obviously conflict with __GFP_ZERO. + */ + if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero))) + return -EIO; + mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp); if (!mc->objects) return -ENOMEM; @@ -486,6 +445,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->kvm = kvm; vcpu->vcpu_id = id; vcpu->pid = NULL; + rwlock_init(&vcpu->pid_lock); #ifndef __KVM_HAVE_ARCH_WQP rcuwait_init(&vcpu->wait); #endif @@ -513,7 +473,7 @@ static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) * the vcpu->pid pointer, and at destruction time all file descriptors * are already gone. */ - put_pid(rcu_dereference_protected(vcpu->pid, 1)); + put_pid(vcpu->pid); free_page((unsigned long)vcpu->run); kmem_cache_free(kvm_vcpu_cache, vcpu); @@ -527,6 +487,14 @@ void kvm_destroy_vcpus(struct kvm *kvm) kvm_for_each_vcpu(i, vcpu, kvm) { kvm_vcpu_destroy(vcpu); xa_erase(&kvm->vcpu_array, i); + + /* + * Assert that the vCPU isn't visible in any way, to ensure KVM + * doesn't trigger a use-after-free if destroying vCPUs results + * in VM-wide request, e.g. to flush remote TLBs when tearing + * down MMUs, or to mark the VM dead if a KVM_BUG_ON() fires. 
+ */ + WARN_ON_ONCE(xa_load(&kvm->vcpu_array, i) || kvm_get_vcpu(kvm, i)); } atomic_set(&kvm->online_vcpus, 0); @@ -555,6 +523,7 @@ struct kvm_mmu_notifier_range { on_lock_fn_t on_lock; bool flush_on_ret; bool may_block; + bool lockless; }; /* @@ -583,16 +552,14 @@ static void kvm_null_fn(void) } #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn) -static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG; - /* Iterate over each memslot intersecting [start, last] (inclusive) range */ #define kvm_for_each_memslot_in_hva_range(node, slots, start, last) \ for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \ node; \ node = interval_tree_iter_next(node, start, last)) \ -static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, - const struct kvm_mmu_notifier_range *range) +static __always_inline kvm_mn_ret_t kvm_handle_hva_range(struct kvm *kvm, + const struct kvm_mmu_notifier_range *range) { struct kvm_mmu_notifier_return r = { .ret = false, @@ -611,6 +578,10 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, IS_KVM_NULL_FN(range->handler))) return r; + /* on_lock will never be called for lockless walks */ + if (WARN_ON_ONCE(range->lockless && !IS_KVM_NULL_FN(range->on_lock))) + return r; + idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { @@ -634,6 +605,11 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, */ gfn_range.arg = range->arg; gfn_range.may_block = range->may_block; + /* + * HVA-based notifications aren't relevant to private + * mappings as they don't have a userspace mapping. + */ + gfn_range.attr_filter = KVM_FILTER_SHARED; /* * {gfn(page) | page intersects with [hva_start, hva_end)} = @@ -642,15 +618,18 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, gfn_range.start = hva_to_gfn_memslot(hva_start, slot); gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot); gfn_range.slot = slot; + gfn_range.lockless = range->lockless; if (!r.found_memslot) { r.found_memslot = true; - KVM_MMU_LOCK(kvm); - if (!IS_KVM_NULL_FN(range->on_lock)) - range->on_lock(kvm); - - if (IS_KVM_NULL_FN(range->handler)) - break; + if (!range->lockless) { + KVM_MMU_LOCK(kvm); + if (!IS_KVM_NULL_FN(range->on_lock)) + range->on_lock(kvm); + + if (IS_KVM_NULL_FN(range->handler)) + goto mmu_unlock; + } } r.ret |= range->handler(kvm, &gfn_range); } @@ -659,7 +638,8 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, if (range->flush_on_ret && r.ret) kvm_flush_remote_tlbs(kvm); - if (r.found_memslot) +mmu_unlock: + if (r.found_memslot && !range->lockless) KVM_MMU_UNLOCK(kvm); srcu_read_unlock(&kvm->srcu, idx); @@ -667,84 +647,32 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, return r; } -static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, +static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn, unsigned long start, unsigned long end, - union kvm_mmu_notifier_arg arg, - gfn_handler_t handler) + gfn_handler_t handler, + bool flush_on_ret) { struct kvm *kvm = mmu_notifier_to_kvm(mn); const struct kvm_mmu_notifier_range range = { .start = start, .end = end, - .arg = arg, .handler = handler, .on_lock = (void *)kvm_null_fn, - .flush_on_ret = true, + .flush_on_ret = flush_on_ret, .may_block = false, + .lockless = IS_ENABLED(CONFIG_KVM_MMU_LOCKLESS_AGING), }; - return __kvm_handle_hva_range(kvm, &range).ret; + return kvm_handle_hva_range(kvm, 
&range).ret; } -static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn, - unsigned long start, - unsigned long end, - gfn_handler_t handler) +static __always_inline int kvm_age_hva_range_no_flush(struct mmu_notifier *mn, + unsigned long start, + unsigned long end, + gfn_handler_t handler) { - struct kvm *kvm = mmu_notifier_to_kvm(mn); - const struct kvm_mmu_notifier_range range = { - .start = start, - .end = end, - .handler = handler, - .on_lock = (void *)kvm_null_fn, - .flush_on_ret = false, - .may_block = false, - }; - - return __kvm_handle_hva_range(kvm, &range).ret; -} - -static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) -{ - /* - * Skipping invalid memslots is correct if and only change_pte() is - * surrounded by invalidate_range_{start,end}(), which is currently - * guaranteed by the primary MMU. If that ever changes, KVM needs to - * unmap the memslot instead of skipping the memslot to ensure that KVM - * doesn't hold references to the old PFN. - */ - WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count)); - - if (range->slot->flags & KVM_MEMSLOT_INVALID) - return false; - - return kvm_set_spte_gfn(kvm, range); -} - -static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address, - pte_t pte) -{ - struct kvm *kvm = mmu_notifier_to_kvm(mn); - const union kvm_mmu_notifier_arg arg = { .pte = pte }; - - trace_kvm_set_spte_hva(address); - - /* - * .change_pte() must be surrounded by .invalidate_range_{start,end}(). - * If mmu_invalidate_in_progress is zero, then no in-progress - * invalidations, including this one, found a relevant memslot at - * start(); rechecking memslots here is unnecessary. Note, a false - * positive (count elevated by a different invalidation) is sub-optimal - * but functionally ok. - */ - WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count)); - if (!READ_ONCE(kvm->mmu_invalidate_in_progress)) - return; - - kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn); + return kvm_age_hva_range(mn, start, end, handler, false); } void kvm_mmu_invalidate_begin(struct kvm *kvm) @@ -832,15 +760,14 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, * mn_active_invalidate_count (see above) instead of * mmu_invalidate_in_progress. */ - gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end, - hva_range.may_block); + gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end); /* * If one or more memslots were found and thus zapped, notify arch code * that guest memory has been reclaimed. This needs to be done *after* * dropping mmu_lock, as x86's reclaim path is slooooow. */ - if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot) + if (kvm_handle_hva_range(kvm, &hva_range).found_memslot) kvm_arch_guest_memory_reclaimed(kvm); return 0; @@ -886,7 +813,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, }; bool wake; - __kvm_handle_hva_range(kvm, &hva_range); + kvm_handle_hva_range(kvm, &hva_range); /* Pairs with the increment in range_start(). 
*/ spin_lock(&kvm->mn_invalidate_lock); @@ -910,8 +837,8 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, { trace_kvm_age_hva(start, end); - return kvm_handle_hva_range(mn, start, end, KVM_MMU_NOTIFIER_NO_ARG, - kvm_age_gfn); + return kvm_age_hva_range(mn, start, end, kvm_age_gfn, + !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG)); } static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, @@ -934,7 +861,7 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, * cadence. If we find this inaccurate, we might come up with a * more sophisticated heuristic later. */ - return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn); + return kvm_age_hva_range_no_flush(mn, start, end, kvm_age_gfn); } static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, @@ -943,8 +870,8 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, { trace_kvm_test_age_hva(address); - return kvm_handle_hva_range_no_flush(mn, address, address + 1, - kvm_test_age_gfn); + return kvm_age_hva_range_no_flush(mn, address, address + 1, + kvm_test_age_gfn); } static void kvm_mmu_notifier_release(struct mmu_notifier *mn, @@ -964,7 +891,6 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .clear_flush_young = kvm_mmu_notifier_clear_flush_young, .clear_young = kvm_mmu_notifier_clear_young, .test_young = kvm_mmu_notifier_test_young, - .change_pte = kvm_mmu_notifier_change_pte, .release = kvm_mmu_notifier_release, }; @@ -1020,7 +946,7 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) if (!memslot->dirty_bitmap) return; - kvfree(memslot->dirty_bitmap); + vfree(memslot->dirty_bitmap); memslot->dirty_bitmap = NULL; } @@ -1160,15 +1086,6 @@ out_err: } /* - * Called after the VM is otherwise initialized, but just before adding it to - * the vm_list. - */ -int __weak kvm_arch_post_init_vm(struct kvm *kvm) -{ - return 0; -} - -/* * Called just after removing the VM from the vm_list, but before doing any * other destruction. 
*/ @@ -1190,8 +1107,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) { struct kvm *kvm = kvm_arch_alloc_vm(); struct kvm_memslots *slots; - int r = -ENOMEM; - int i, j; + int r, i, j; if (!kvm) return ERR_PTR(-ENOMEM); @@ -1228,12 +1144,18 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d", task_pid_nr(current)); + r = -ENOMEM; if (init_srcu_struct(&kvm->srcu)) goto out_err_no_srcu; if (init_srcu_struct(&kvm->irq_srcu)) goto out_err_no_irq_srcu; + r = kvm_init_irq_routing(kvm); + if (r) + goto out_err_no_irq_routing; + refcount_set(&kvm->users_count, 1); + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { for (j = 0; j < 2; j++) { slots = &kvm->__memslots[i][j]; @@ -1251,6 +1173,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]); } + r = -ENOMEM; for (i = 0; i < KVM_NR_BUSES; i++) { rcu_assign_pointer(kvm->buses[i], kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT)); @@ -1262,7 +1185,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) if (r) goto out_err_no_arch_destroy_vm; - r = hardware_enable_all(); + r = kvm_enable_virtualization(); if (r) goto out_err_no_disable; @@ -1282,10 +1205,6 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) if (r) goto out_err_no_debugfs; - r = kvm_arch_post_init_vm(kvm); - if (r) - goto out_err; - mutex_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); mutex_unlock(&kvm_lock); @@ -1295,8 +1214,6 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) return kvm; -out_err: - kvm_destroy_vm_debugfs(kvm); out_err_no_debugfs: kvm_coalesced_mmio_free(kvm); out_no_coalesced_mmio: @@ -1305,13 +1222,15 @@ out_no_coalesced_mmio: mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); #endif out_err_no_mmu_notifier: - hardware_disable_all(); + kvm_disable_virtualization(); out_err_no_disable: kvm_arch_destroy_vm(kvm); out_err_no_arch_destroy_vm: WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm_get_bus(kvm, i)); + kvm_free_irq_routing(kvm); +out_err_no_irq_routing: cleanup_srcu_struct(&kvm->irq_srcu); out_err_no_irq_srcu: cleanup_srcu_struct(&kvm->srcu); @@ -1329,6 +1248,12 @@ static void kvm_destroy_devices(struct kvm *kvm) * We do not need to take the kvm->lock here, because nobody else * has a reference to the struct kvm at this point and therefore * cannot access the devices list anyhow. + * + * The device list is generally managed as an rculist, but list_del() + * is used intentionally here. If a bug in KVM introduced a reader that + * was not backed by a reference on the kvm struct, the hope is that + * it'd consume the poisoned forward pointer instead of suffering a + * use-after-free, even though this cannot be guaranteed. 
*/ list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) { list_del(&dev->vm_node); @@ -1344,7 +1269,6 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_destroy_pm_notifier(kvm); kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); kvm_destroy_vm_debugfs(kvm); - kvm_arch_sync_events(kvm); mutex_lock(&kvm_lock); list_del(&kvm->vm_list); mutex_unlock(&kvm_lock); @@ -1394,7 +1318,7 @@ static void kvm_destroy_vm(struct kvm *kvm) #endif kvm_arch_free_vm(kvm); preempt_notifier_dec(); - hardware_disable_all(); + kvm_disable_virtualization(); mmdrop(mm); } @@ -1444,6 +1368,65 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) return 0; } +int kvm_trylock_all_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i, j; + + lockdep_assert_held(&kvm->lock); + + kvm_for_each_vcpu(i, vcpu, kvm) + if (!mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock)) + goto out_unlock; + return 0; + +out_unlock: + kvm_for_each_vcpu(j, vcpu, kvm) { + if (i == j) + break; + mutex_unlock(&vcpu->mutex); + } + return -EINTR; +} +EXPORT_SYMBOL_GPL(kvm_trylock_all_vcpus); + +int kvm_lock_all_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i, j; + int r; + + lockdep_assert_held(&kvm->lock); + + kvm_for_each_vcpu(i, vcpu, kvm) { + r = mutex_lock_killable_nest_lock(&vcpu->mutex, &kvm->lock); + if (r) + goto out_unlock; + } + return 0; + +out_unlock: + kvm_for_each_vcpu(j, vcpu, kvm) { + if (i == j) + break; + mutex_unlock(&vcpu->mutex); + } + return r; +} +EXPORT_SYMBOL_GPL(kvm_lock_all_vcpus); + +void kvm_unlock_all_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held(&kvm->lock); + + kvm_for_each_vcpu(i, vcpu, kvm) + mutex_unlock(&vcpu->mutex); +} +EXPORT_SYMBOL_GPL(kvm_unlock_all_vcpus); + /* * Allocation size is twice as large as the actual dirty bitmap size. * See kvm_vm_ioctl_get_dirty_log() why this is needed. @@ -1612,15 +1595,14 @@ static int check_memory_region_flags(struct kvm *kvm, if (mem->flags & KVM_MEM_GUEST_MEMFD) valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES; -#ifdef CONFIG_HAVE_KVM_READONLY_MEM /* * GUEST_MEMFD is incompatible with read-only memslots, as writes to * read-only memslots have emulated MMIO, not page fault, semantics, * and KVM doesn't allow emulated MMIO for private memory. */ - if (!(mem->flags & KVM_MEM_GUEST_MEMFD)) + if (kvm_arch_has_readonly_mem(kvm) && + !(mem->flags & KVM_MEM_GUEST_MEMFD)) valid_flags |= KVM_MEM_READONLY; -#endif if (mem->flags & ~valid_flags) return -EINVAL; @@ -2007,16 +1989,8 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id, return false; } -/* - * Allocate some memory and give it an address in the guest physical address - * space. - * - * Discontiguous memory is allowed, mostly for framebuffers. - * - * Must be called holding kvm->slots_lock for write. 
- */ -int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region2 *mem) +static int kvm_set_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region2 *mem) { struct kvm_memory_slot *old, *new; struct kvm_memslots *slots; @@ -2026,6 +2000,8 @@ int __kvm_set_memory_region(struct kvm *kvm, int as_id, id; int r; + lockdep_assert_held(&kvm->slots_lock); + r = check_memory_region_flags(kvm, mem); if (r) return r; @@ -2053,7 +2029,15 @@ int __kvm_set_memory_region(struct kvm *kvm, return -EINVAL; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) return -EINVAL; - if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES) + + /* + * The size of userspace-defined memory regions is restricted in order + * to play nice with dirty bitmap operations, which are indexed with an + * "unsigned int". KVM's internal memory regions don't support dirty + * logging, and so are exempt. + */ + if (id < KVM_USER_MEM_SLOTS && + (mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES) return -EINVAL; slots = __kvm_memslots(kvm, as_id); @@ -2137,19 +2121,19 @@ out: kfree(new); return r; } -EXPORT_SYMBOL_GPL(__kvm_set_memory_region); -int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region2 *mem) +int kvm_set_internal_memslot(struct kvm *kvm, + const struct kvm_userspace_memory_region2 *mem) { - int r; + if (WARN_ON_ONCE(mem->slot < KVM_USER_MEM_SLOTS)) + return -EINVAL; - mutex_lock(&kvm->slots_lock); - r = __kvm_set_memory_region(kvm, mem); - mutex_unlock(&kvm->slots_lock); - return r; + if (WARN_ON_ONCE(mem->flags)) + return -EINVAL; + + return kvm_set_memory_region(kvm, mem); } -EXPORT_SYMBOL_GPL(kvm_set_memory_region); +EXPORT_SYMBOL_GPL(kvm_set_internal_memslot); static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region2 *mem) @@ -2157,6 +2141,7 @@ static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) return -EINVAL; + guard(mutex)(&kvm->slots_lock); return kvm_set_memory_region(kvm, mem); } @@ -2432,48 +2417,47 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static u64 kvm_supported_mem_attributes(struct kvm *kvm) +{ + if (!kvm || kvm_arch_has_private_mem(kvm)) + return KVM_MEMORY_ATTRIBUTE_PRIVATE; + + return 0; +} + /* * Returns true if _all_ gfns in the range [@start, @end) have attributes - * matching @attrs. + * such that the bits in @mask match @attrs. 
*/ bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, - unsigned long attrs) + unsigned long mask, unsigned long attrs) { XA_STATE(xas, &kvm->mem_attr_array, start); unsigned long index; - bool has_attrs; void *entry; - rcu_read_lock(); + mask &= kvm_supported_mem_attributes(kvm); + if (attrs & ~mask) + return false; - if (!attrs) { - has_attrs = !xas_find(&xas, end - 1); - goto out; - } + if (end == start + 1) + return (kvm_get_memory_attributes(kvm, start) & mask) == attrs; + + guard(rcu)(); + if (!attrs) + return !xas_find(&xas, end - 1); - has_attrs = true; for (index = start; index < end; index++) { do { entry = xas_next(&xas); } while (xas_retry(&xas, entry)); - if (xas.xa_index != index || xa_to_value(entry) != attrs) { - has_attrs = false; - break; - } + if (xas.xa_index != index || + (xa_to_value(entry) & mask) != attrs) + return false; } -out: - rcu_read_unlock(); - return has_attrs; -} - -static u64 kvm_supported_mem_attributes(struct kvm *kvm) -{ - if (!kvm || kvm_arch_has_private_mem(kvm)) - return KVM_MEMORY_ATTRIBUTE_PRIVATE; - - return 0; + return true; } static __always_inline void kvm_handle_gfn_range(struct kvm *kvm, @@ -2490,6 +2474,14 @@ static __always_inline void kvm_handle_gfn_range(struct kvm *kvm, gfn_range.arg = range->arg; gfn_range.may_block = range->may_block; + /* + * If/when KVM supports more attributes beyond private .vs shared, this + * _could_ set KVM_FILTER_{SHARED,PRIVATE} appropriately if the entire target + * range already has the desired private vs. shared state (it's unclear + * if that is a net win). For now, KVM reaches this point if and only + * if the private flag is being toggled, i.e. all mappings are in play. + */ + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); @@ -2546,6 +2538,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, struct kvm_mmu_notifier_range pre_set_range = { .start = start, .end = end, + .arg.attributes = attributes, .handler = kvm_pre_set_memory_attributes, .on_lock = kvm_mmu_invalidate_begin, .flush_on_ret = true, @@ -2568,7 +2561,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, mutex_lock(&kvm->slots_lock); /* Nothing to do if the entire range as the desired attributes. */ - if (kvm_range_has_memory_attributes(kvm, start, end, attributes)) + if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes)) goto out_unlock; /* @@ -2782,37 +2775,93 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w return gfn_to_hva_memslot_prot(slot, gfn, writable); } -static inline int check_user_page_hwpoison(unsigned long addr) +static bool kvm_is_ad_tracked_page(struct page *page) { - int rc, flags = FOLL_HWPOISON | FOLL_WRITE; + /* + * Per page-flags.h, pages tagged PG_reserved "should in general not be + * touched (e.g. set dirty) except by its owner". 
+ */ + return !PageReserved(page); +} - rc = get_user_pages(addr, 1, flags, NULL); - return rc == -EHWPOISON; +static void kvm_set_page_dirty(struct page *page) +{ + if (kvm_is_ad_tracked_page(page)) + SetPageDirty(page); +} + +static void kvm_set_page_accessed(struct page *page) +{ + if (kvm_is_ad_tracked_page(page)) + mark_page_accessed(page); +} + +void kvm_release_page_clean(struct page *page) +{ + if (!page) + return; + + kvm_set_page_accessed(page); + put_page(page); +} +EXPORT_SYMBOL_GPL(kvm_release_page_clean); + +void kvm_release_page_dirty(struct page *page) +{ + if (!page) + return; + + kvm_set_page_dirty(page); + kvm_release_page_clean(page); +} +EXPORT_SYMBOL_GPL(kvm_release_page_dirty); + +static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, + struct follow_pfnmap_args *map, bool writable) +{ + kvm_pfn_t pfn; + + WARN_ON_ONCE(!!page == !!map); + + if (kfp->map_writable) + *kfp->map_writable = writable; + + if (map) + pfn = map->pfn; + else + pfn = page_to_pfn(page); + + *kfp->refcounted_page = page; + + return pfn; } /* * The fast path to get the writable pfn which will be stored in @pfn, - * true indicates success, otherwise false is returned. It's also the - * only part that runs if we can in atomic context. + * true indicates success, otherwise false is returned. */ -static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, - bool *writable, kvm_pfn_t *pfn) +static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { - struct page *page[1]; + struct page *page; + bool r; /* - * Fast pin a writable pfn only if it is a write fault request - * or the caller allows to map a writable pfn for a read fault - * request. + * Try the fast-only path when the caller wants to pin/get the page for + * writing. If the caller only wants to read the page, KVM must go + * down the full, slow path in order to avoid racing an operation that + * breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing + * at the old, read-only page while mm/ points at a new, writable page. */ - if (!(write_fault || writable)) + if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable)) return false; - if (get_user_page_fast_only(addr, FOLL_WRITE, page)) { - *pfn = page_to_pfn(page[0]); + if (kfp->pin) + r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1; + else + r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page); - if (writable) - *writable = true; + if (r) { + *pfn = kvm_resolve_pfn(kfp, page, NULL, true); return true; } @@ -2823,8 +2872,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, * The slow path to get the pfn of the specified host virtual address, * 1 indicates success, -errno is returned if error is detected. */ -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, - bool interruptible, bool *writable, kvm_pfn_t *pfn) +static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { /* * When a VCPU accesses a page that is not mapped into the secondary @@ -2837,37 +2885,35 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, * Note that get_user_page_fast_only() and FOLL_WRITE for now * implicitly honor NUMA hinting faults and don't need this flag. 
*/ - unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT; - struct page *page; + unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT | kfp->flags; + struct page *page, *wpage; int npages; - might_sleep(); - - if (writable) - *writable = write_fault; - - if (write_fault) - flags |= FOLL_WRITE; - if (async) - flags |= FOLL_NOWAIT; - if (interruptible) - flags |= FOLL_INTERRUPTIBLE; - - npages = get_user_pages_unlocked(addr, 1, &page, flags); + if (kfp->pin) + npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags); + else + npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags); if (npages != 1) return npages; - /* map read fault as writable if possible */ - if (unlikely(!write_fault) && writable) { - struct page *wpage; + /* + * Pinning is mutually exclusive with opportunistically mapping a read + * fault as writable, as KVM should never pin pages when mapping memory + * into the guest (pinning is only for direct accesses from KVM). + */ + if (WARN_ON_ONCE(kfp->map_writable && kfp->pin)) + goto out; - if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) { - *writable = true; - put_page(page); - page = wpage; - } + /* map read fault as writable if possible */ + if (!(flags & FOLL_WRITE) && kfp->map_writable && + get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) { + put_page(page); + page = wpage; + flags |= FOLL_WRITE; } - *pfn = page_to_pfn(page); + +out: + *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE); return npages; } @@ -2882,34 +2928,29 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) return true; } -static int kvm_try_get_pfn(kvm_pfn_t pfn) -{ - struct page *page = kvm_pfn_to_refcounted_page(pfn); - - if (!page) - return 1; - - return get_page_unless_zero(page); -} - static int hva_to_pfn_remapped(struct vm_area_struct *vma, - unsigned long addr, bool write_fault, - bool *writable, kvm_pfn_t *p_pfn) + struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn) { - kvm_pfn_t pfn; - pte_t *ptep; - pte_t pte; - spinlock_t *ptl; + struct follow_pfnmap_args args = { .vma = vma, .address = kfp->hva }; + bool write_fault = kfp->flags & FOLL_WRITE; int r; - r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); + /* + * Remapped memory cannot be pinned in any meaningful sense. Bail if + * the caller wants to pin the page, i.e. access the page outside of + * MMU notifier protection, and unsafe umappings are disallowed. + */ + if (kfp->pin && !allow_unsafe_mappings) + return -EINVAL; + + r = follow_pfnmap_start(&args); if (r) { /* * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does * not call the fault handler, so do it here. */ bool unlocked = false; - r = fixup_user_fault(current->mm, addr, + r = fixup_user_fault(current->mm, kfp->hva, (write_fault ? FAULT_FLAG_WRITE : 0), &unlocked); if (unlocked) @@ -2917,189 +2958,110 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, if (r) return r; - r = follow_pte(vma->vm_mm, addr, &ptep, &ptl); + r = follow_pfnmap_start(&args); if (r) return r; } - pte = ptep_get(ptep); - - if (write_fault && !pte_write(pte)) { - pfn = KVM_PFN_ERR_RO_FAULT; + if (write_fault && !args.writable) { + *p_pfn = KVM_PFN_ERR_RO_FAULT; goto out; } - if (writable) - *writable = pte_write(pte); - pfn = pte_pfn(pte); - - /* - * Get a reference here because callers of *hva_to_pfn* and - * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the - * returned pfn. This is only needed if the VMA has VM_MIXEDMAP - * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will - * simply do nothing for reserved pfns. 
- * - * Whoever called remap_pfn_range is also going to call e.g. - * unmap_mapping_range before the underlying pages are freed, - * causing a call to our MMU notifier. - * - * Certain IO or PFNMAP mappings can be backed with valid - * struct pages, but be allocated without refcounting e.g., - * tail pages of non-compound higher order allocations, which - * would then underflow the refcount when the caller does the - * required put_page. Don't allow those pages here. - */ - if (!kvm_try_get_pfn(pfn)) - r = -EFAULT; - + *p_pfn = kvm_resolve_pfn(kfp, NULL, &args, args.writable); out: - pte_unmap_unlock(ptep, ptl); - *p_pfn = pfn; - + follow_pfnmap_end(&args); return r; } -/* - * Pin guest page in memory and return its pfn. - * @addr: host virtual address which maps memory to the guest - * @atomic: whether this function can sleep - * @interruptible: whether the process can be interrupted by non-fatal signals - * @async: whether this function need to wait IO complete if the - * host page is not in the memory - * @write_fault: whether we should get a writable host page - * @writable: whether it allows to map a writable host page for !@write_fault - * - * The function will map a writable host page for these two cases: - * 1): @write_fault = true - * 2): @write_fault = false && @writable, @writable will tell the caller - * whether the mapping is writable. - */ -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, - bool *async, bool write_fault, bool *writable) +kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp) { struct vm_area_struct *vma; kvm_pfn_t pfn; int npages, r; - /* we can do it either atomically or asynchronously, not both */ - BUG_ON(atomic && async); - - if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) - return pfn; + might_sleep(); - if (atomic) + if (WARN_ON_ONCE(!kfp->refcounted_page)) return KVM_PFN_ERR_FAULT; - npages = hva_to_pfn_slow(addr, async, write_fault, interruptible, - writable, &pfn); + if (hva_to_pfn_fast(kfp, &pfn)) + return pfn; + + npages = hva_to_pfn_slow(kfp, &pfn); if (npages == 1) return pfn; - if (npages == -EINTR) + if (npages == -EINTR || npages == -EAGAIN) return KVM_PFN_ERR_SIGPENDING; + if (npages == -EHWPOISON) + return KVM_PFN_ERR_HWPOISON; mmap_read_lock(current->mm); - if (npages == -EHWPOISON || - (!async && check_user_page_hwpoison(addr))) { - pfn = KVM_PFN_ERR_HWPOISON; - goto exit; - } - retry: - vma = vma_lookup(current->mm, addr); + vma = vma_lookup(current->mm, kfp->hva); if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { - r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn); + r = hva_to_pfn_remapped(vma, kfp, &pfn); if (r == -EAGAIN) goto retry; if (r < 0) pfn = KVM_PFN_ERR_FAULT; } else { - if (async && vma_is_valid(vma, write_fault)) - *async = true; - pfn = KVM_PFN_ERR_FAULT; + if ((kfp->flags & FOLL_NOWAIT) && + vma_is_valid(vma, kfp->flags & FOLL_WRITE)) + pfn = KVM_PFN_ERR_NEEDS_IO; + else + pfn = KVM_PFN_ERR_FAULT; } -exit: mmap_read_unlock(current->mm); return pfn; } -kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool atomic, bool interruptible, bool *async, - bool write_fault, bool *writable, hva_t *hva) +static kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp) { - unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); + kfp->hva = __gfn_to_hva_many(kfp->slot, kfp->gfn, NULL, + kfp->flags & FOLL_WRITE); - if (hva) - *hva = addr; - - if (addr == KVM_HVA_ERR_RO_BAD) { - if (writable) - *writable = false; + if 
(kfp->hva == KVM_HVA_ERR_RO_BAD) return KVM_PFN_ERR_RO_FAULT; - } - if (kvm_is_error_hva(addr)) { - if (writable) - *writable = false; + if (kvm_is_error_hva(kfp->hva)) return KVM_PFN_NOSLOT; - } - /* Do not map writable pfn in the readonly memslot. */ - if (writable && memslot_is_readonly(slot)) { - *writable = false; - writable = NULL; + if (memslot_is_readonly(kfp->slot) && kfp->map_writable) { + *kfp->map_writable = false; + kfp->map_writable = NULL; } - return hva_to_pfn(addr, atomic, interruptible, async, write_fault, - writable); -} -EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); - -kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, - bool *writable) -{ - return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false, - NULL, write_fault, writable, NULL); + return hva_to_pfn(kfp); } -EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); -kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) +kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, + unsigned int foll, bool *writable, + struct page **refcounted_page) { - return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true, - NULL, NULL); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); - -kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn) -{ - return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true, - NULL, NULL); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); + struct kvm_follow_pfn kfp = { + .slot = slot, + .gfn = gfn, + .flags = foll, + .map_writable = writable, + .refcounted_page = refcounted_page, + }; -kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); -} -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); + if (WARN_ON_ONCE(!writable || !refcounted_page)) + return KVM_PFN_ERR_FAULT; -kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) -{ - return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn); + *writable = false; + *refcounted_page = NULL; -kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); + return kvm_follow_pfn(&kfp); } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); +EXPORT_SYMBOL_GPL(__kvm_faultin_pfn); -int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, - struct page **pages, int nr_pages) +int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, + struct page **pages, int nr_pages) { unsigned long addr; gfn_t entry = 0; @@ -3113,193 +3075,92 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } -EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); +EXPORT_SYMBOL_GPL(kvm_prefetch_pages); /* - * Do not use this helper unless you are absolutely certain the gfn _must_ be - * backed by 'struct page'. A valid example is if the backing memslot is - * controlled by KVM. Note, if the returned page is valid, it's refcount has - * been elevated by gfn_to_pfn(). + * Don't use this API unless you are absolutely, positively certain that KVM + * needs to get a struct page, e.g. to pin the page for firmware DMA. + * + * FIXME: Users of this API likely need to FOLL_PIN the page, not just elevate + * its refcount. 
*/ -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) -{ - struct page *page; - kvm_pfn_t pfn; - - pfn = gfn_to_pfn(kvm, gfn); - - if (is_error_noslot_pfn(pfn)) - return KVM_ERR_PTR_BAD_PAGE; - - page = kvm_pfn_to_refcounted_page(pfn); - if (!page) - return KVM_ERR_PTR_BAD_PAGE; - - return page; -} -EXPORT_SYMBOL_GPL(gfn_to_page); +struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write) +{ + struct page *refcounted_page = NULL; + struct kvm_follow_pfn kfp = { + .slot = gfn_to_memslot(kvm, gfn), + .gfn = gfn, + .flags = write ? FOLL_WRITE : 0, + .refcounted_page = &refcounted_page, + }; -void kvm_release_pfn(kvm_pfn_t pfn, bool dirty) -{ - if (dirty) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); + (void)kvm_follow_pfn(&kfp); + return refcounted_page; } +EXPORT_SYMBOL_GPL(__gfn_to_page); -int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) +int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, + bool writable) { - kvm_pfn_t pfn; - void *hva = NULL; - struct page *page = KVM_UNMAPPED_PAGE; + struct kvm_follow_pfn kfp = { + .slot = gfn_to_memslot(vcpu->kvm, gfn), + .gfn = gfn, + .flags = writable ? FOLL_WRITE : 0, + .refcounted_page = &map->pinned_page, + .pin = true, + }; - if (!map) - return -EINVAL; + map->pinned_page = NULL; + map->page = NULL; + map->hva = NULL; + map->gfn = gfn; + map->writable = writable; - pfn = gfn_to_pfn(vcpu->kvm, gfn); - if (is_error_noslot_pfn(pfn)) + map->pfn = kvm_follow_pfn(&kfp); + if (is_error_noslot_pfn(map->pfn)) return -EINVAL; - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - hva = kmap(page); + if (pfn_valid(map->pfn)) { + map->page = pfn_to_page(map->pfn); + map->hva = kmap(map->page); #ifdef CONFIG_HAS_IOMEM } else { - hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); + map->hva = memremap(pfn_to_hpa(map->pfn), PAGE_SIZE, MEMREMAP_WB); #endif } - if (!hva) - return -EFAULT; - - map->page = page; - map->hva = hva; - map->pfn = pfn; - map->gfn = gfn; - - return 0; + return map->hva ? 0 : -EFAULT; } -EXPORT_SYMBOL_GPL(kvm_vcpu_map); +EXPORT_SYMBOL_GPL(__kvm_vcpu_map); -void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) +void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) { - if (!map) - return; - if (!map->hva) return; - if (map->page != KVM_UNMAPPED_PAGE) + if (map->page) kunmap(map->page); #ifdef CONFIG_HAS_IOMEM else memunmap(map->hva); #endif - if (dirty) + if (map->writable) kvm_vcpu_mark_page_dirty(vcpu, map->gfn); - kvm_release_pfn(map->pfn, dirty); + if (map->pinned_page) { + if (map->writable) + kvm_set_page_dirty(map->pinned_page); + kvm_set_page_accessed(map->pinned_page); + unpin_user_page(map->pinned_page); + } map->hva = NULL; map->page = NULL; + map->pinned_page = NULL; } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); -static bool kvm_is_ad_tracked_page(struct page *page) -{ - /* - * Per page-flags.h, pages tagged PG_reserved "should in general not be - * touched (e.g. set dirty) except by its owner". 
- */ - return !PageReserved(page); -} - -static void kvm_set_page_dirty(struct page *page) -{ - if (kvm_is_ad_tracked_page(page)) - SetPageDirty(page); -} - -static void kvm_set_page_accessed(struct page *page) -{ - if (kvm_is_ad_tracked_page(page)) - mark_page_accessed(page); -} - -void kvm_release_page_clean(struct page *page) -{ - WARN_ON(is_error_page(page)); - - kvm_set_page_accessed(page); - put_page(page); -} -EXPORT_SYMBOL_GPL(kvm_release_page_clean); - -void kvm_release_pfn_clean(kvm_pfn_t pfn) -{ - struct page *page; - - if (is_error_noslot_pfn(pfn)) - return; - - page = kvm_pfn_to_refcounted_page(pfn); - if (!page) - return; - - kvm_release_page_clean(page); -} -EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); - -void kvm_release_page_dirty(struct page *page) -{ - WARN_ON(is_error_page(page)); - - kvm_set_page_dirty(page); - kvm_release_page_clean(page); -} -EXPORT_SYMBOL_GPL(kvm_release_page_dirty); - -void kvm_release_pfn_dirty(kvm_pfn_t pfn) -{ - struct page *page; - - if (is_error_noslot_pfn(pfn)) - return; - - page = kvm_pfn_to_refcounted_page(pfn); - if (!page) - return; - - kvm_release_page_dirty(page); -} -EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); - -/* - * Note, checking for an error/noslot pfn is the caller's responsibility when - * directly marking a page dirty/accessed. Unlike the "release" helpers, the - * "set" helpers are not to be used when the pfn might point at garbage. - */ -void kvm_set_pfn_dirty(kvm_pfn_t pfn) -{ - if (WARN_ON(is_error_noslot_pfn(pfn))) - return; - - if (pfn_valid(pfn)) - kvm_set_page_dirty(pfn_to_page(pfn)); -} -EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); - -void kvm_set_pfn_accessed(kvm_pfn_t pfn) -{ - if (WARN_ON(is_error_noslot_pfn(pfn))) - return; - - if (pfn_valid(pfn)) - kvm_set_page_accessed(pfn_to_page(pfn)); -} -EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); - static int next_segment(unsigned long len, int offset) { if (len > PAGE_SIZE - offset) @@ -3308,12 +3169,16 @@ static int next_segment(unsigned long len, int offset) return len; } +/* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */ static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, void *data, int offset, int len) { int r; unsigned long addr; + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) + return -EFAULT; + addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; @@ -3387,6 +3252,9 @@ static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, int r; unsigned long addr; + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) + return -EFAULT; + addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); if (kvm_is_error_hva(addr)) return -EFAULT; @@ -3409,6 +3277,7 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, } EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); +/* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */ static int __kvm_write_guest_page(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn, const void *data, int offset, int len) @@ -3416,6 +3285,9 @@ static int __kvm_write_guest_page(struct kvm *kvm, int r; unsigned long addr; + if (WARN_ON_ONCE(offset + len > PAGE_SIZE)) + return -EFAULT; + addr = gfn_to_hva_memslot(memslot, gfn); if (kvm_is_error_hva(addr)) return -EFAULT; @@ -3619,7 +3491,7 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) int ret; while ((seg = next_segment(len, offset)) != 0) { - ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len); + ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, 
seg); if (ret < 0) return ret; offset = 0; @@ -3926,7 +3798,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); /* * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. */ -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait) { int me, cpu; @@ -3955,28 +3827,41 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) */ if (kvm_arch_vcpu_should_kick(vcpu)) { cpu = READ_ONCE(vcpu->cpu); - if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - smp_send_reschedule(cpu); + if (cpu != me && (unsigned int)cpu < nr_cpu_ids && cpu_online(cpu)) { + /* + * Use a reschedule IPI to kick the vCPU if the caller + * doesn't need to wait for a response, as KVM allows + * kicking vCPUs while IRQs are disabled, but using the + * SMP function call framework with IRQs disabled can + * deadlock due to taking cross-CPU locks. + */ + if (wait) + smp_call_function_single(cpu, ack_kick, NULL, wait); + else + smp_send_reschedule(cpu); + } } out: put_cpu(); } -EXPORT_SYMBOL_GPL(kvm_vcpu_kick); +EXPORT_SYMBOL_GPL(__kvm_vcpu_kick); #endif /* !CONFIG_S390 */ int kvm_vcpu_yield_to(struct kvm_vcpu *target) { - struct pid *pid; struct task_struct *task = NULL; - int ret = 0; + int ret; + + if (!read_trylock(&target->pid_lock)) + return 0; + + if (target->pid) + task = get_pid_task(target->pid, PIDTYPE_PID); + + read_unlock(&target->pid_lock); - rcu_read_lock(); - pid = rcu_dereference(target->pid); - if (pid) - task = get_pid_task(pid, PIDTYPE_PID); - rcu_read_unlock(); if (!task) - return ret; + return 0; ret = yield_to(task, 1); put_task_struct(task); @@ -4065,58 +3950,71 @@ bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { + int nr_vcpus, start, i, idx, yielded; struct kvm *kvm = me->kvm; struct kvm_vcpu *vcpu; - int last_boosted_vcpu = me->kvm->last_boosted_vcpu; - unsigned long i; - int yielded = 0; int try = 3; - int pass; + + nr_vcpus = atomic_read(&kvm->online_vcpus); + if (nr_vcpus < 2) + return; + + /* Pairs with the smp_wmb() in kvm_vm_ioctl_create_vcpu(). */ + smp_rmb(); kvm_vcpu_set_in_spin_loop(me, true); + /* - * We boost the priority of a VCPU that is runnable but not - * currently running, because it got preempted by something - * else and called schedule in __vcpu_run. Hopefully that - * VCPU is holding the lock that we need and will release it. - * We approximate round-robin by starting at the last boosted VCPU. + * The current vCPU ("me") is spinning in kernel mode, i.e. is likely + * waiting for a resource to become available. Attempt to yield to a + * vCPU that is runnable, but not currently running, e.g. because the + * vCPU was preempted by a higher priority task. With luck, the vCPU + * that was preempted is holding a lock or some other resource that the + * current vCPU is waiting to acquire, and yielding to the other vCPU + * will allow it to make forward progress and release the lock (or kick + * the spinning vCPU, etc). + * + * Since KVM has no insight into what exactly the guest is doing, + * approximate a round-robin selection by iterating over all vCPUs, + * starting at the last boosted vCPU. I.e. if N=kvm->last_boosted_vcpu, + * iterate over vCPU[N+1]..vCPU[N-1], wrapping as needed. + * + * Note, this is inherently racy, e.g. if multiple vCPUs are spinning, + * they may all try to yield to the same vCPU(s). But as above, this + * is all best effort due to KVM's lack of visibility into the guest. 
*/ - for (pass = 0; pass < 2 && !yielded && try; pass++) { - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!pass && i <= last_boosted_vcpu) { - i = last_boosted_vcpu; - continue; - } else if (pass && i > last_boosted_vcpu) - break; - if (!READ_ONCE(vcpu->ready)) - continue; - if (vcpu == me) - continue; - if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu)) - continue; + start = READ_ONCE(kvm->last_boosted_vcpu) + 1; + for (i = 0; i < nr_vcpus; i++) { + idx = (start + i) % nr_vcpus; + if (idx == me->vcpu_idx) + continue; - /* - * Treat the target vCPU as being in-kernel if it has a - * pending interrupt, as the vCPU trying to yield may - * be spinning waiting on IPI delivery, i.e. the target - * vCPU is in-kernel for the purposes of directed yield. - */ - if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && - !kvm_arch_dy_has_pending_interrupt(vcpu) && - !kvm_arch_vcpu_preempted_in_kernel(vcpu)) - continue; - if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) - continue; + vcpu = xa_load(&kvm->vcpu_array, idx); + if (!READ_ONCE(vcpu->ready)) + continue; + if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu)) + continue; - yielded = kvm_vcpu_yield_to(vcpu); - if (yielded > 0) { - kvm->last_boosted_vcpu = i; - break; - } else if (yielded < 0) { - try--; - if (!try) - break; - } + /* + * Treat the target vCPU as being in-kernel if it has a pending + * interrupt, as the vCPU trying to yield may be spinning + * waiting on IPI delivery, i.e. the target vCPU is in-kernel + * for the purposes of directed yield. + */ + if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && + !kvm_arch_dy_has_pending_interrupt(vcpu) && + !kvm_arch_vcpu_preempted_in_kernel(vcpu)) + continue; + + if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) + continue; + + yielded = kvm_vcpu_yield_to(vcpu); + if (yielded > 0) { + WRITE_ONCE(kvm->last_boosted_vcpu, i); + break; + } else if (yielded < 0 && !--try) { + break; } } kvm_vcpu_set_in_spin_loop(me, false); @@ -4213,9 +4111,9 @@ static int vcpu_get_pid(void *data, u64 *val) { struct kvm_vcpu *vcpu = data; - rcu_read_lock(); - *val = pid_nr(rcu_dereference(vcpu->pid)); - rcu_read_unlock(); + read_lock(&vcpu->pid_lock); + *val = pid_nr(vcpu->pid); + read_unlock(&vcpu->pid_lock); return 0; } @@ -4242,12 +4140,21 @@ static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) /* * Creates some virtual cpus. Good luck creating more than one. */ -static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) +static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id) { int r; struct kvm_vcpu *vcpu; struct page *page; + /* + * KVM tracks vCPU IDs as 'int', be kind to userspace and reject + * too-large values instead of silently truncating. + * + * Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first + * changing the storage type (at the very least, IDs should be tracked + * as unsigned ints). 
+ */ + BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX); if (id >= KVM_MAX_VCPU_IDS) return -EINVAL; @@ -4287,7 +4194,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto vcpu_free_run_page; if (kvm->dirty_ring_size) { - r = kvm_dirty_ring_alloc(&vcpu->dirty_ring, + r = kvm_dirty_ring_alloc(kvm, &vcpu->dirty_ring, id, kvm->dirty_ring_size); if (r) goto arch_vcpu_destroy; @@ -4295,32 +4202,30 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) mutex_lock(&kvm->lock); -#ifdef CONFIG_LOCKDEP - /* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */ - mutex_lock(&vcpu->mutex); - mutex_unlock(&vcpu->mutex); -#endif - if (kvm_get_vcpu_by_id(kvm, id)) { r = -EEXIST; goto unlock_vcpu_destroy; } vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); - r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT); + r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT); + WARN_ON_ONCE(r == -EBUSY); if (r) goto unlock_vcpu_destroy; - /* Now it's all set up, let userspace reach it */ + /* + * Now it's all set up, let userspace reach it. Grab the vCPU's mutex + * so that userspace can't invoke vCPU ioctl()s until the vCPU is fully + * visible (per online_vcpus), e.g. so that KVM doesn't get tricked + * into a NULL-pointer dereference because KVM thinks the _current_ + * vCPU doesn't exist. As a bonus, taking vcpu->mutex ensures lockdep + * knows it's taken *inside* kvm->lock. + */ + mutex_lock(&vcpu->mutex); kvm_get_kvm(kvm); r = create_vcpu_fd(vcpu); if (r < 0) - goto kvm_put_xa_release; - - if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) { - r = -EINVAL; - goto kvm_put_xa_release; - } + goto kvm_put_xa_erase; /* * Pairs with smp_rmb() in kvm_get_vcpu. Store the vcpu @@ -4328,15 +4233,17 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) */ smp_wmb(); atomic_inc(&kvm->online_vcpus); + mutex_unlock(&vcpu->mutex); mutex_unlock(&kvm->lock); kvm_arch_vcpu_postcreate(vcpu); kvm_create_vcpu_debugfs(vcpu); return r; -kvm_put_xa_release: +kvm_put_xa_erase: + mutex_unlock(&vcpu->mutex); kvm_put_kvm_no_destroy(kvm); - xa_release(&kvm->vcpu_array, vcpu->vcpu_idx); + xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx); unlock_vcpu_destroy: mutex_unlock(&kvm->lock); kvm_dirty_ring_free(&vcpu->dirty_ring); @@ -4401,20 +4308,92 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) if (fd < 0) return fd; - file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY); + file = anon_inode_getfile_fmode(name, &kvm_vcpu_stats_fops, vcpu, + O_RDONLY, FMODE_PREAD); if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } kvm_get_kvm(vcpu->kvm); - - file->f_mode |= FMODE_PREAD; fd_install(fd, file); return fd; } +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY +static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range) +{ + int idx; + long r; + u64 full_size; + + if (range->flags) + return -EINVAL; + + if (!PAGE_ALIGNED(range->gpa) || + !PAGE_ALIGNED(range->size) || + range->gpa + range->size <= range->gpa) + return -EINVAL; + + vcpu_load(vcpu); + idx = srcu_read_lock(&vcpu->kvm->srcu); + + full_size = range->size; + do { + if (signal_pending(current)) { + r = -EINTR; + break; + } + + r = kvm_arch_vcpu_pre_fault_memory(vcpu, range); + if (WARN_ON_ONCE(r == 0 || r == -EIO)) + break; + + if (r < 0) + break; + + range->size -= r; + range->gpa += r; + cond_resched(); + } while (range->size); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + vcpu_put(vcpu); + + /* Return success if at 
least one page was mapped successfully. */ + return full_size == range->size ? r : 0; +} +#endif + +static int kvm_wait_for_vcpu_online(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + /* + * In practice, this happy path will always be taken, as a well-behaved + * VMM will never invoke a vCPU ioctl() before KVM_CREATE_VCPU returns. + */ + if (likely(vcpu->vcpu_idx < atomic_read(&kvm->online_vcpus))) + return 0; + + /* + * Acquire and release the vCPU's mutex to wait for vCPU creation to + * complete (kvm_vm_ioctl_create_vcpu() holds the mutex until the vCPU + * is fully online). + */ + if (mutex_lock_killable(&vcpu->mutex)) + return -EINTR; + + mutex_unlock(&vcpu->mutex); + + if (WARN_ON_ONCE(!kvm_get_vcpu(kvm, vcpu->vcpu_idx))) + return -EIO; + + return 0; +} + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4431,6 +4410,15 @@ static long kvm_vcpu_ioctl(struct file *filp, return -EINVAL; /* + * Wait for the vCPU to be online before handling the ioctl(), as KVM + * assumes the vCPU is reachable via vcpu_array, i.e. may dereference + * a NULL pointer if userspace invokes an ioctl() before KVM is ready. + */ + r = kvm_wait_for_vcpu_online(vcpu); + if (r) + return r; + + /* * Some architectures have vcpu ioctls that are asynchronous to vcpu * execution; mutex_lock() would break them. */ @@ -4446,7 +4434,14 @@ static long kvm_vcpu_ioctl(struct file *filp, r = -EINVAL; if (arg) goto out; - oldpid = rcu_access_pointer(vcpu->pid); + + /* + * Note, vcpu->pid is primarily protected by vcpu->mutex. The + * dedicated r/w lock allows other tasks, e.g. other vCPUs, to + * read vcpu->pid while this vCPU is in KVM_RUN, e.g. to yield + * directly to this vCPU + */ + oldpid = vcpu->pid; if (unlikely(oldpid != task_pid(current))) { /* The thread running this VCPU changed. */ struct pid *newpid; @@ -4456,12 +4451,16 @@ static long kvm_vcpu_ioctl(struct file *filp, break; newpid = get_task_pid(current, PIDTYPE_PID); - rcu_assign_pointer(vcpu->pid, newpid); - if (oldpid) - synchronize_rcu(); + write_lock(&vcpu->pid_lock); + vcpu->pid = newpid; + write_unlock(&vcpu->pid_lock); + put_pid(oldpid); } + vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe); r = kvm_arch_vcpu_ioctl_run(vcpu); + vcpu->wants_to_run = false; + trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; } @@ -4469,7 +4468,7 @@ static long kvm_vcpu_ioctl(struct file *filp, struct kvm_regs *kvm_regs; r = -ENOMEM; - kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT); + kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); if (!kvm_regs) goto out; r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); @@ -4496,8 +4495,7 @@ out_free1: break; } case KVM_GET_SREGS: { - kvm_sregs = kzalloc(sizeof(struct kvm_sregs), - GFP_KERNEL_ACCOUNT); + kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); r = -ENOMEM; if (!kvm_sregs) goto out; @@ -4589,7 +4587,7 @@ out_free1: break; } case KVM_GET_FPU: { - fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT); + fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); r = -ENOMEM; if (!fpu) goto out; @@ -4616,6 +4614,20 @@ out_free1: r = kvm_vcpu_ioctl_get_stats_fd(vcpu); break; } +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY + case KVM_PRE_FAULT_MEMORY: { + struct kvm_pre_fault_memory range; + + r = -EFAULT; + if (copy_from_user(&range, argp, sizeof(range))) + break; + r = kvm_vcpu_pre_fault_memory(vcpu, &range); + /* Pass back leftover range. 
*/ + if (copy_to_user(argp, &range, sizeof(range))) + r = -EFAULT; + break; + } +#endif default: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); } @@ -4725,7 +4737,8 @@ static int kvm_device_release(struct inode *inode, struct file *filp) if (dev->ops->release) { mutex_lock(&kvm->lock); - list_del(&dev->vm_node); + list_del_rcu(&dev->vm_node); + synchronize_rcu(); dev->ops->release(dev); mutex_unlock(&kvm->lock); } @@ -4808,7 +4821,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm, kfree(dev); return ret; } - list_add(&dev->vm_node, &kvm->devices); + list_add_rcu(&dev->vm_node, &kvm->devices); mutex_unlock(&kvm->lock); if (ops->init) @@ -4819,7 +4832,8 @@ static int kvm_ioctl_create_device(struct kvm *kvm, if (ret < 0) { kvm_put_kvm_no_destroy(kvm); mutex_lock(&kvm->lock); - list_del(&dev->vm_node); + list_del_rcu(&dev->vm_node); + synchronize_rcu(); if (ops->release) ops->release(dev); mutex_unlock(&kvm->lock); @@ -4918,7 +4932,7 @@ static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size) return -EINVAL; /* Should be bigger to keep the reserved entries, or a page */ - if (size < kvm_dirty_ring_get_rsvd_entries() * + if (size < kvm_dirty_ring_get_rsvd_entries(kvm) * sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE) return -EINVAL; @@ -5089,16 +5103,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) if (fd < 0) return fd; - file = anon_inode_getfile("kvm-vm-stats", - &kvm_vm_stats_fops, kvm, O_RDONLY); + file = anon_inode_getfile_fmode("kvm-vm-stats", + &kvm_vm_stats_fops, kvm, O_RDONLY, FMODE_PREAD); if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } kvm_get_kvm(kvm); - - file->f_mode |= FMODE_PREAD; fd_install(fd, file); return fd; @@ -5535,137 +5547,68 @@ static struct miscdevice kvm_dev = { }; #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +bool enable_virt_at_load = true; +module_param(enable_virt_at_load, bool, 0444); +EXPORT_SYMBOL_GPL(enable_virt_at_load); + __visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); -static DEFINE_PER_CPU(bool, hardware_enabled); +static DEFINE_PER_CPU(bool, virtualization_enabled); +static DEFINE_MUTEX(kvm_usage_lock); static int kvm_usage_count; -static int __hardware_enable_nolock(void) +__weak void kvm_arch_enable_virtualization(void) { - if (__this_cpu_read(hardware_enabled)) + +} + +__weak void kvm_arch_disable_virtualization(void) +{ + +} + +static int kvm_enable_virtualization_cpu(void) +{ + if (__this_cpu_read(virtualization_enabled)) return 0; - if (kvm_arch_hardware_enable()) { + if (kvm_arch_enable_virtualization_cpu()) { pr_info("kvm: enabling virtualization on CPU%d failed\n", raw_smp_processor_id()); return -EIO; } - __this_cpu_write(hardware_enabled, true); + __this_cpu_write(virtualization_enabled, true); return 0; } -static void hardware_enable_nolock(void *failed) -{ - if (__hardware_enable_nolock()) - atomic_inc(failed); -} - static int kvm_online_cpu(unsigned int cpu) { - int ret = 0; - /* * Abort the CPU online process if hardware virtualization cannot * be enabled. Otherwise running VMs would encounter unrecoverable * errors when scheduled to this CPU. */ - mutex_lock(&kvm_lock); - if (kvm_usage_count) - ret = __hardware_enable_nolock(); - mutex_unlock(&kvm_lock); - return ret; + return kvm_enable_virtualization_cpu(); } -static void hardware_disable_nolock(void *junk) +static void kvm_disable_virtualization_cpu(void *ign) { - /* - * Note, hardware_disable_all_nolock() tells all online CPUs to disable - * hardware, not just CPUs that successfully enabled hardware! 
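Illustrative sketch, not part of the patch: the shape of the per-CPU arch hooks invoked by the renamed helpers above, as an architecture might implement them. The hook names match the patch; the my_cpu_virt_*() helpers are hypothetical stand-ins for VMXON/SVM-style enable and disable, and the code is assumed to live in arch code that already includes <linux/kvm_host.h>.

static bool my_cpu_virt_on(void)  { return true; }	/* pretend success */
static void my_cpu_virt_off(void) { }

int kvm_arch_enable_virtualization_cpu(void)
{
	/* A failure here vetoes CPU onlining via kvm_online_cpu(). */
	return my_cpu_virt_on() ? 0 : -EIO;
}

void kvm_arch_disable_virtualization_cpu(void)
{
	my_cpu_virt_off();
}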
- */ - if (!__this_cpu_read(hardware_enabled)) + if (!__this_cpu_read(virtualization_enabled)) return; - kvm_arch_hardware_disable(); + kvm_arch_disable_virtualization_cpu(); - __this_cpu_write(hardware_enabled, false); + __this_cpu_write(virtualization_enabled, false); } static int kvm_offline_cpu(unsigned int cpu) { - mutex_lock(&kvm_lock); - if (kvm_usage_count) - hardware_disable_nolock(NULL); - mutex_unlock(&kvm_lock); + kvm_disable_virtualization_cpu(NULL); return 0; } -static void hardware_disable_all_nolock(void) -{ - BUG_ON(!kvm_usage_count); - - kvm_usage_count--; - if (!kvm_usage_count) - on_each_cpu(hardware_disable_nolock, NULL, 1); -} - -static void hardware_disable_all(void) -{ - cpus_read_lock(); - mutex_lock(&kvm_lock); - hardware_disable_all_nolock(); - mutex_unlock(&kvm_lock); - cpus_read_unlock(); -} - -static int hardware_enable_all(void) -{ - atomic_t failed = ATOMIC_INIT(0); - int r; - - /* - * Do not enable hardware virtualization if the system is going down. - * If userspace initiated a forced reboot, e.g. reboot -f, then it's - * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling - * after kvm_reboot() is called. Note, this relies on system_state - * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops - * hook instead of registering a dedicated reboot notifier (the latter - * runs before system_state is updated). - */ - if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || - system_state == SYSTEM_RESTART) - return -EBUSY; - - /* - * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu() - * is called, and so on_each_cpu() between them includes the CPU that - * is being onlined. As a result, hardware_enable_nolock() may get - * invoked before kvm_online_cpu(), which also enables hardware if the - * usage count is non-zero. Disable CPU hotplug to avoid attempting to - * enable hardware multiple times. - */ - cpus_read_lock(); - mutex_lock(&kvm_lock); - - r = 0; - - kvm_usage_count++; - if (kvm_usage_count == 1) { - on_each_cpu(hardware_enable_nolock, &failed, 1); - - if (atomic_read(&failed)) { - hardware_disable_all_nolock(); - r = -EBUSY; - } - } - - mutex_unlock(&kvm_lock); - cpus_read_unlock(); - - return r; -} - static void kvm_shutdown(void) { /* @@ -5681,34 +5624,32 @@ static void kvm_shutdown(void) */ pr_info("kvm: exiting hardware virtualization\n"); kvm_rebooting = true; - on_each_cpu(hardware_disable_nolock, NULL, 1); + on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1); } static int kvm_suspend(void) { /* * Secondary CPUs and CPU hotplug are disabled across the suspend/resume - * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count - * is stable. Assert that kvm_lock is not held to ensure the system - * isn't suspended while KVM is enabling hardware. Hardware enabling - * can be preempted, but the task cannot be frozen until it has dropped - * all locks (userspace tasks are frozen via a fake signal). + * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage + * count is stable. Assert that kvm_usage_lock is not held to ensure + * the system isn't suspended while KVM is enabling hardware. Hardware + * enabling can be preempted, but the task cannot be frozen until it has + * dropped all locks (userspace tasks are frozen via a fake signal). 
*/ - lockdep_assert_not_held(&kvm_lock); + lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); - if (kvm_usage_count) - hardware_disable_nolock(NULL); + kvm_disable_virtualization_cpu(NULL); return 0; } static void kvm_resume(void) { - lockdep_assert_not_held(&kvm_lock); + lockdep_assert_not_held(&kvm_usage_lock); lockdep_assert_irqs_disabled(); - if (kvm_usage_count) - WARN_ON_ONCE(__hardware_enable_nolock()); + WARN_ON_ONCE(kvm_enable_virtualization_cpu()); } static struct syscore_ops kvm_syscore_ops = { @@ -5716,13 +5657,87 @@ static struct syscore_ops kvm_syscore_ops = { .resume = kvm_resume, .shutdown = kvm_shutdown, }; + +int kvm_enable_virtualization(void) +{ + int r; + + guard(mutex)(&kvm_usage_lock); + + if (kvm_usage_count++) + return 0; + + kvm_arch_enable_virtualization(); + + r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", + kvm_online_cpu, kvm_offline_cpu); + if (r) + goto err_cpuhp; + + register_syscore_ops(&kvm_syscore_ops); + + /* + * Undo virtualization enabling and bail if the system is going down. + * If userspace initiated a forced reboot, e.g. reboot -f, then it's + * possible for an in-flight operation to enable virtualization after + * syscore_shutdown() is called, i.e. without kvm_shutdown() being + * invoked. Note, this relies on system_state being set _before_ + * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked + * or this CPU observes the impending shutdown. Which is why KVM uses + * a syscore ops hook instead of registering a dedicated reboot + * notifier (the latter runs before system_state is updated). + */ + if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || + system_state == SYSTEM_RESTART) { + r = -EBUSY; + goto err_rebooting; + } + + return 0; + +err_rebooting: + unregister_syscore_ops(&kvm_syscore_ops); + cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); +err_cpuhp: + kvm_arch_disable_virtualization(); + --kvm_usage_count; + return r; +} +EXPORT_SYMBOL_GPL(kvm_enable_virtualization); + +void kvm_disable_virtualization(void) +{ + guard(mutex)(&kvm_usage_lock); + + if (--kvm_usage_count) + return; + + unregister_syscore_ops(&kvm_syscore_ops); + cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); + kvm_arch_disable_virtualization(); +} +EXPORT_SYMBOL_GPL(kvm_disable_virtualization); + +static int kvm_init_virtualization(void) +{ + if (enable_virt_at_load) + return kvm_enable_virtualization(); + + return 0; +} + +static void kvm_uninit_virtualization(void) +{ + if (enable_virt_at_load) + kvm_disable_virtualization(); +} #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ -static int hardware_enable_all(void) +static int kvm_init_virtualization(void) { return 0; } -static void hardware_disable_all(void) +static void kvm_uninit_virtualization(void) { } @@ -5820,7 +5835,6 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } -/* kvm_io_bus_write - called under kvm->slots_lock */ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { @@ -5841,7 +5855,6 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, } EXPORT_SYMBOL_GPL(kvm_io_bus_write); -/* kvm_io_bus_write_cookie - called under kvm->slots_lock */ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val, long cookie) { @@ -5891,7 +5904,6 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } -/* kvm_io_bus_read - called under 
kvm->slots_lock */ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val) { @@ -5910,6 +5922,7 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, r = __kvm_io_bus_read(vcpu, bus, &range, val); return r < 0 ? r : 0; } +EXPORT_SYMBOL_GPL(kvm_io_bus_read); int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev) @@ -6155,7 +6168,6 @@ static const struct file_operations stat_fops_per_vm = { .release = kvm_debugfs_release, .read = simple_attr_read, .write = simple_attr_write, - .llseek = no_llseek, }; static int vm_stat_get(void *_offset, u64 *val) @@ -6250,7 +6262,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) active = kvm_active_vms; mutex_unlock(&kvm_lock); - env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT); + env = kzalloc(sizeof(*env), GFP_KERNEL); if (!env) return; @@ -6266,7 +6278,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) add_uevent_var(env, "PID=%d", kvm->userspace_pid); if (!IS_ERR(kvm->debugfs_dentry)) { - char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); + char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); if (p) { tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); @@ -6326,8 +6338,9 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu) WRITE_ONCE(vcpu->ready, false); __this_cpu_write(kvm_running_vcpu, vcpu); - kvm_arch_sched_in(vcpu, cpu); kvm_arch_vcpu_load(vcpu, cpu); + + WRITE_ONCE(vcpu->scheduled_out, false); } static void kvm_sched_out(struct preempt_notifier *pn, @@ -6335,7 +6348,9 @@ static void kvm_sched_out(struct preempt_notifier *pn, { struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); - if (current->on_rq) { + WRITE_ONCE(vcpu->scheduled_out, true); + + if (task_is_runnable(current) && vcpu->wants_to_run) { WRITE_ONCE(vcpu->preempted, true); WRITE_ONCE(vcpu->ready, true); } @@ -6421,15 +6436,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) int r; int cpu; -#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", - kvm_online_cpu, kvm_offline_cpu); - if (r) - return r; - - register_syscore_ops(&kvm_syscore_ops); -#endif - /* A kmem cache lets us meet the alignment requirements of fx_save. */ if (!vcpu_align) vcpu_align = __alignof__(struct kvm_vcpu); @@ -6440,10 +6446,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) offsetofend(struct kvm_vcpu, stats_id) - offsetof(struct kvm_vcpu, arch), NULL); - if (!kvm_vcpu_cache) { - r = -ENOMEM; - goto err_vcpu_cache; - } + if (!kvm_vcpu_cache) + return -ENOMEM; for_each_possible_cpu(cpu) { if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu), @@ -6477,6 +6481,10 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) kvm_gmem_init(module); + r = kvm_init_virtualization(); + if (r) + goto err_virt; + /* * Registration _must_ be the very last thing done, as this exposes * /dev/kvm to userspace, i.e. all infrastructure must be setup! 
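Illustrative sketch, not part of the patch: callers of the newly exported pair simply bracket whatever needs hardware virtualization; the first reference performs the system-wide setup (arch hook, cpuhp state, syscore ops) and the last reference tears it down. The my_feature_*() wrappers are hypothetical.

static int my_feature_start(void)
{
	int r = kvm_enable_virtualization();

	if (r)
		return r;

	/* ... work that requires virtualization to stay enabled ... */
	return 0;
}

static void my_feature_stop(void)
{
	kvm_disable_virtualization();
}

With the enable_virt_at_load module parameter left at its default (true), kvm_init_virtualization() holds one reference for the module's lifetime, so a pair like the one above only toggles hardware state when the parameter is cleared.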
@@ -6490,6 +6498,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) return 0; err_register: + kvm_uninit_virtualization(); +err_virt: kvm_vfio_ops_exit(); err_vfio: kvm_async_pf_deinit(); @@ -6500,11 +6510,6 @@ err_cpu_kick_mask: for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); -err_vcpu_cache: -#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING - unregister_syscore_ops(&kvm_syscore_ops); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); -#endif return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -6520,119 +6525,14 @@ void kvm_exit(void) */ misc_deregister(&kvm_dev); + kvm_uninit_virtualization(); + debugfs_remove_recursive(kvm_debugfs_dir); for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); kvm_vfio_ops_exit(); kvm_async_pf_deinit(); -#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING - unregister_syscore_ops(&kvm_syscore_ops); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); -#endif kvm_irqfd_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); - -struct kvm_vm_worker_thread_context { - struct kvm *kvm; - struct task_struct *parent; - struct completion init_done; - kvm_vm_thread_fn_t thread_fn; - uintptr_t data; - int err; -}; - -static int kvm_vm_worker_thread(void *context) -{ - /* - * The init_context is allocated on the stack of the parent thread, so - * we have to locally copy anything that is needed beyond initialization - */ - struct kvm_vm_worker_thread_context *init_context = context; - struct task_struct *parent; - struct kvm *kvm = init_context->kvm; - kvm_vm_thread_fn_t thread_fn = init_context->thread_fn; - uintptr_t data = init_context->data; - int err; - - err = kthread_park(current); - /* kthread_park(current) is never supposed to return an error */ - WARN_ON(err != 0); - if (err) - goto init_complete; - - err = cgroup_attach_task_all(init_context->parent, current); - if (err) { - kvm_err("%s: cgroup_attach_task_all failed with err %d\n", - __func__, err); - goto init_complete; - } - - set_user_nice(current, task_nice(init_context->parent)); - -init_complete: - init_context->err = err; - complete(&init_context->init_done); - init_context = NULL; - - if (err) - goto out; - - /* Wait to be woken up by the spawner before proceeding. */ - kthread_parkme(); - - if (!kthread_should_stop()) - err = thread_fn(kvm, data); - -out: - /* - * Move kthread back to its original cgroup to prevent it lingering in - * the cgroup of the VM process, after the latter finishes its - * execution. - * - * kthread_stop() waits on the 'exited' completion condition which is - * set in exit_mm(), via mm_release(), in do_exit(). However, the - * kthread is removed from the cgroup in the cgroup_exit() which is - * called after the exit_mm(). This causes the kthread_stop() to return - * before the kthread actually quits the cgroup. 
- */ - rcu_read_lock(); - parent = rcu_dereference(current->real_parent); - get_task_struct(parent); - rcu_read_unlock(); - cgroup_attach_task_all(parent, current); - put_task_struct(parent); - - return err; -} - -int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, - uintptr_t data, const char *name, - struct task_struct **thread_ptr) -{ - struct kvm_vm_worker_thread_context init_context = {}; - struct task_struct *thread; - - *thread_ptr = NULL; - init_context.kvm = kvm; - init_context.parent = current; - init_context.thread_fn = thread_fn; - init_context.data = data; - init_completion(&init_context.init_done); - - thread = kthread_run(kvm_vm_worker_thread, &init_context, - "%s-%d", name, task_pid_nr(current)); - if (IS_ERR(thread)) - return PTR_ERR(thread); - - /* kthread_run is never supposed to return NULL */ - WARN_ON(thread == NULL); - - wait_for_completion(&init_context.init_done); - - if (!init_context.err) - *thread_ptr = thread; - - return init_context.err; -}
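For reference, a sketch of how the removed kvm_vm_create_worker_thread() helper was typically consumed; the my_* names and the periodic loop are illustrative, not taken from this patch.

static int my_worker_fn(struct kvm *kvm, uintptr_t data)
{
	while (!kthread_should_stop()) {
		/* periodic per-VM maintenance work */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}

static int my_start_worker(struct kvm *kvm)
{
	struct task_struct *thread;
	int r;

	r = kvm_vm_create_worker_thread(kvm, my_worker_fn, 0,
					"kvm-my-worker", &thread);
	if (r)
		return r;

	/* The worker parks itself; unpark it once the VM is ready to run. */
	kthread_unpark(thread);
	return 0;
}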