summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2023-09-01 15:49:45 -0400
committerPaolo Bonzini <pbonzini@redhat.com>2023-09-01 15:50:38 -0400
commitd011151616e73de20c139580b73fa4c7042bd861 (patch)
tree4864312c57760ea18a316a26069200d2ff154a6e
parent6d5e3c318a33edb1f9176964c4ed7f076fc4248c (diff)
parentd10f3780bc2f80744d291e118c0c8bade54ed3b8 (diff)
Merge branch 'kvm-x86-mmu-6.6' into HEAD
KVM x86 MMU changes for 6.6: - Rip out the ancient MMU_DEBUG crud and replace the useful bits with CONFIG_KVM_PROVE_MMU - Overhaul KVM's page-track APIs, and KVMGT's usage, to reduce the API surface that is needed by external users (currently only KVMGT), and fix a variety of issues in the process - Fix KVM's handling of !visible guest roots to avoid premature triple fault injection by loading a dummy root backed by the zero page
-rw-r--r--arch/x86/include/asm/kvm_host.h16
-rw-r--r--arch/x86/include/asm/kvm_page_track.h67
-rw-r--r--arch/x86/kvm/Kconfig13
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/mmu/mmu.c319
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h24
-rw-r--r--arch/x86/kvm/mmu/page_track.c252
-rw-r--r--arch/x86/kvm/mmu/page_track.h58
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h41
-rw-r--r--arch/x86/kvm/mmu/spte.c6
-rw-r--r--arch/x86/kvm/mmu/spte.h21
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c11
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c33
-rw-r--r--arch/x86/kvm/x86.c22
-rw-r--r--drivers/gpu/drm/i915/gvt/gtt.c102
-rw-r--r--drivers/gpu/drm/i915/gvt/gtt.h1
-rw-r--r--drivers/gpu/drm/i915/gvt/gvt.h3
-rw-r--r--drivers/gpu/drm/i915/gvt/kvmgt.c120
-rw-r--r--drivers/gpu/drm/i915/gvt/page_track.c10
-rw-r--r--include/linux/kvm_host.h19
20 files changed, 562 insertions, 578 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e3c9ff4146fc..1a4def36d5bb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -288,13 +288,13 @@ struct kvm_kernel_irq_routing_entry;
* kvm_mmu_page_role tracks the properties of a shadow page (where shadow page
* also includes TDP pages) to determine whether or not a page can be used in
* the given MMU context. This is a subset of the overall kvm_cpu_role to
- * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating
- * 2 bytes per gfn instead of 4 bytes per gfn.
+ * minimize the size of kvm_memory_slot.arch.gfn_write_track, i.e. allows
+ * allocating 2 bytes per gfn instead of 4 bytes per gfn.
*
* Upper-level shadow pages having gptes are tracked for write-protection via
- * gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create
- * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise
- * gfn_track will overflow and explosions will ensure.
+ * gfn_write_track. As above, gfn_write_track is a 16 bit counter, so KVM must
+ * not create more than 2^16-1 upper-level shadow pages at a single gfn,
+ * otherwise gfn_write_track will overflow and explosions will ensue.
*
* A unique shadow page (SP) for a gfn is created if and only if an existing SP
* cannot be reused. The ability to reuse a SP is tracked by its role, which
@@ -1023,7 +1023,7 @@ struct kvm_lpage_info {
struct kvm_arch_memory_slot {
struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
- unsigned short *gfn_track[KVM_PAGE_TRACK_MAX];
+ unsigned short *gfn_write_track;
};
/*
@@ -1265,8 +1265,9 @@ struct kvm_arch {
* create an NX huge page (without hanging the guest).
*/
struct list_head possible_nx_huge_pages;
- struct kvm_page_track_notifier_node mmu_sp_tracker;
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
struct kvm_page_track_notifier_head track_notifier_head;
+#endif
/*
* Protects marking pages unsync during page faults, as TDP MMU page
* faults only take mmu_lock for read. For simplicity, the unsync
@@ -1853,7 +1854,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
-void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
index eb186bc57f6a..3d040741044b 100644
--- a/arch/x86/include/asm/kvm_page_track.h
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -2,11 +2,9 @@
#ifndef _ASM_X86_KVM_PAGE_TRACK_H
#define _ASM_X86_KVM_PAGE_TRACK_H
-enum kvm_page_track_mode {
- KVM_PAGE_TRACK_WRITE,
- KVM_PAGE_TRACK_MAX,
-};
+#include <linux/kvm_types.h>
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
/*
* The notifier represented by @kvm_page_track_notifier_node is linked into
* the head which will be notified when guest is triggering the track event.
@@ -26,54 +24,39 @@ struct kvm_page_track_notifier_node {
* It is called when guest is writing the write-tracked page
* and write emulation is finished at that time.
*
- * @vcpu: the vcpu where the write access happened.
* @gpa: the physical address written by guest.
* @new: the data was written to the address.
* @bytes: the written length.
* @node: this node
*/
- void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
- int bytes, struct kvm_page_track_notifier_node *node);
+ void (*track_write)(gpa_t gpa, const u8 *new, int bytes,
+ struct kvm_page_track_notifier_node *node);
+
/*
- * It is called when memory slot is being moved or removed
- * users can drop write-protection for the pages in that memory slot
+ * Invoked when a memory region is removed from the guest. Or in KVM
+ * terms, when a memslot is deleted.
*
- * @kvm: the kvm where memory slot being moved or removed
- * @slot: the memory slot being moved or removed
- * @node: this node
+ * @gfn: base gfn of the region being removed
+ * @nr_pages: number of pages in the to-be-removed region
+ * @node: this node
*/
- void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_page_track_notifier_node *node);
+ void (*track_remove_region)(gfn_t gfn, unsigned long nr_pages,
+ struct kvm_page_track_notifier_node *node);
};
-int kvm_page_track_init(struct kvm *kvm);
-void kvm_page_track_cleanup(struct kvm *kvm);
+int kvm_page_track_register_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n);
+void kvm_page_track_unregister_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n);
-bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
-int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
-
-void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
-int kvm_page_track_create_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- unsigned long npages);
-
-void kvm_slot_page_track_add_page(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn,
- enum kvm_page_track_mode mode);
-void kvm_slot_page_track_remove_page(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn,
- enum kvm_page_track_mode mode);
-bool kvm_slot_page_track_is_active(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- gfn_t gfn, enum kvm_page_track_mode mode);
+int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn);
+int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn);
+#else
+/*
+ * Allow defining a node in a structure even if page tracking is disabled, e.g.
+ * to play nice with testing headers via direct inclusion from the command line.
+ */
+struct kvm_page_track_notifier_node {};
+#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */
-void
-kvm_page_track_register_notifier(struct kvm *kvm,
- struct kvm_page_track_notifier_node *n);
-void
-kvm_page_track_unregister_notifier(struct kvm *kvm,
- struct kvm_page_track_notifier_node *n);
-void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
- int bytes);
-void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot);
#endif
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 66dbd1f4d57d..ed90f148140d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -138,6 +138,19 @@ config KVM_XEN
If in doubt, say "N".
+config KVM_PROVE_MMU
+ bool "Prove KVM MMU correctness"
+ depends on DEBUG_KERNEL
+ depends on KVM
+ depends on EXPERT
+ help
+ Enables runtime assertions in KVM's MMU that are too costly to enable
+ in anything remotely resembling a production environment, e.g. this
+ gates code that verifies a to-be-freed page table doesn't have any
+ present SPTEs.
+
+ If in doubt, say "N".
+
config KVM_EXTERNAL_WRITE_TRACKING
bool
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 92d5a1924fc1..253fb2093d5d 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -121,6 +121,8 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes);
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 276157f8496c..e1d011c67cc6 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -25,6 +25,7 @@
#include "kvm_cache_regs.h"
#include "smm.h"
#include "kvm_emulate.h"
+#include "page_track.h"
#include "cpuid.h"
#include "spte.h"
@@ -53,7 +54,7 @@
#include <asm/io.h>
#include <asm/set_memory.h>
#include <asm/vmx.h>
-#include <asm/kvm_page_track.h>
+
#include "trace.h"
extern bool itlb_multihit_kvm_mitigation;
@@ -115,11 +116,6 @@ static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;
-#ifdef MMU_DEBUG
-bool dbg = 0;
-module_param(dbg, bool, 0644);
-#endif
-
#define PTE_PREFETCH_NUM 8
#include <trace/events/kvm.h>
@@ -486,7 +482,7 @@ retry:
*/
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
- WARN_ON(is_shadow_present_pte(*sptep));
+ WARN_ON_ONCE(is_shadow_present_pte(*sptep));
__set_spte(sptep, new_spte);
}
@@ -498,7 +494,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
{
u64 old_spte = *sptep;
- WARN_ON(!is_shadow_present_pte(new_spte));
+ WARN_ON_ONCE(!is_shadow_present_pte(new_spte));
check_spte_writable_invariants(new_spte);
if (!is_shadow_present_pte(old_spte)) {
@@ -511,7 +507,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
- WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
+ WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
return old_spte;
}
@@ -593,7 +589,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
* by a refcounted page, the refcount is elevated.
*/
page = kvm_pfn_to_refcounted_page(pfn);
- WARN_ON(page && !page_count(page));
+ WARN_ON_ONCE(page && !page_count(page));
if (is_accessed_spte(old_spte))
kvm_set_pfn_accessed(pfn);
@@ -808,7 +804,7 @@ static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
linfo->disallow_lpage += count;
- WARN_ON(linfo->disallow_lpage < 0);
+ WARN_ON_ONCE(linfo->disallow_lpage < 0);
}
}
@@ -835,8 +831,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
/* the non-leaf shadow pages are keeping readonly. */
if (sp->role.level > PG_LEVEL_4K)
- return kvm_slot_page_track_add_page(kvm, slot, gfn,
- KVM_PAGE_TRACK_WRITE);
+ return __kvm_write_track_add_gfn(kvm, slot, gfn);
kvm_mmu_gfn_disallow_lpage(slot, gfn);
@@ -882,8 +877,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
slots = kvm_memslots_for_spte_role(kvm, sp->role);
slot = __gfn_to_memslot(slots, gfn);
if (sp->role.level > PG_LEVEL_4K)
- return kvm_slot_page_track_remove_page(kvm, slot, gfn,
- KVM_PAGE_TRACK_WRITE);
+ return __kvm_write_track_remove_gfn(kvm, slot, gfn);
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
@@ -937,10 +931,8 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
int count = 0;
if (!rmap_head->val) {
- rmap_printk("%p %llx 0->1\n", spte, *spte);
rmap_head->val = (unsigned long)spte;
} else if (!(rmap_head->val & 1)) {
- rmap_printk("%p %llx 1->many\n", spte, *spte);
desc = kvm_mmu_memory_cache_alloc(cache);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
@@ -949,7 +941,6 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
rmap_head->val = (unsigned long)desc | 1;
++count;
} else {
- rmap_printk("%p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
count = desc->tail_count + desc->spte_count;
@@ -969,7 +960,8 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
return count;
}
-static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
+static void pte_list_desc_remove_entry(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
struct pte_list_desc *desc, int i)
{
struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
@@ -980,7 +972,7 @@ static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
* when adding an entry and the previous head is full, and heads are
* removed (this flow) when they become empty.
*/
- BUG_ON(j < 0);
+ KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm);
/*
* Replace the to-be-freed SPTE with the last valid entry from the head
@@ -1005,35 +997,34 @@ static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
mmu_free_pte_list_desc(head_desc);
}
-static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
+static void pte_list_remove(struct kvm *kvm, u64 *spte,
+ struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
int i;
- if (!rmap_head->val) {
- pr_err("%s: %p 0->BUG\n", __func__, spte);
- BUG();
- } else if (!(rmap_head->val & 1)) {
- rmap_printk("%p 1->0\n", spte);
- if ((u64 *)rmap_head->val != spte) {
- pr_err("%s: %p 1->BUG\n", __func__, spte);
- BUG();
- }
+ if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
+ return;
+
+ if (!(rmap_head->val & 1)) {
+ if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
+ return;
+
rmap_head->val = 0;
} else {
- rmap_printk("%p many->many\n", spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
while (desc) {
for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
- pte_list_desc_remove_entry(rmap_head, desc, i);
+ pte_list_desc_remove_entry(kvm, rmap_head,
+ desc, i);
return;
}
}
desc = desc->more;
}
- pr_err("%s: %p many->many\n", __func__, spte);
- BUG();
+
+ KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
}
}
@@ -1041,7 +1032,7 @@ static void kvm_zap_one_rmap_spte(struct kvm *kvm,
struct kvm_rmap_head *rmap_head, u64 *sptep)
{
mmu_spte_clear_track_bits(kvm, sptep);
- pte_list_remove(sptep, rmap_head);
+ pte_list_remove(kvm, sptep, rmap_head);
}
/* Return true if at least one SPTE was zapped, false otherwise */
@@ -1116,7 +1107,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
slot = __gfn_to_memslot(slots, gfn);
rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
- pte_list_remove(spte, rmap_head);
+ pte_list_remove(kvm, spte, rmap_head);
}
/*
@@ -1208,7 +1199,7 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
struct kvm_mmu_page *sp;
sp = sptep_to_sp(sptep);
- WARN_ON(sp->role.level == PG_LEVEL_4K);
+ WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K);
drop_spte(kvm, sptep);
@@ -1237,8 +1228,6 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
!(pt_protect && is_mmu_writable_spte(spte)))
return false;
- rmap_printk("spte %p %llx\n", sptep, *sptep);
-
if (pt_protect)
spte &= ~shadow_mmu_writable_mask;
spte = spte & ~PT_WRITABLE_MASK;
@@ -1263,9 +1252,7 @@ static bool spte_clear_dirty(u64 *sptep)
{
u64 spte = *sptep;
- rmap_printk("spte %p %llx\n", sptep, *sptep);
-
- MMU_WARN_ON(!spte_ad_enabled(spte));
+ KVM_MMU_WARN_ON(!spte_ad_enabled(spte));
spte &= ~shadow_dirty_mask;
return mmu_spte_update(sptep, spte);
}
@@ -1471,14 +1458,11 @@ static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
u64 new_spte;
kvm_pfn_t new_pfn;
- WARN_ON(pte_huge(pte));
+ WARN_ON_ONCE(pte_huge(pte));
new_pfn = pte_pfn(pte);
restart:
for_each_rmap_spte(rmap_head, &iter, sptep) {
- rmap_printk("spte %p %llx gfn %llx (%d)\n",
- sptep, *sptep, gfn, level);
-
need_flush = true;
if (pte_write(pte)) {
@@ -1706,21 +1690,19 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
return young;
}
-#ifdef MMU_DEBUG
-static int is_empty_shadow_page(u64 *spt)
+static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
{
- u64 *pos;
- u64 *end;
+#ifdef CONFIG_KVM_PROVE_MMU
+ int i;
- for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++)
- if (is_shadow_present_pte(*pos)) {
- printk(KERN_ERR "%s: %p %llx\n", __func__,
- pos, *pos);
- return 0;
- }
- return 1;
-}
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+ if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i])))
+ pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free",
+ sp->spt[i], &sp->spt[i],
+ kvm_mmu_page_get_gfn(sp, i));
+ }
#endif
+}
/*
* This value is the sum of all of the kvm instances's
@@ -1748,7 +1730,8 @@ static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
{
- MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
+ kvm_mmu_check_sptes_at_free(sp);
+
hlist_del(&sp->hash_link);
list_del(&sp->link);
free_page((unsigned long)sp->spt);
@@ -1771,16 +1754,16 @@ static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
pte_list_add(cache, parent_pte, &sp->parent_ptes);
}
-static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
+static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
u64 *parent_pte)
{
- pte_list_remove(parent_pte, &sp->parent_ptes);
+ pte_list_remove(kvm, parent_pte, &sp->parent_ptes);
}
-static void drop_parent_pte(struct kvm_mmu_page *sp,
+static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
u64 *parent_pte)
{
- mmu_page_remove_parent_pte(sp, parent_pte);
+ mmu_page_remove_parent_pte(kvm, sp, parent_pte);
mmu_spte_clear_no_track(parent_pte);
}
@@ -1836,7 +1819,7 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
{
--sp->unsync_children;
- WARN_ON((int)sp->unsync_children < 0);
+ WARN_ON_ONCE((int)sp->unsync_children < 0);
__clear_bit(idx, sp->unsync_child_bitmap);
}
@@ -1894,7 +1877,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- WARN_ON(!sp->unsync);
+ WARN_ON_ONCE(!sp->unsync);
trace_kvm_mmu_sync_page(sp);
sp->unsync = 0;
--kvm->stat.mmu_unsync;
@@ -2069,11 +2052,11 @@ static int mmu_pages_first(struct kvm_mmu_pages *pvec,
if (pvec->nr == 0)
return 0;
- WARN_ON(pvec->page[0].idx != INVALID_INDEX);
+ WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX);
sp = pvec->page[0].sp;
level = sp->role.level;
- WARN_ON(level == PG_LEVEL_4K);
+ WARN_ON_ONCE(level == PG_LEVEL_4K);
parents->parent[level-2] = sp;
@@ -2095,7 +2078,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
if (!sp)
return;
- WARN_ON(idx == INVALID_INDEX);
+ WARN_ON_ONCE(idx == INVALID_INDEX);
clear_unsync_child_bit(sp, idx);
level++;
} while (!sp->unsync_children);
@@ -2216,7 +2199,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
if (ret < 0)
break;
- WARN_ON(!list_empty(&invalid_list));
+ WARN_ON_ONCE(!list_empty(&invalid_list));
if (ret > 0)
kvm_flush_remote_tlbs(kvm);
}
@@ -2495,7 +2478,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (child->role.access == direct_access)
return;
- drop_parent_pte(child, sptep);
+ drop_parent_pte(vcpu->kvm, child, sptep);
kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
}
}
@@ -2513,7 +2496,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
drop_spte(kvm, spte);
} else {
child = spte_to_child_sp(pte);
- drop_parent_pte(child, spte);
+ drop_parent_pte(kvm, child, spte);
/*
* Recursively zap nested TDP SPs, parentless SPs are
@@ -2544,13 +2527,13 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm,
return zapped;
}
-static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{
u64 *sptep;
struct rmap_iterator iter;
while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
- drop_parent_pte(sp, sptep);
+ drop_parent_pte(kvm, sp, sptep);
}
static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -2589,7 +2572,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
++kvm->stat.mmu_shadow_zapped;
*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
*nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
- kvm_mmu_unlink_parents(sp);
+ kvm_mmu_unlink_parents(kvm, sp);
/* Zapping children means active_mmu_pages has become unstable. */
list_unstable = *nr_zapped;
@@ -2671,7 +2654,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
kvm_flush_remote_tlbs(kvm);
list_for_each_entry_safe(sp, nsp, invalid_list, link) {
- WARN_ON(!sp->role.invalid || sp->root_count);
+ WARN_ON_ONCE(!sp->role.invalid || sp->root_count);
kvm_mmu_free_shadow_page(sp);
}
}
@@ -2771,12 +2754,9 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
LIST_HEAD(invalid_list);
int r;
- pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
r = 0;
write_lock(&kvm->mmu_lock);
for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
- pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
- sp->role.word);
r = 1;
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
}
@@ -2827,7 +2807,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
* track machinery is used to write-protect upper-level shadow pages,
* i.e. this guards the role.level == 4K assertion below!
*/
- if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
+ if (kvm_gfn_is_write_tracked(kvm, slot, gfn))
return -EPERM;
/*
@@ -2869,7 +2849,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
continue;
}
- WARN_ON(sp->role.level != PG_LEVEL_4K);
+ WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
kvm_unsync_page(kvm, sp);
}
if (locked)
@@ -2934,9 +2914,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
bool prefetch = !fault || fault->prefetch;
bool write_fault = fault && fault->write;
- pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
- *sptep, write_fault, gfn);
-
if (unlikely(is_noslot_pfn(pfn))) {
vcpu->stat.pf_mmio_spte_created++;
mark_mmio_spte(vcpu, sptep, gfn, pte_access);
@@ -2953,11 +2930,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
u64 pte = *sptep;
child = spte_to_child_sp(pte);
- drop_parent_pte(child, sptep);
+ drop_parent_pte(vcpu->kvm, child, sptep);
flush = true;
} else if (pfn != spte_to_pfn(*sptep)) {
- pgprintk("hfn old %llx new %llx\n",
- spte_to_pfn(*sptep), pfn);
drop_spte(vcpu->kvm, sptep);
flush = true;
} else
@@ -2982,8 +2957,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
if (flush)
kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
- pgprintk("%s: setting spte %llx\n", __func__, *sptep);
-
if (!was_rmapped) {
WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
rmap_add(vcpu, slot, sptep, gfn, pte_access);
@@ -3029,7 +3002,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
u64 *spte, *start = NULL;
int i;
- WARN_ON(!sp->role.direct);
+ WARN_ON_ONCE(!sp->role.direct);
i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
spte = sp->spt + i;
@@ -3570,12 +3543,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
if (!VALID_PAGE(*root_hpa))
return;
- /*
- * The "root" may be a special root, e.g. a PAE entry, treat it as a
- * SPTE to ensure any non-PA bits are dropped.
- */
- sp = spte_to_child_sp(*root_hpa);
- if (WARN_ON(!sp))
+ sp = root_to_sp(*root_hpa);
+ if (WARN_ON_ONCE(!sp))
return;
if (is_tdp_mmu_page(sp))
@@ -3620,7 +3589,9 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
&invalid_list);
if (free_active_root) {
- if (to_shadow_page(mmu->root.hpa)) {
+ if (kvm_mmu_is_dummy_root(mmu->root.hpa)) {
+ /* Nothing to cleanup for dummy roots. */
+ } else if (root_to_sp(mmu->root.hpa)) {
mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
} else if (mmu->pae_root) {
for (i = 0; i < 4; ++i) {
@@ -3644,6 +3615,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
{
unsigned long roots_to_free = 0;
+ struct kvm_mmu_page *sp;
hpa_t root_hpa;
int i;
@@ -3658,8 +3630,8 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
if (!VALID_PAGE(root_hpa))
continue;
- if (!to_shadow_page(root_hpa) ||
- to_shadow_page(root_hpa)->role.guest_mode)
+ sp = root_to_sp(root_hpa);
+ if (!sp || sp->role.guest_mode)
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
}
@@ -3667,19 +3639,6 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
-
-static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
-{
- int ret = 0;
-
- if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
- ret = 1;
- }
-
- return ret;
-}
-
static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
u8 level)
{
@@ -3817,8 +3776,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
root_gfn = root_pgd >> PAGE_SHIFT;
- if (mmu_check_root(vcpu, root_gfn))
- return 1;
+ if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
+ mmu->root.hpa = kvm_mmu_get_dummy_root();
+ return 0;
+ }
/*
* On SVM, reading PDPTRs might access guest memory, which might fault
@@ -3830,8 +3791,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
if (!(pdptrs[i] & PT_PRESENT_MASK))
continue;
- if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
- return 1;
+ if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT))
+ pdptrs[i] = 0;
}
}
@@ -3998,7 +3959,7 @@ static bool is_unsync_root(hpa_t root)
{
struct kvm_mmu_page *sp;
- if (!VALID_PAGE(root))
+ if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root))
return false;
/*
@@ -4014,7 +3975,7 @@ static bool is_unsync_root(hpa_t root)
* requirement isn't satisfied.
*/
smp_rmb();
- sp = to_shadow_page(root);
+ sp = root_to_sp(root);
/*
* PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the
@@ -4044,11 +4005,12 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu->root.hpa;
- sp = to_shadow_page(root);
if (!is_unsync_root(root))
return;
+ sp = root_to_sp(root);
+
write_lock(&vcpu->kvm->mmu_lock);
mmu_sync_children(vcpu, sp, true);
write_unlock(&vcpu->kvm->mmu_lock);
@@ -4190,7 +4152,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
return RET_PF_EMULATE;
reserved = get_mmio_spte(vcpu, addr, &spte);
- if (WARN_ON(reserved))
+ if (WARN_ON_ONCE(reserved))
return -EINVAL;
if (is_mmio_spte(spte)) {
@@ -4228,7 +4190,7 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
* guest is writing the page which is write tracked which can
* not be fixed by page fault handler.
*/
- if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
+ if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn))
return true;
return false;
@@ -4378,7 +4340,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
- struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
+ struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
/* Special roots, e.g. pae_root, are not backed by shadow pages. */
if (sp && is_obsolete_sp(vcpu->kvm, sp))
@@ -4403,6 +4365,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
{
int r;
+ /* Dummy roots are used only for shadowing bad guest roots. */
+ if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa)))
+ return RET_PF_RETRY;
+
if (page_fault_handle_page_track(vcpu, fault))
return RET_PF_EMULATE;
@@ -4439,8 +4405,6 @@ out_unlock:
static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
- pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
-
/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
fault->max_level = PG_LEVEL_2M;
return direct_page_fault(vcpu, fault);
@@ -4558,9 +4522,19 @@ static void nonpaging_init_context(struct kvm_mmu *context)
static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
union kvm_mmu_page_role role)
{
- return (role.direct || pgd == root->pgd) &&
- VALID_PAGE(root->hpa) &&
- role.word == to_shadow_page(root->hpa)->role.word;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root->hpa))
+ return false;
+
+ if (!role.direct && pgd != root->pgd)
+ return false;
+
+ sp = root_to_sp(root->hpa);
+ if (WARN_ON_ONCE(!sp))
+ return false;
+
+ return role.word == sp->role.word;
}
/*
@@ -4630,11 +4604,10 @@ static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
gpa_t new_pgd, union kvm_mmu_page_role new_role)
{
/*
- * For now, limit the caching to 64-bit hosts+VMs in order to avoid
- * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
- * later if necessary.
+ * Limit reuse to 64-bit hosts+VMs without "special" roots in order to
+ * avoid having to deal with PDPTEs and other complexities.
*/
- if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa))
+ if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa))
kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
if (VALID_PAGE(mmu->root.hpa))
@@ -4680,9 +4653,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
* If this is a direct root page, it doesn't have a write flooding
* count. Otherwise, clear the write flooding count.
*/
- if (!new_role.direct)
- __clear_sp_write_flooding_count(
- to_shadow_page(vcpu->arch.mmu->root.hpa));
+ if (!new_role.direct) {
+ struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+ if (!WARN_ON_ONCE(!sp))
+ __clear_sp_write_flooding_count(sp);
+ }
}
EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
@@ -5449,8 +5425,8 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
* physical address properties) in a single VM would require tracking
* all relevant CPUID information in kvm_mmu_page_role. That is very
* undesirable as it would increase the memory requirements for
- * gfn_track (see struct kvm_mmu_page_role comments). For now that
- * problem is swept under the rug; KVM's CPUID API is horrific and
+ * gfn_write_track (see struct kvm_mmu_page_role comments). For now
+ * that problem is swept under the rug; KVM's CPUID API is horrific and
* it's all but impossible to solve it without introducing a new API.
*/
vcpu->arch.root_mmu.root_role.word = 0;
@@ -5513,9 +5489,9 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
struct kvm *kvm = vcpu->kvm;
kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
- WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
+ WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
- WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
+ WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
}
@@ -5528,16 +5504,21 @@ static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
/*
* When freeing obsolete roots, treat roots as obsolete if they don't
- * have an associated shadow page. This does mean KVM will get false
+ * have an associated shadow page, as it's impossible to determine if
+ * such roots are fresh or stale. This does mean KVM will get false
* positives and free roots that don't strictly need to be freed, but
* such false positives are relatively rare:
*
- * (a) only PAE paging and nested NPT has roots without shadow pages
+ * (a) only PAE paging and nested NPT have roots without shadow pages
+ * (or any shadow paging flavor with a dummy root, see note below)
* (b) remote reloads due to a memslot update obsoletes _all_ roots
* (c) KVM doesn't track previous roots for PAE paging, and the guest
* is unlikely to zap an in-use PGD.
+ *
+ * Note! Dummy roots are unique in that they are obsoleted by memslot
+ * _creation_! See also FNAME(fetch).
*/
- sp = to_shadow_page(root_hpa);
+ sp = root_to_sp(root_hpa);
return !sp || is_obsolete_sp(kvm, sp);
}
@@ -5616,9 +5597,6 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
{
unsigned offset, pte_size, misaligned;
- pgprintk("misaligned: gpa %llx bytes %d role %x\n",
- gpa, bytes, sp->role.word);
-
offset = offset_in_page(gpa);
pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
@@ -5666,9 +5644,8 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
return spte;
}
-static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes,
- struct kvm_page_track_notifier_node *node)
+void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm_mmu_page *sp;
@@ -5684,8 +5661,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
return;
- pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
-
write_lock(&vcpu->kvm->mmu_lock);
gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
@@ -5724,7 +5699,18 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
int r, emulation_type = EMULTYPE_PF;
bool direct = vcpu->arch.mmu->root_role.direct;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
+ /*
+ * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
+ * checks when emulating instructions that triggers implicit access.
+ * WARN if hardware generates a fault with an error code that collides
+ * with the KVM-defined value. Clear the flag and continue on, i.e.
+ * don't terminate the VM, as KVM can't possibly be relying on a flag
+ * that KVM doesn't know about.
+ */
+ if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
+ error_code &= ~PFERR_IMPLICIT_ACCESS;
+
+ if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
return RET_PF_RETRY;
r = RET_PF_INVALID;
@@ -6081,7 +6067,7 @@ restart:
* pages. Skip the bogus page, otherwise we'll get stuck in an
* infinite loop if the page gets put back on the list (again).
*/
- if (WARN_ON(sp->role.invalid))
+ if (WARN_ON_ONCE(sp->role.invalid))
continue;
/*
@@ -6181,16 +6167,8 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
}
-static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- struct kvm_page_track_notifier_node *node)
-{
- kvm_mmu_zap_all_fast(kvm);
-}
-
int kvm_mmu_init_vm(struct kvm *kvm)
{
- struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
int r;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
@@ -6204,10 +6182,6 @@ int kvm_mmu_init_vm(struct kvm *kvm)
return r;
}
- node->track_write = kvm_mmu_pte_write;
- node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
- kvm_page_track_register_notifier(kvm, node);
-
kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
@@ -6228,10 +6202,6 @@ static void mmu_free_vm_memory_caches(struct kvm *kvm)
void kvm_mmu_uninit_vm(struct kvm *kvm)
{
- struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
-
- kvm_page_track_unregister_notifier(kvm, node);
-
if (tdp_mmu_enabled)
kvm_mmu_uninit_tdp_mmu(kvm);
@@ -6700,7 +6670,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
*/
}
-void kvm_mmu_zap_all(struct kvm *kvm)
+static void kvm_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
@@ -6709,7 +6679,7 @@ void kvm_mmu_zap_all(struct kvm *kvm)
write_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
- if (WARN_ON(sp->role.invalid))
+ if (WARN_ON_ONCE(sp->role.invalid))
continue;
if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
goto restart;
@@ -6725,9 +6695,20 @@ restart:
write_unlock(&kvm->mmu_lock);
}
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+ kvm_mmu_zap_all(kvm);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_mmu_zap_all_fast(kvm);
+}
+
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{
- WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+ WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
gen &= MMIO_SPTE_GEN_MASK;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 86cb83bb3480..b102014e2c60 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -6,18 +6,10 @@
#include <linux/kvm_host.h>
#include <asm/kvm_host.h>
-#undef MMU_DEBUG
-
-#ifdef MMU_DEBUG
-extern bool dbg;
-
-#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
-#define rmap_printk(fmt, args...) do { if (dbg) printk("%s: " fmt, __func__, ## args); } while (0)
-#define MMU_WARN_ON(x) WARN_ON(x)
+#ifdef CONFIG_KVM_PROVE_MMU
+#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x)
#else
-#define pgprintk(x...) do { } while (0)
-#define rmap_printk(x...) do { } while (0)
-#define MMU_WARN_ON(x) do { } while (0)
+#define KVM_MMU_WARN_ON(x) BUILD_BUG_ON_INVALID(x)
#endif
/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */
@@ -44,6 +36,16 @@ extern bool dbg;
#define INVALID_PAE_ROOT 0
#define IS_VALID_PAE_ROOT(x) (!!(x))
+static inline hpa_t kvm_mmu_get_dummy_root(void)
+{
+ return my_zero_pfn(0) << PAGE_SHIFT;
+}
+
+static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
+{
+ return is_zero_pfn(shadow_page >> PAGE_SHIFT);
+}
+
typedef u64 __rcu *tdp_ptep_t;
struct kvm_mmu_page {
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 0a2ac438d647..c87da11f3a04 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -12,13 +12,13 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/lockdep.h>
#include <linux/kvm_host.h>
#include <linux/rculist.h>
-#include <asm/kvm_page_track.h>
-
#include "mmu.h"
#include "mmu_internal.h"
+#include "page_track.h"
bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
{
@@ -28,103 +28,64 @@ bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
{
- int i;
-
- for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
- kvfree(slot->arch.gfn_track[i]);
- slot->arch.gfn_track[i] = NULL;
- }
+ kvfree(slot->arch.gfn_write_track);
+ slot->arch.gfn_write_track = NULL;
}
-int kvm_page_track_create_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- unsigned long npages)
+static int __kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot,
+ unsigned long npages)
{
- int i;
-
- for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
- if (i == KVM_PAGE_TRACK_WRITE &&
- !kvm_page_track_write_tracking_enabled(kvm))
- continue;
-
- slot->arch.gfn_track[i] =
- __vcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
- GFP_KERNEL_ACCOUNT);
- if (!slot->arch.gfn_track[i])
- goto track_free;
- }
+ const size_t size = sizeof(*slot->arch.gfn_write_track);
- return 0;
+ if (!slot->arch.gfn_write_track)
+ slot->arch.gfn_write_track = __vcalloc(npages, size,
+ GFP_KERNEL_ACCOUNT);
-track_free:
- kvm_page_track_free_memslot(slot);
- return -ENOMEM;
+ return slot->arch.gfn_write_track ? 0 : -ENOMEM;
}
-static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode)
+int kvm_page_track_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long npages)
{
- if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX)
- return false;
+ if (!kvm_page_track_write_tracking_enabled(kvm))
+ return 0;
- return true;
+ return __kvm_page_track_write_tracking_alloc(slot, npages);
}
int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot)
{
- unsigned short *gfn_track;
-
- if (slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE])
- return 0;
-
- gfn_track = __vcalloc(slot->npages, sizeof(*gfn_track),
- GFP_KERNEL_ACCOUNT);
- if (gfn_track == NULL)
- return -ENOMEM;
-
- slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE] = gfn_track;
- return 0;
+ return __kvm_page_track_write_tracking_alloc(slot, slot->npages);
}
-static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
- enum kvm_page_track_mode mode, short count)
+static void update_gfn_write_track(struct kvm_memory_slot *slot, gfn_t gfn,
+ short count)
{
int index, val;
index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
- val = slot->arch.gfn_track[mode][index];
+ val = slot->arch.gfn_write_track[index];
- if (WARN_ON(val + count < 0 || val + count > USHRT_MAX))
+ if (WARN_ON_ONCE(val + count < 0 || val + count > USHRT_MAX))
return;
- slot->arch.gfn_track[mode][index] += count;
+ slot->arch.gfn_write_track[index] += count;
}
-/*
- * add guest page to the tracking pool so that corresponding access on that
- * page will be intercepted.
- *
- * It should be called under the protection both of mmu-lock and kvm->srcu
- * or kvm->slots_lock.
- *
- * @kvm: the guest instance we are interested in.
- * @slot: the @gfn belongs to.
- * @gfn: the guest page.
- * @mode: tracking mode, currently only write track is supported.
- */
-void kvm_slot_page_track_add_page(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn,
- enum kvm_page_track_mode mode)
+void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn)
{
+ lockdep_assert_held_write(&kvm->mmu_lock);
- if (WARN_ON(!page_track_mode_is_valid(mode)))
- return;
+ lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) ||
+ srcu_read_lock_held(&kvm->srcu));
- if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE &&
- !kvm_page_track_write_tracking_enabled(kvm)))
+ if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm))
return;
- update_gfn_track(slot, gfn, mode, 1);
+ update_gfn_write_track(slot, gfn, 1);
/*
* new track stops large page mapping for the
@@ -132,37 +93,22 @@ void kvm_slot_page_track_add_page(struct kvm *kvm,
*/
kvm_mmu_gfn_disallow_lpage(slot, gfn);
- if (mode == KVM_PAGE_TRACK_WRITE)
- if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
- kvm_flush_remote_tlbs(kvm);
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
+ kvm_flush_remote_tlbs(kvm);
}
-EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page);
-/*
- * remove the guest page from the tracking pool which stops the interception
- * of corresponding access on that page. It is the opposed operation of
- * kvm_slot_page_track_add_page().
- *
- * It should be called under the protection both of mmu-lock and kvm->srcu
- * or kvm->slots_lock.
- *
- * @kvm: the guest instance we are interested in.
- * @slot: the @gfn belongs to.
- * @gfn: the guest page.
- * @mode: tracking mode, currently only write track is supported.
- */
-void kvm_slot_page_track_remove_page(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn,
- enum kvm_page_track_mode mode)
+void __kvm_write_track_remove_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn)
{
- if (WARN_ON(!page_track_mode_is_valid(mode)))
- return;
+ lockdep_assert_held_write(&kvm->mmu_lock);
- if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE &&
- !kvm_page_track_write_tracking_enabled(kvm)))
+ lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) ||
+ srcu_read_lock_held(&kvm->srcu));
+
+ if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm))
return;
- update_gfn_track(slot, gfn, mode, -1);
+ update_gfn_write_track(slot, gfn, -1);
/*
* allow large page mapping for the tracked page
@@ -170,31 +116,26 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm,
*/
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
-EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page);
/*
* check if the corresponding access on the specified guest page is tracked.
*/
-bool kvm_slot_page_track_is_active(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- gfn_t gfn, enum kvm_page_track_mode mode)
+bool kvm_gfn_is_write_tracked(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn)
{
int index;
- if (WARN_ON(!page_track_mode_is_valid(mode)))
- return false;
-
if (!slot)
return false;
- if (mode == KVM_PAGE_TRACK_WRITE &&
- !kvm_page_track_write_tracking_enabled(kvm))
+ if (!kvm_page_track_write_tracking_enabled(kvm))
return false;
index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
- return !!READ_ONCE(slot->arch.gfn_track[mode][index]);
+ return !!READ_ONCE(slot->arch.gfn_write_track[index]);
}
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
void kvm_page_track_cleanup(struct kvm *kvm)
{
struct kvm_page_track_notifier_head *head;
@@ -216,17 +157,22 @@ int kvm_page_track_init(struct kvm *kvm)
* register the notifier so that event interception for the tracked guest
* pages can be received.
*/
-void
-kvm_page_track_register_notifier(struct kvm *kvm,
- struct kvm_page_track_notifier_node *n)
+int kvm_page_track_register_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
{
struct kvm_page_track_notifier_head *head;
+ if (!kvm || kvm->mm != current->mm)
+ return -ESRCH;
+
+ kvm_get_kvm(kvm);
+
head = &kvm->arch.track_notifier_head;
write_lock(&kvm->mmu_lock);
hlist_add_head_rcu(&n->node, &head->track_notifier_list);
write_unlock(&kvm->mmu_lock);
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
@@ -234,9 +180,8 @@ EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
* stop receiving the event interception. It is the opposed operation of
* kvm_page_track_register_notifier().
*/
-void
-kvm_page_track_unregister_notifier(struct kvm *kvm,
- struct kvm_page_track_notifier_node *n)
+void kvm_page_track_unregister_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
{
struct kvm_page_track_notifier_head *head;
@@ -246,6 +191,8 @@ kvm_page_track_unregister_notifier(struct kvm *kvm,
hlist_del_rcu(&n->node);
write_unlock(&kvm->mmu_lock);
synchronize_srcu(&head->track_srcu);
+
+ kvm_put_kvm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
@@ -256,34 +203,30 @@ EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
* The node should figure out if the written page is the one that node is
* interested in by itself.
*/
-void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
- int bytes)
+void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes)
{
struct kvm_page_track_notifier_head *head;
struct kvm_page_track_notifier_node *n;
int idx;
- head = &vcpu->kvm->arch.track_notifier_head;
+ head = &kvm->arch.track_notifier_head;
if (hlist_empty(&head->track_notifier_list))
return;
idx = srcu_read_lock(&head->track_srcu);
hlist_for_each_entry_srcu(n, &head->track_notifier_list, node,
- srcu_read_lock_held(&head->track_srcu))
+ srcu_read_lock_held(&head->track_srcu))
if (n->track_write)
- n->track_write(vcpu, gpa, new, bytes, n);
+ n->track_write(gpa, new, bytes, n);
srcu_read_unlock(&head->track_srcu, idx);
}
/*
- * Notify the node that memory slot is being removed or moved so that it can
- * drop write-protection for the pages in the memory slot.
- *
- * The node should figure out it has any write-protected pages in this slot
- * by itself.
+ * Notify external page track nodes that a memory region is being removed from
+ * the VM, e.g. so that users can free any associated metadata.
*/
-void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
+void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
struct kvm_page_track_notifier_head *head;
struct kvm_page_track_notifier_node *n;
@@ -296,8 +239,69 @@ void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
idx = srcu_read_lock(&head->track_srcu);
hlist_for_each_entry_srcu(n, &head->track_notifier_list, node,
- srcu_read_lock_held(&head->track_srcu))
- if (n->track_flush_slot)
- n->track_flush_slot(kvm, slot, n);
+ srcu_read_lock_held(&head->track_srcu))
+ if (n->track_remove_region)
+ n->track_remove_region(slot->base_gfn, slot->npages, n);
srcu_read_unlock(&head->track_srcu, idx);
}
+
+/*
+ * add guest page to the tracking pool so that corresponding access on that
+ * page will be intercepted.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @gfn: the guest page.
+ */
+int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!slot) {
+ srcu_read_unlock(&kvm->srcu, idx);
+ return -EINVAL;
+ }
+
+ write_lock(&kvm->mmu_lock);
+ __kvm_write_track_add_gfn(kvm, slot, gfn);
+ write_unlock(&kvm->mmu_lock);
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_track_add_gfn);
+
+/*
+ * remove the guest page from the tracking pool which stops the interception
+ * of corresponding access on that page.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @gfn: the guest page.
+ */
+int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!slot) {
+ srcu_read_unlock(&kvm->srcu, idx);
+ return -EINVAL;
+ }
+
+ write_lock(&kvm->mmu_lock);
+ __kvm_write_track_remove_gfn(kvm, slot, gfn);
+ write_unlock(&kvm->mmu_lock);
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_write_track_remove_gfn);
+#endif
diff --git a/arch/x86/kvm/mmu/page_track.h b/arch/x86/kvm/mmu/page_track.h
new file mode 100644
index 000000000000..d4d72ed999b1
--- /dev/null
+++ b/arch/x86/kvm/mmu/page_track.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_PAGE_TRACK_H
+#define __KVM_X86_PAGE_TRACK_H
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_page_track.h>
+
+
+bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
+int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
+int kvm_page_track_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long npages);
+
+void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn);
+void __kvm_write_track_remove_gfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn);
+
+bool kvm_gfn_is_write_tracked(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, gfn_t gfn);
+
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+int kvm_page_track_init(struct kvm *kvm);
+void kvm_page_track_cleanup(struct kvm *kvm);
+
+void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes);
+void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot);
+
+static inline bool kvm_page_track_has_external_user(struct kvm *kvm)
+{
+ return !hlist_empty(&kvm->arch.track_notifier_head.track_notifier_list);
+}
+#else
+static inline int kvm_page_track_init(struct kvm *kvm) { return 0; }
+static inline void kvm_page_track_cleanup(struct kvm *kvm) { }
+
+static inline void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa,
+ const u8 *new, int bytes) { }
+static inline void kvm_page_track_delete_slot(struct kvm *kvm,
+ struct kvm_memory_slot *slot) { }
+
+static inline bool kvm_page_track_has_external_user(struct kvm *kvm) { return false; }
+
+#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */
+
+static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes)
+{
+ __kvm_page_track_write(vcpu->kvm, gpa, new, bytes);
+
+ kvm_mmu_track_write(vcpu, gpa, new, bytes);
+}
+
+#endif /* __KVM_X86_PAGE_TRACK_H */
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 0662e0278e70..c85255073f67 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -338,7 +338,6 @@ retry_walk:
}
#endif
walker->max_level = walker->level;
- ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
/*
* FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
@@ -348,9 +347,21 @@ retry_walk:
nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
pte_access = ~0;
+
+ /*
+ * Queue a page fault for injection if this assertion fails, as callers
+ * assume that walker.fault contains sane info on a walk failure. I.e.
+ * avoid making the situation worse by inducing even worse badness
+ * between when the assertion fails and when KVM kicks the vCPU out to
+ * userspace (because the VM is bugged).
+ */
+ if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm))
+ goto error;
+
++walker->level;
do {
+ struct kvm_memory_slot *slot;
unsigned long host_addr;
pt_access = pte_access;
@@ -381,7 +392,11 @@ retry_walk:
if (unlikely(real_gpa == INVALID_GPA))
return 0;
- host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa),
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa));
+ if (!kvm_is_visible_memslot(slot))
+ goto error;
+
+ host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa),
&walker->pte_writable[walker->level - 1]);
if (unlikely(kvm_is_error_hva(host_addr)))
goto error;
@@ -456,9 +471,6 @@ retry_walk:
goto retry_walk;
}
- pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, walker->pte_access,
- walker->pt_access[walker->level - 1]);
return 1;
error:
@@ -529,8 +541,6 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return false;
- pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access & FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
@@ -638,8 +648,19 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (FNAME(gpte_changed)(vcpu, gw, top_level))
goto out_gpte_changed;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
+ if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
+ goto out_gpte_changed;
+
+ /*
+ * Load a new root and retry the faulting instruction in the extremely
+ * unlikely scenario that the guest root gfn became visible between
+ * loading a dummy root and handling the resulting page fault, e.g. if
+ * userspace create a memslot in the interim.
+ */
+ if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) {
+ kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu);
goto out_gpte_changed;
+ }
for_each_shadow_entry(vcpu, fault->addr, it) {
gfn_t table_gfn;
@@ -758,7 +779,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
struct guest_walker walker;
int r;
- pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
WARN_ON_ONCE(fault->is_tdp);
/*
@@ -773,7 +793,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!r) {
- pgprintk("%s: guest page fault\n", __func__);
if (!fault->prefetch)
kvm_inject_emulated_page_fault(vcpu, &walker.fault);
@@ -837,7 +856,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
{
int offset = 0;
- WARN_ON(sp->role.level != PG_LEVEL_4K);
+ WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
if (PTTYPE == 32)
offset = sp->role.quadrant << SPTE_LEVEL_BITS;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index cf2c6426a6fc..4a599130e9c9 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -61,7 +61,7 @@ static u64 generation_mmio_spte_mask(u64 gen)
{
u64 mask;
- WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
+ WARN_ON_ONCE(gen & ~MMIO_SPTE_GEN_MASK);
mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
@@ -221,8 +221,6 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* shadow pages and unsync'ing pages is not allowed.
*/
if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) {
- pgprintk("%s: found shadow page for %llx, marking ro\n",
- __func__, gfn);
wrprot = true;
pte_access &= ~ACC_WRITE_MASK;
spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
@@ -242,7 +240,7 @@ out:
if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) {
/* Enforced by kvm_mmu_hugepage_adjust. */
- WARN_ON(level > PG_LEVEL_4K);
+ WARN_ON_ONCE(level > PG_LEVEL_4K);
mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 1279db2eab44..a129951c9a88 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -3,6 +3,7 @@
#ifndef KVM_X86_MMU_SPTE_H
#define KVM_X86_MMU_SPTE_H
+#include "mmu.h"
#include "mmu_internal.h"
/*
@@ -236,6 +237,18 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
return to_shadow_page(__pa(sptep));
}
+static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
+{
+ if (kvm_mmu_is_dummy_root(root))
+ return NULL;
+
+ /*
+ * The "root" may be a special root, e.g. a PAE entry, treat it as a
+ * SPTE to ensure any non-PA bits are dropped.
+ */
+ return spte_to_child_sp(root);
+}
+
static inline bool is_mmio_spte(u64 spte)
{
return (spte & shadow_mmio_mask) == shadow_mmio_value &&
@@ -265,13 +278,13 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
static inline bool spte_ad_enabled(u64 spte)
{
- MMU_WARN_ON(!is_shadow_present_pte(spte));
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED;
}
static inline bool spte_ad_need_write_protect(u64 spte)
{
- MMU_WARN_ON(!is_shadow_present_pte(spte));
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
/*
* This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0',
* and non-TDP SPTEs will never set these bits. Optimize for 64-bit
@@ -282,13 +295,13 @@ static inline bool spte_ad_need_write_protect(u64 spte)
static inline u64 spte_shadow_accessed_mask(u64 spte)
{
- MMU_WARN_ON(!is_shadow_present_pte(spte));
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}
static inline u64 spte_shadow_dirty_mask(u64 spte)
{
- MMU_WARN_ON(!is_shadow_present_pte(spte));
+ KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index d2eb0d4f8710..bd30ebfb2f2c 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -39,13 +39,14 @@ void tdp_iter_restart(struct tdp_iter *iter)
void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn)
{
- int root_level = root->role.level;
-
- WARN_ON(root_level < 1);
- WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
+ if (WARN_ON_ONCE(!root || (root->role.level < 1) ||
+ (root->role.level > PT64_ROOT_MAX_LEVEL))) {
+ iter->valid = false;
+ return;
+ }
iter->next_last_level_gfn = next_last_level_gfn;
- iter->root_level = root_level;
+ iter->root_level = root->role.level;
iter->min_level = min_level;
iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
iter->as_id = kvm_mmu_page_as_id(root);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 6250bd3d20c1..6c63f2d1675f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -475,9 +475,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
bool is_leaf = is_present && is_last_spte(new_spte, level);
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
- WARN_ON(level > PT64_ROOT_MAX_LEVEL);
- WARN_ON(level < PG_LEVEL_4K);
- WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
+ WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
+ WARN_ON_ONCE(level < PG_LEVEL_4K);
+ WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
/*
* If this warning were to trigger it would indicate that there was a
@@ -522,9 +522,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* impact the guest since both the former and current SPTEs
* are nonpresent.
*/
- if (WARN_ON(!is_mmio_spte(old_spte) &&
- !is_mmio_spte(new_spte) &&
- !is_removed_spte(new_spte)))
+ if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
+ !is_mmio_spte(new_spte) &&
+ !is_removed_spte(new_spte)))
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
"should not be replaced with another,\n"
"different nonpresent SPTE, unless one or both\n"
@@ -661,7 +661,7 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
* should be used. If operating under the MMU lock in write mode, the
* use of the removed SPTE should not be necessary.
*/
- WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
+ WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
@@ -689,7 +689,7 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
else
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
- for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
+ for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
/*
* Yield if the MMU lock is contended or this thread needs to return control
@@ -709,7 +709,7 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
struct tdp_iter *iter,
bool flush, bool shared)
{
- WARN_ON(iter->yielded);
+ WARN_ON_ONCE(iter->yielded);
/* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn)
@@ -728,7 +728,7 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
rcu_read_lock();
- WARN_ON(iter->gfn > iter->next_last_level_gfn);
+ WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
iter->yielded = true;
}
@@ -1241,7 +1241,7 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte;
/* Huge pages aren't expected to be modified without first being zapped. */
- WARN_ON(pte_huge(range->arg.pte) || range->start + 1 != range->end);
+ WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
if (iter->level != PG_LEVEL_4K ||
!is_shadow_present_pte(iter->old_spte))
@@ -1548,8 +1548,8 @@ retry:
if (!is_shadow_present_pte(iter.old_spte))
continue;
- MMU_WARN_ON(kvm_ad_enabled() &&
- spte_ad_need_write_protect(iter.old_spte));
+ KVM_MMU_WARN_ON(kvm_ad_enabled() &&
+ spte_ad_need_write_protect(iter.old_spte));
if (!(iter.old_spte & dbit))
continue;
@@ -1600,6 +1600,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
shadow_dirty_mask;
struct tdp_iter iter;
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
rcu_read_lock();
tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
@@ -1607,8 +1609,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
if (!mask)
break;
- MMU_WARN_ON(kvm_ad_enabled() &&
- spte_ad_need_write_protect(iter.old_spte));
+ KVM_MMU_WARN_ON(kvm_ad_enabled() &&
+ spte_ad_need_write_protect(iter.old_spte));
if (iter.level > PG_LEVEL_4K ||
!(mask & (1UL << (iter.gfn - gfn))))
@@ -1646,7 +1648,6 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
{
struct kvm_mmu_page *root;
- lockdep_assert_held_write(&kvm->mmu_lock);
for_each_tdp_mmu_root(kvm, root, slot->as_id)
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b38a046690e..6c9c81e82e65 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -25,6 +25,7 @@
#include "tss.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
+#include "mmu/page_track.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"
@@ -12632,6 +12633,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
+ /*
+ * KVM doesn't support moving memslots when there are external page
+ * trackers attached to the VM, i.e. if KVMGT is in use.
+ */
+ if (change == KVM_MR_MOVE && kvm_page_track_has_external_user(kvm))
+ return -EINVAL;
+
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
return -EINVAL;
@@ -12786,6 +12794,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
+ if (change == KVM_MR_DELETE)
+ kvm_page_track_delete_slot(kvm, old);
+
if (!kvm->arch.n_requested_mmu_pages &&
(change == KVM_MR_CREATE || change == KVM_MR_DELETE)) {
unsigned long nr_mmu_pages;
@@ -12802,17 +12813,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
kvm_arch_free_memslot(kvm, old);
}
-void kvm_arch_flush_shadow_all(struct kvm *kvm)
-{
- kvm_mmu_zap_all(kvm);
-}
-
-void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot)
-{
- kvm_page_track_flush_slot(kvm, slot);
-}
-
static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
return (is_guest_mode(vcpu) &&
diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index 4ec85308379a..094fca9b0e73 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -49,22 +49,6 @@
static bool enable_out_of_sync = false;
static int preallocated_oos_pages = 8192;
-static bool intel_gvt_is_valid_gfn(struct intel_vgpu *vgpu, unsigned long gfn)
-{
- struct kvm *kvm = vgpu->vfio_device.kvm;
- int idx;
- bool ret;
-
- if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status))
- return false;
-
- idx = srcu_read_lock(&kvm->srcu);
- ret = kvm_is_visible_gfn(kvm, gfn);
- srcu_read_unlock(&kvm->srcu, idx);
-
- return ret;
-}
-
/*
* validate a gm address and related range size,
* translate it to host gm address
@@ -1161,31 +1145,6 @@ static inline void ppgtt_generate_shadow_entry(struct intel_gvt_gtt_entry *se,
ops->set_pfn(se, s->shadow_page.mfn);
}
-/*
- * Check if can do 2M page
- * @vgpu: target vgpu
- * @entry: target pfn's gtt entry
- *
- * Return 1 if 2MB huge gtt shadowing is possible, 0 if miscondition,
- * negative if found err.
- */
-static int is_2MB_gtt_possible(struct intel_vgpu *vgpu,
- struct intel_gvt_gtt_entry *entry)
-{
- const struct intel_gvt_gtt_pte_ops *ops = vgpu->gvt->gtt.pte_ops;
- kvm_pfn_t pfn;
-
- if (!HAS_PAGE_SIZES(vgpu->gvt->gt->i915, I915_GTT_PAGE_SIZE_2M))
- return 0;
-
- if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status))
- return -EINVAL;
- pfn = gfn_to_pfn(vgpu->vfio_device.kvm, ops->get_pfn(entry));
- if (is_error_noslot_pfn(pfn))
- return -EINVAL;
- return PageTransHuge(pfn_to_page(pfn));
-}
-
static int split_2MB_gtt_entry(struct intel_vgpu *vgpu,
struct intel_vgpu_ppgtt_spt *spt, unsigned long index,
struct intel_gvt_gtt_entry *se)
@@ -1279,7 +1238,7 @@ static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu,
{
const struct intel_gvt_gtt_pte_ops *pte_ops = vgpu->gvt->gtt.pte_ops;
struct intel_gvt_gtt_entry se = *ge;
- unsigned long gfn, page_size = PAGE_SIZE;
+ unsigned long gfn;
dma_addr_t dma_addr;
int ret;
@@ -1291,6 +1250,9 @@ static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu,
switch (ge->type) {
case GTT_TYPE_PPGTT_PTE_4K_ENTRY:
gvt_vdbg_mm("shadow 4K gtt entry\n");
+ ret = intel_gvt_dma_map_guest_page(vgpu, gfn, PAGE_SIZE, &dma_addr);
+ if (ret)
+ return -ENXIO;
break;
case GTT_TYPE_PPGTT_PTE_64K_ENTRY:
gvt_vdbg_mm("shadow 64K gtt entry\n");
@@ -1302,25 +1264,20 @@ static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu,
return split_64KB_gtt_entry(vgpu, spt, index, &se);
case GTT_TYPE_PPGTT_PTE_2M_ENTRY:
gvt_vdbg_mm("shadow 2M gtt entry\n");
- ret = is_2MB_gtt_possible(vgpu, ge);
- if (ret == 0)
+ if (!HAS_PAGE_SIZES(vgpu->gvt->gt->i915, I915_GTT_PAGE_SIZE_2M) ||
+ intel_gvt_dma_map_guest_page(vgpu, gfn,
+ I915_GTT_PAGE_SIZE_2M, &dma_addr))
return split_2MB_gtt_entry(vgpu, spt, index, &se);
- else if (ret < 0)
- return ret;
- page_size = I915_GTT_PAGE_SIZE_2M;
break;
case GTT_TYPE_PPGTT_PTE_1G_ENTRY:
gvt_vgpu_err("GVT doesn't support 1GB entry\n");
return -EINVAL;
default:
GEM_BUG_ON(1);
+ return -EINVAL;
}
- /* direct shadow */
- ret = intel_gvt_dma_map_guest_page(vgpu, gfn, page_size, &dma_addr);
- if (ret)
- return -ENXIO;
-
+ /* Successfully shadowed a 4K or 2M page (without splitting). */
pte_ops->set_pfn(&se, dma_addr >> PAGE_SHIFT);
ppgtt_set_shadow_entry(spt, &se, index);
return 0;
@@ -1329,11 +1286,9 @@ static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu,
static int ppgtt_populate_spt(struct intel_vgpu_ppgtt_spt *spt)
{
struct intel_vgpu *vgpu = spt->vgpu;
- struct intel_gvt *gvt = vgpu->gvt;
- const struct intel_gvt_gtt_pte_ops *ops = gvt->gtt.pte_ops;
struct intel_vgpu_ppgtt_spt *s;
struct intel_gvt_gtt_entry se, ge;
- unsigned long gfn, i;
+ unsigned long i;
int ret;
trace_spt_change(spt->vgpu->id, "born", spt,
@@ -1350,13 +1305,6 @@ static int ppgtt_populate_spt(struct intel_vgpu_ppgtt_spt *spt)
ppgtt_generate_shadow_entry(&se, s, &ge);
ppgtt_set_shadow_entry(spt, &se, i);
} else {
- gfn = ops->get_pfn(&ge);
- if (!intel_gvt_is_valid_gfn(vgpu, gfn)) {
- ops->set_pfn(&se, gvt->gtt.scratch_mfn);
- ppgtt_set_shadow_entry(spt, &se, i);
- continue;
- }
-
ret = ppgtt_populate_shadow_entry(vgpu, spt, i, &ge);
if (ret)
goto fail;
@@ -1845,6 +1793,9 @@ static int shadow_ppgtt_mm(struct intel_vgpu_mm *mm)
if (mm->ppgtt_mm.shadowed)
return 0;
+ if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status))
+ return -EINVAL;
+
mm->ppgtt_mm.shadowed = true;
for (index = 0; index < ARRAY_SIZE(mm->ppgtt_mm.guest_pdps); index++) {
@@ -2331,14 +2282,6 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off,
m.val64 = e.val64;
m.type = e.type;
- /* one PTE update may be issued in multiple writes and the
- * first write may not construct a valid gfn
- */
- if (!intel_gvt_is_valid_gfn(vgpu, gfn)) {
- ops->set_pfn(&m, gvt->gtt.scratch_mfn);
- goto out;
- }
-
ret = intel_gvt_dma_map_guest_page(vgpu, gfn, PAGE_SIZE,
&dma_addr);
if (ret) {
@@ -2355,7 +2298,6 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off,
ops->clear_present(&m);
}
-out:
ggtt_set_guest_entry(ggtt_mm, &e, g_gtt_index);
ggtt_get_host_entry(ggtt_mm, &e, g_gtt_index);
@@ -2876,24 +2818,6 @@ void intel_vgpu_reset_ggtt(struct intel_vgpu *vgpu, bool invalidate_old)
}
/**
- * intel_vgpu_reset_gtt - reset the all GTT related status
- * @vgpu: a vGPU
- *
- * This function is called from vfio core to reset reset all
- * GTT related status, including GGTT, PPGTT, scratch page.
- *
- */
-void intel_vgpu_reset_gtt(struct intel_vgpu *vgpu)
-{
- /* Shadow pages are only created when there is no page
- * table tracking data, so remove page tracking data after
- * removing the shadow pages.
- */
- intel_vgpu_destroy_all_ppgtt_mm(vgpu);
- intel_vgpu_reset_ggtt(vgpu, true);
-}
-
-/**
* intel_gvt_restore_ggtt - restore all vGPU's ggtt entries
* @gvt: intel gvt device
*
diff --git a/drivers/gpu/drm/i915/gvt/gtt.h b/drivers/gpu/drm/i915/gvt/gtt.h
index a3b0f59ec8bd..4cb183e06e95 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.h
+++ b/drivers/gpu/drm/i915/gvt/gtt.h
@@ -224,7 +224,6 @@ void intel_vgpu_reset_ggtt(struct intel_vgpu *vgpu, bool invalidate_old);
void intel_vgpu_invalidate_ppgtt(struct intel_vgpu *vgpu);
int intel_gvt_init_gtt(struct intel_gvt *gvt);
-void intel_vgpu_reset_gtt(struct intel_vgpu *vgpu);
void intel_gvt_clean_gtt(struct intel_gvt *gvt);
struct intel_vgpu_mm *intel_gvt_find_ppgtt_mm(struct intel_vgpu *vgpu,
diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h
index 2d65800d8e93..53a0a42a50db 100644
--- a/drivers/gpu/drm/i915/gvt/gvt.h
+++ b/drivers/gpu/drm/i915/gvt/gvt.h
@@ -34,10 +34,11 @@
#define _GVT_H_
#include <uapi/linux/pci_regs.h>
-#include <linux/kvm_host.h>
#include <linux/vfio.h>
#include <linux/mdev.h>
+#include <asm/kvm_page_track.h>
+
#include "i915_drv.h"
#include "intel_gvt.h"
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index de675d799c7d..aaed3969f204 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -106,12 +106,10 @@ struct gvt_dma {
#define vfio_dev_to_vgpu(vfio_dev) \
container_of((vfio_dev), struct intel_vgpu, vfio_device)
-static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *val, int len,
- struct kvm_page_track_notifier_node *node);
-static void kvmgt_page_track_flush_slot(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- struct kvm_page_track_notifier_node *node);
+static void kvmgt_page_track_write(gpa_t gpa, const u8 *val, int len,
+ struct kvm_page_track_notifier_node *node);
+static void kvmgt_page_track_remove_region(gfn_t gfn, unsigned long nr_pages,
+ struct kvm_page_track_notifier_node *node);
static ssize_t intel_vgpu_show_description(struct mdev_type *mtype, char *buf)
{
@@ -161,8 +159,7 @@ static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
if (npage == 0)
base_page = cur_page;
- else if (base_page + npage != cur_page) {
- gvt_vgpu_err("The pages are not continuous\n");
+ else if (page_to_pfn(base_page) + npage != page_to_pfn(cur_page)) {
ret = -EINVAL;
npage++;
goto err;
@@ -172,7 +169,8 @@ static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
*page = base_page;
return 0;
err:
- gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
+ if (npage)
+ gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
return ret;
}
@@ -352,6 +350,8 @@ __kvmgt_protect_table_find(struct intel_vgpu *info, gfn_t gfn)
{
struct kvmgt_pgfn *p, *res = NULL;
+ lockdep_assert_held(&info->vgpu_lock);
+
hash_for_each_possible(info->ptable, p, hnode, gfn) {
if (gfn == p->gfn) {
res = p;
@@ -654,21 +654,19 @@ out:
static int intel_vgpu_open_device(struct vfio_device *vfio_dev)
{
struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
-
- if (!vgpu->vfio_device.kvm ||
- vgpu->vfio_device.kvm->mm != current->mm) {
- gvt_vgpu_err("KVM is required to use Intel vGPU\n");
- return -ESRCH;
- }
+ int ret;
if (__kvmgt_vgpu_exist(vgpu))
return -EEXIST;
vgpu->track_node.track_write = kvmgt_page_track_write;
- vgpu->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
- kvm_get_kvm(vgpu->vfio_device.kvm);
- kvm_page_track_register_notifier(vgpu->vfio_device.kvm,
- &vgpu->track_node);
+ vgpu->track_node.track_remove_region = kvmgt_page_track_remove_region;
+ ret = kvm_page_track_register_notifier(vgpu->vfio_device.kvm,
+ &vgpu->track_node);
+ if (ret) {
+ gvt_vgpu_err("KVM is required to use Intel vGPU\n");
+ return ret;
+ }
set_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status);
@@ -703,7 +701,6 @@ static void intel_vgpu_close_device(struct vfio_device *vfio_dev)
kvm_page_track_unregister_notifier(vgpu->vfio_device.kvm,
&vgpu->track_node);
- kvm_put_kvm(vgpu->vfio_device.kvm);
kvmgt_protect_table_destroy(vgpu);
gvt_cache_destroy(vgpu);
@@ -1546,95 +1543,70 @@ static struct mdev_driver intel_vgpu_mdev_driver = {
int intel_gvt_page_track_add(struct intel_vgpu *info, u64 gfn)
{
- struct kvm *kvm = info->vfio_device.kvm;
- struct kvm_memory_slot *slot;
- int idx;
+ int r;
if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, info->status))
return -ESRCH;
- idx = srcu_read_lock(&kvm->srcu);
- slot = gfn_to_memslot(kvm, gfn);
- if (!slot) {
- srcu_read_unlock(&kvm->srcu, idx);
- return -EINVAL;
- }
-
- write_lock(&kvm->mmu_lock);
-
if (kvmgt_gfn_is_write_protected(info, gfn))
- goto out;
+ return 0;
- kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
- kvmgt_protect_table_add(info, gfn);
+ r = kvm_write_track_add_gfn(info->vfio_device.kvm, gfn);
+ if (r)
+ return r;
-out:
- write_unlock(&kvm->mmu_lock);
- srcu_read_unlock(&kvm->srcu, idx);
+ kvmgt_protect_table_add(info, gfn);
return 0;
}
int intel_gvt_page_track_remove(struct intel_vgpu *info, u64 gfn)
{
- struct kvm *kvm = info->vfio_device.kvm;
- struct kvm_memory_slot *slot;
- int idx;
+ int r;
if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, info->status))
return -ESRCH;
- idx = srcu_read_lock(&kvm->srcu);
- slot = gfn_to_memslot(kvm, gfn);
- if (!slot) {
- srcu_read_unlock(&kvm->srcu, idx);
- return -EINVAL;
- }
-
- write_lock(&kvm->mmu_lock);
-
if (!kvmgt_gfn_is_write_protected(info, gfn))
- goto out;
+ return 0;
- kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
- kvmgt_protect_table_del(info, gfn);
+ r = kvm_write_track_remove_gfn(info->vfio_device.kvm, gfn);
+ if (r)
+ return r;
-out:
- write_unlock(&kvm->mmu_lock);
- srcu_read_unlock(&kvm->srcu, idx);
+ kvmgt_protect_table_del(info, gfn);
return 0;
}
-static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *val, int len,
- struct kvm_page_track_notifier_node *node)
+static void kvmgt_page_track_write(gpa_t gpa, const u8 *val, int len,
+ struct kvm_page_track_notifier_node *node)
{
struct intel_vgpu *info =
container_of(node, struct intel_vgpu, track_node);
- if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
+ mutex_lock(&info->vgpu_lock);
+
+ if (kvmgt_gfn_is_write_protected(info, gpa >> PAGE_SHIFT))
intel_vgpu_page_track_handler(info, gpa,
(void *)val, len);
+
+ mutex_unlock(&info->vgpu_lock);
}
-static void kvmgt_page_track_flush_slot(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- struct kvm_page_track_notifier_node *node)
+static void kvmgt_page_track_remove_region(gfn_t gfn, unsigned long nr_pages,
+ struct kvm_page_track_notifier_node *node)
{
- int i;
- gfn_t gfn;
+ unsigned long i;
struct intel_vgpu *info =
container_of(node, struct intel_vgpu, track_node);
- write_lock(&kvm->mmu_lock);
- for (i = 0; i < slot->npages; i++) {
- gfn = slot->base_gfn + i;
- if (kvmgt_gfn_is_write_protected(info, gfn)) {
- kvm_slot_page_track_remove_page(kvm, slot, gfn,
- KVM_PAGE_TRACK_WRITE);
- kvmgt_protect_table_del(info, gfn);
- }
+ mutex_lock(&info->vgpu_lock);
+
+ for (i = 0; i < nr_pages; i++) {
+ if (kvmgt_gfn_is_write_protected(info, gfn + i))
+ kvmgt_protect_table_del(info, gfn + i);
}
- write_unlock(&kvm->mmu_lock);
+
+ mutex_unlock(&info->vgpu_lock);
}
void intel_vgpu_detach_regions(struct intel_vgpu *vgpu)
diff --git a/drivers/gpu/drm/i915/gvt/page_track.c b/drivers/gpu/drm/i915/gvt/page_track.c
index df34e73cba41..60a65435556d 100644
--- a/drivers/gpu/drm/i915/gvt/page_track.c
+++ b/drivers/gpu/drm/i915/gvt/page_track.c
@@ -162,13 +162,9 @@ int intel_vgpu_page_track_handler(struct intel_vgpu *vgpu, u64 gpa,
struct intel_vgpu_page_track *page_track;
int ret = 0;
- mutex_lock(&vgpu->vgpu_lock);
-
page_track = intel_vgpu_find_page_track(vgpu, gpa >> PAGE_SHIFT);
- if (!page_track) {
- ret = -ENXIO;
- goto out;
- }
+ if (!page_track)
+ return -ENXIO;
if (unlikely(vgpu->failsafe)) {
/* Remove write protection to prevent furture traps. */
@@ -179,7 +175,5 @@ int intel_vgpu_page_track_handler(struct intel_vgpu *vgpu, u64 gpa,
gvt_err("guest page write error, gpa %llx\n", gpa);
}
-out:
- mutex_unlock(&vgpu->vgpu_lock);
return ret;
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1b583f35547e..fb6c6109fdca 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -867,6 +867,25 @@ static inline void kvm_vm_bugged(struct kvm *kvm)
unlikely(__ret); \
})
+/*
+ * Note, "data corruption" refers to corruption of host kernel data structures,
+ * not guest data. Guest data corruption, suspected or confirmed, that is tied
+ * and contained to a single VM should *never* BUG() and potentially panic the
+ * host, i.e. use this variant of KVM_BUG() if and only if a KVM data structure
+ * is corrupted and that corruption can have a cascading effect to other parts
+ * of the hosts and/or to other VMs.
+ */
+#define KVM_BUG_ON_DATA_CORRUPTION(cond, kvm) \
+({ \
+ bool __ret = !!(cond); \
+ \
+ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) \
+ BUG_ON(__ret); \
+ else if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged)) \
+ kvm_vm_bugged(kvm); \
+ unlikely(__ret); \
+})
+
static inline void kvm_vcpu_srcu_read_lock(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_PROVE_RCU