From 0dbd054699661dfffbc1c148664f8d03fd132569 Mon Sep 17 00:00:00 2001
From: Kunwu Chan <chentao@kylinos.cn>
Date: Tue, 16 Jan 2024 18:00:25 +0800
Subject: KVM: x86/mmu: Use KMEM_CACHE instead of kmem_cache_create()

Use the new KMEM_CACHE() macro instead of direct kmem_cache_create
to simplify the creation of SLAB caches.

Note, KMEM_CACHE() uses the required alignment of the struct, '8' as the
alignment, whereas KVM's existing code passes '0'.  In the end, the two
values yield the same result as x86's minimum slab alignment is also '8'
(which is not at all coincidental).

Signed-off-by: Kunwu Chan <chentao@kylinos.cn>
Link: https://lore.kernel.org/r/20240116100025.95702-1-chentao@kylinos.cn
[sean: call out alignment behavior]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 2d6cdeab1f8a..3c193b096b45 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6997,9 +6997,7 @@ int kvm_mmu_vendor_module_init(void)
 
 	kvm_mmu_reset_all_pte_masks();
 
-	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
-					    sizeof(struct pte_list_desc),
-					    0, SLAB_ACCOUNT, NULL);
+	pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT);
 	if (!pte_list_desc_cache)
 		goto out;
 
-- 
cgit 


From 474b99ed703b7e4031f3925adacf19e7c8af2075 Mon Sep 17 00:00:00 2001
From: Mingwei Zhang <mizhang@google.com>
Date: Fri, 2 Feb 2024 16:23:40 -0800
Subject: KVM: x86/mmu: Don't acquire mmu_lock when using indirect_shadow_pages
 as a heuristic

Drop KVM's completely pointless acquisition of mmu_lock when deciding
whether or not to unprotect any shadow pages residing at the gfn before
resuming the guest to let it retry an instruction that KVM failed to
emulated.  In this case, indirect_shadow_pages is used as a coarse-grained
heuristic to check if there is any chance of there being a relevant shadow
page to unprotected.  But acquiring mmu_lock largely defeats any benefit
to the heuristic, as taking mmu_lock for write is likely far more costly
to the VM as a whole than unnecessarily walking mmu_page_hash.

Furthermore, the current code is already prone to false negatives and
false positives, as it drops mmu_lock before checking the flag and
unprotecting shadow pages.  And as evidenced by the lack of bug reports,
neither false positives nor false negatives are problematic.  A false
positive simply means that KVM will try to unprotect shadow pages that
have already been zapped.  And a false negative means that KVM will
resume the guest without unprotecting the gfn, i.e. if a shadow page was
_just_ created, the vCPU will hit the same page fault and do the whole
dance all over again, and detect and unprotect the shadow page the second
time around (or not, if something else zaps it first).

Reported-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Mingwei Zhang <mizhang@google.com>
[sean: drop READ_ONCE() and comment change, rewrite changelog]
Link: https://lore.kernel.org/r/20240203002343.383056-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/x86.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 363b1c080205..7015f8786397 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8789,13 +8789,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 
 	/* The instructions are well-emulated on direct mmu. */
 	if (vcpu->arch.mmu->root_role.direct) {
-		unsigned int indirect_shadow_pages;
-
-		write_lock(&vcpu->kvm->mmu_lock);
-		indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
-		write_unlock(&vcpu->kvm->mmu_lock);
-
-		if (indirect_shadow_pages)
+		if (vcpu->kvm->arch.indirect_shadow_pages)
 			kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 
 		return true;
-- 
cgit 


From 515c18a64e704bc932c5a64e25aaeb712252cf0b Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 2 Feb 2024 16:23:41 -0800
Subject: KVM: x86: Drop dedicated logic for direct MMUs in
 reexecute_instruction()

Now that KVM doesn't pointlessly acquire mmu_lock for direct MMUs, drop
the dedicated path entirely and always query indirect_shadow_pages when
deciding whether or not to try unprotecting the gfn.  For indirect, a.k.a.
shadow MMUs, checking indirect_shadow_pages is harmless; unless *every*
shadow page was somehow zapped while KVM was attempting to emulate the
instruction, indirect_shadow_pages is guaranteed to be non-zero.

Well, unless the instruction used a direct hugepage with 2-level paging
for its code page, but in that case, there's obviously nothing to
unprotect.  And in the extremely unlikely case all shadow pages were
zapped, there's again obviously nothing to unprotect.

Link: https://lore.kernel.org/r/20240203002343.383056-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/x86.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7015f8786397..ac3ea5829df6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8787,27 +8787,27 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 
 	kvm_release_pfn_clean(pfn);
 
-	/* The instructions are well-emulated on direct mmu. */
-	if (vcpu->arch.mmu->root_role.direct) {
-		if (vcpu->kvm->arch.indirect_shadow_pages)
-			kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
-
-		return true;
-	}
-
 	/*
-	 * if emulation was due to access to shadowed page table
-	 * and it failed try to unshadow page and re-enter the
-	 * guest to let CPU execute the instruction.
+	 * If emulation may have been triggered by a write to a shadowed page
+	 * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
+	 * guest to let the CPU re-execute the instruction in the hope that the
+	 * CPU can cleanly execute the instruction that KVM failed to emulate.
 	 */
-	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+	if (vcpu->kvm->arch.indirect_shadow_pages)
+		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 
 	/*
-	 * If the access faults on its page table, it can not
-	 * be fixed by unprotecting shadow page and it should
-	 * be reported to userspace.
+	 * If the failed instruction faulted on an access to page tables that
+	 * are used to translate any part of the instruction, KVM can't resolve
+	 * the issue by unprotecting the gfn, as zapping the shadow page will
+	 * result in the instruction taking a !PRESENT page fault and thus put
+	 * the vCPU into an infinite loop of page faults.  E.g. KVM will create
+	 * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and
+	 * then zap the SPTE to unprotect the gfn, and then do it all over
+	 * again.  Report the error to userspace.
 	 */
-	return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
+	return vcpu->arch.mmu->root_role.direct ||
+	       !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
 }
 
 static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
-- 
cgit 


From dfeef3d3f310ee464493e848383c4e9fe879089a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 2 Feb 2024 16:23:42 -0800
Subject: KVM: x86: Drop superfluous check on direct MMU vs. WRITE_PF_TO_SP
 flag

Remove reexecute_instruction()'s final check on the MMU being direct, as
EMULTYPE_WRITE_PF_TO_SP is only ever set if the MMU is indirect, i.e. is a
shadow MMU.  Prior to commit 93c05d3ef252 ("KVM: x86: improve
reexecute_instruction"), the flag simply didn't exist (and KVM actually
returned "true" unconditionally for both types of MMUs).  I.e. the
explicit check for a direct MMU is simply leftover artifact from old code.

Link: https://lore.kernel.org/r/20240203002343.383056-4-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/x86.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ac3ea5829df6..48ec889452e2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8806,8 +8806,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	 * then zap the SPTE to unprotect the gfn, and then do it all over
 	 * again.  Report the error to userspace.
 	 */
-	return vcpu->arch.mmu->root_role.direct ||
-	       !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
+	return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
 }
 
 static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
-- 
cgit 


From 8ca983631f3c4ba16ac70d3310a31316e06f9e36 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 10 Jan 2024 18:00:41 -0800
Subject: KVM: x86/mmu: Zap invalidated TDP MMU roots at 4KiB granularity

Zap invalidated TDP MMU roots at maximum granularity, i.e. with more
frequent conditional resched checkpoints, in order to avoid running for an
extended duration (milliseconds, or worse) without honoring a reschedule
request.  And for kernels running with full or real-time preempt models,
zapping at 4KiB granularity also provides significantly reduced latency
for other tasks that are contending for mmu_lock (which isn't necessarily
an overall win for KVM, but KVM should do its best to honor the kernel's
preemption model).

To keep KVM's assertion that zapping at 1GiB granularity is functionally
ok, which is the main reason 1GiB was selected in the past, skip straight
to zapping at 1GiB if KVM is configured to prove the MMU.  Zapping roots
is far more common than a vCPU replacing a 1GiB page table with a hugepage,
e.g. generally happens multiple times during boot, and so keeping the test
coverage provided by root zaps is desirable, just not for production.

Cc: David Matlack <dmatlack@google.com>
Cc: Pattara Teerapong <pteerapong@google.com>
Link: https://lore.kernel.org/r/20240111020048.844847-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 6ae19b4ee5b1..372da098d3ce 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -734,15 +734,26 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
 	rcu_read_lock();
 
 	/*
-	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
-	 * split the zap into two passes.  On the first pass, zap at the 1gb
-	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
-	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
-	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
+	 * Zap roots in multiple passes of decreasing granularity, i.e. zap at
+	 * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
+	 * preempt models) or mmu_lock contention (full or real-time models).
+	 * Zapping at finer granularity marginally increases the total time of
+	 * the zap, but in most cases the zap itself isn't latency sensitive.
 	 *
-	 * Because zapping a SP recurses on its children, stepping down to
-	 * PG_LEVEL_4K in the iterator itself is unnecessary.
+	 * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
+	 * in order to mimic the page fault path, which can replace a 1GiB page
+	 * table with an equivalent 1GiB hugepage, i.e. can get saddled with
+	 * zapping a 1GiB region that's fully populated with 4KiB SPTEs.  This
+	 * allows verifying that KVM can safely zap 1GiB regions, e.g. without
+	 * inducing RCU stalls, without relying on a relatively rare event
+	 * (zapping roots is orders of magnitude more common).  Note, because
+	 * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
+	 * in the iterator itself is unnecessary.
 	 */
+	if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
+		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
+		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
+	}
 	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
 	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
 
-- 
cgit 


From fcdffe97f80e6fb488f6b5c6bd38f6cd899944ab Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 10 Jan 2024 18:00:42 -0800
Subject: KVM: x86/mmu: Don't do TLB flush when zappings SPTEs in invalid roots

Don't force a TLB flush when zapping SPTEs in invalid roots as vCPUs
can't be actively using invalid roots (zapping SPTEs in invalid roots is
necessary only to ensure KVM doesn't mark a page accessed/dirty after it
is freed by the primary MMU).

Link: https://lore.kernel.org/r/20240111020048.844847-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 372da098d3ce..68920877370b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -811,7 +811,13 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
 			continue;
 
 		tdp_mmu_iter_set_spte(kvm, &iter, 0);
-		flush = true;
+
+		/*
+		 * Zappings SPTEs in invalid roots doesn't require a TLB flush,
+		 * see kvm_tdp_mmu_zap_invalidated_roots() for details.
+		 */
+		if (!root->role.invalid)
+			flush = true;
 	}
 
 	rcu_read_unlock();
-- 
cgit 


From 6577f1efdff443277b19c0fbe4b933404e7c84e6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 10 Jan 2024 18:00:43 -0800
Subject: KVM: x86/mmu: Allow passing '-1' for "all" as_id for TDP MMU
 iterators

Modify for_each_tdp_mmu_root() and __for_each_tdp_mmu_root_yield_safe() to
accept -1 for _as_id to mean "process all memslot address spaces".  That
way code that wants to process both SMM and !SMM doesn't need to iterate
over roots twice (and likely copy+paste code in the process).

Deliberately don't cast _as_id to an "int", just in case not casting helps
the compiler elide the "_as_id >=0" check when being passed an unsigned
value, e.g. from a memslot.

No functional change intended.

Link: https://lore.kernel.org/r/20240111020048.844847-4-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 68920877370b..60fff2aad59e 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -149,11 +149,11 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
  * If shared is set, this function is operating under the MMU lock in read
  * mode.
  */
-#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\
-	for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid);	\
-	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;	\
-	     _root = tdp_mmu_next_root(_kvm, _root, _only_valid))	\
-		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)	\
+	for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid);		\
+	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;		\
+	     _root = tdp_mmu_next_root(_kvm, _root, _only_valid))		\
+		if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) {	\
 		} else
 
 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)	\
@@ -171,10 +171,10 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
  * Holding mmu_lock for write obviates the need for RCU protection as the list
  * is guaranteed to be stable.
  */
-#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
-	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
-		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
-		    kvm_mmu_page_as_id(_root) != _as_id) {		\
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
+	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)		\
+		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&		\
+		    _as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) {	\
 		} else
 
 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
-- 
cgit 


From 99b85fda91b164b91a0d4e0aae376f32dc38d59c Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 10 Jan 2024 18:00:44 -0800
Subject: KVM: x86/mmu: Skip invalid roots when zapping leaf SPTEs for GFN
 range

When zapping a GFN in response to an APICv or MTRR change, don't zap SPTEs
for invalid roots as KVM only needs to ensure the guest can't use stale
mappings for the GFN.  Unlike kvm_tdp_mmu_unmap_gfn_range(), which must
zap "unreachable" SPTEs to ensure KVM doesn't mark a page accessed/dirty,
kvm_tdp_mmu_zap_leafs() isn't used (and isn't intended to be used) to
handle freeing of host memory.

Link: https://lore.kernel.org/r/20240111020048.844847-5-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 60fff2aad59e..1a9c16e5c287 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -830,16 +830,16 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
 }
 
 /*
- * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
- * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
- * more SPTEs were zapped since the MMU lock was last acquired.
+ * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID** roots.
+ * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
+ * one or more SPTEs were zapped since the MMU lock was last acquired.
  */
 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
 {
 	struct kvm_mmu_page *root;
 
 	lockdep_assert_held_write(&kvm->mmu_lock);
-	for_each_tdp_mmu_root_yield_safe(kvm, root)
+	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
 		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
 
 	return flush;
-- 
cgit 


From d746182337c205660fd4d8eaa5fdc4f4e8320b9a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 10 Jan 2024 18:00:45 -0800
Subject: KVM: x86/mmu: Skip invalid TDP MMU roots when write-protecting SPTEs

When write-protecting SPTEs, don't process invalid roots as invalid roots
are unreachable, i.e. can't be used to access guest memory and thus don't
need to be write-protected.

Note, this is *almost* a nop for kvm_tdp_mmu_clear_dirty_pt_masked(),
which is called under slots_lock, i.e. is mutually exclusive with
kvm_mmu_zap_all_fast().  But it's possible for something other than the
"fast zap" thread to grab a reference to an invalid root and thus keep a
root alive (but completely empty) after kvm_mmu_zap_all_fast() completes.

The kvm_tdp_mmu_write_protect_gfn() case is more interesting as KVM write-
protects SPTEs for reasons other than dirty logging, e.g. if a KVM creates
a SPTE for a nested VM while a fast zap is in-progress.

Add another TDP MMU iterator to visit only valid roots, and
opportunistically convert kvm_tdp_mmu_get_vcpu_root_hpa() to said iterator.

Link: https://lore.kernel.org/r/20240111020048.844847-6-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 1a9c16e5c287..e0a8343f66dc 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -171,12 +171,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
  * Holding mmu_lock for write obviates the need for RCU protection as the list
  * is guaranteed to be stable.
  */
-#define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
+#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid)		\
 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)		\
 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&		\
-		    _as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) {	\
+		    ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
+		     ((_only_valid) && (_root)->role.invalid))) {		\
 		} else
 
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
+	__for_each_tdp_mmu_root(_kvm, _root, _as_id, false)
+
+#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id)		\
+	__for_each_tdp_mmu_root(_kvm, _root, _as_id, true)
+
 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_page *sp;
@@ -224,11 +231,8 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 
 	lockdep_assert_held_write(&kvm->mmu_lock);
 
-	/*
-	 * Check for an existing root before allocating a new one.  Note, the
-	 * role check prevents consuming an invalid root.
-	 */
-	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+	/* Check for an existing root before allocating a new one. */
+	for_each_valid_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
 		if (root->role.word == role.word &&
 		    kvm_tdp_mmu_get_root(root))
 			goto out;
@@ -1639,7 +1643,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 {
 	struct kvm_mmu_page *root;
 
-	for_each_tdp_mmu_root(kvm, root, slot->as_id)
+	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
 }
 
@@ -1757,7 +1761,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
 	bool spte_set = false;
 
 	lockdep_assert_held_write(&kvm->mmu_lock);
-	for_each_tdp_mmu_root(kvm, root, slot->as_id)
+	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
 
 	return spte_set;
-- 
cgit 


From f5238c2a60f1e0eb48ce21037bce6f4781afa37f Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 10 Jan 2024 18:00:46 -0800
Subject: KVM: x86/mmu: Check for usable TDP MMU root while holding mmu_lock
 for read

When allocating a new TDP MMU root, check for a usable root while holding
mmu_lock for read and only acquire mmu_lock for write if a new root needs
to be created.  There is no need to serialize other MMU operations if a
vCPU is simply grabbing a reference to an existing root, holding mmu_lock
for write is "necessary" (spoiler alert, it's not strictly necessary) only
to ensure KVM doesn't end up with duplicate roots.

Allowing vCPUs to get "new" roots in parallel is beneficial to VM boot and
to setups that frequently delete memslots, i.e. which force all vCPUs to
reload all roots.

Link: https://lore.kernel.org/r/20240111020048.844847-7-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c     |  8 +++----
 arch/x86/kvm/mmu/tdp_mmu.c | 60 ++++++++++++++++++++++++++++++++++++++--------
 arch/x86/kvm/mmu/tdp_mmu.h |  2 +-
 3 files changed, 55 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3c193b096b45..c9150080fbd2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3693,15 +3693,15 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 	unsigned i;
 	int r;
 
+	if (tdp_mmu_enabled)
+		return kvm_tdp_mmu_alloc_root(vcpu);
+
 	write_lock(&vcpu->kvm->mmu_lock);
 	r = make_mmu_pages_available(vcpu);
 	if (r < 0)
 		goto out_unlock;
 
-	if (tdp_mmu_enabled) {
-		root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
-		mmu->root.hpa = root;
-	} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
+	if (shadow_root_level >= PT64_ROOT_4LEVEL) {
 		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
 		mmu->root.hpa = root;
 	} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index e0a8343f66dc..9a8250a14fc1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -223,21 +223,52 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
 }
 
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
+static struct kvm_mmu_page *kvm_tdp_mmu_try_get_root(struct kvm_vcpu *vcpu)
 {
 	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+	int as_id = kvm_mmu_role_as_id(role);
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_mmu_page *root;
 
-	lockdep_assert_held_write(&kvm->mmu_lock);
-
-	/* Check for an existing root before allocating a new one. */
-	for_each_valid_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
-		if (root->role.word == role.word &&
-		    kvm_tdp_mmu_get_root(root))
-			goto out;
+	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
+		if (root->role.word == role.word)
+			return root;
 	}
 
+	return NULL;
+}
+
+int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu *mmu = vcpu->arch.mmu;
+	union kvm_mmu_page_role role = mmu->root_role;
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_mmu_page *root;
+
+	/*
+	 * Check for an existing root while holding mmu_lock for read to avoid
+	 * unnecessary serialization if multiple vCPUs are loading a new root.
+	 * E.g. when bringing up secondary vCPUs, KVM will already have created
+	 * a valid root on behalf of the primary vCPU.
+	 */
+	read_lock(&kvm->mmu_lock);
+	root = kvm_tdp_mmu_try_get_root(vcpu);
+	read_unlock(&kvm->mmu_lock);
+
+	if (root)
+		goto out;
+
+	write_lock(&kvm->mmu_lock);
+
+	/*
+	 * Recheck for an existing root after acquiring mmu_lock for write.  It
+	 * is possible a new usable root was created between dropping mmu_lock
+	 * (for read) and acquiring it for write.
+	 */
+	root = kvm_tdp_mmu_try_get_root(vcpu);
+	if (root)
+		goto out_unlock;
+
 	root = tdp_mmu_alloc_sp(vcpu);
 	tdp_mmu_init_sp(root, NULL, 0, role);
 
@@ -254,8 +285,17 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 
+out_unlock:
+	write_unlock(&kvm->mmu_lock);
 out:
-	return __pa(root->spt);
+	/*
+	 * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
+	 * and actually consuming the root if it's invalidated after dropping
+	 * mmu_lock, and the root can't be freed as this vCPU holds a reference.
+	 */
+	mmu->root.hpa = __pa(root->spt);
+	mmu->root.pgd = 0;
+	return 0;
 }
 
 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
@@ -917,7 +957,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
  * the VM is being destroyed).
  *
  * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
- * See kvm_tdp_mmu_get_vcpu_root_hpa().
+ * See kvm_tdp_mmu_alloc_root().
  */
 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 20d97aa46c49..6e1ea04ca885 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -10,7 +10,7 @@
 void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
 
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
+int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu);
 
 __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
 {
-- 
cgit 


From dab285e4ec736d964cfa6c7fd6eebd22666b5ebc Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 10 Jan 2024 18:00:47 -0800
Subject: KVM: x86/mmu: Alloc TDP MMU roots while holding mmu_lock for read

Allocate TDP MMU roots while holding mmu_lock for read, and instead use
tdp_mmu_pages_lock to guard against duplicate roots.  This allows KVM to
create new roots without forcing kvm_tdp_mmu_zap_invalidated_roots() to
yield, e.g. allows vCPUs to load new roots after memslot deletion without
forcing the zap thread to detect contention and yield (or complete if the
kernel isn't preemptible).

Note, creating a new TDP MMU root as an mmu_lock reader is safe for two
reasons: (1) paths that must guarantee all roots/SPTEs are *visited* take
mmu_lock for write and so are still mutually exclusive, e.g. mmu_notifier
invalidations, and (2) paths that require all roots/SPTEs to *observe*
some given state without holding mmu_lock for write must ensure freshness
through some other means, e.g. toggling dirty logging must first wait for
SRCU readers to recognize the memslot flags change before processing
existing roots/SPTEs.

Link: https://lore.kernel.org/r/20240111020048.844847-8-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 55 +++++++++++++++++++---------------------------
 1 file changed, 22 insertions(+), 33 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 9a8250a14fc1..d078157e62aa 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -223,51 +223,42 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
 }
 
-static struct kvm_mmu_page *kvm_tdp_mmu_try_get_root(struct kvm_vcpu *vcpu)
-{
-	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
-	int as_id = kvm_mmu_role_as_id(role);
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_mmu_page *root;
-
-	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
-		if (root->role.word == role.word)
-			return root;
-	}
-
-	return NULL;
-}
-
 int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
 	union kvm_mmu_page_role role = mmu->root_role;
+	int as_id = kvm_mmu_role_as_id(role);
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_mmu_page *root;
 
 	/*
-	 * Check for an existing root while holding mmu_lock for read to avoid
+	 * Check for an existing root before acquiring the pages lock to avoid
 	 * unnecessary serialization if multiple vCPUs are loading a new root.
 	 * E.g. when bringing up secondary vCPUs, KVM will already have created
 	 * a valid root on behalf of the primary vCPU.
 	 */
 	read_lock(&kvm->mmu_lock);
-	root = kvm_tdp_mmu_try_get_root(vcpu);
-	read_unlock(&kvm->mmu_lock);
 
-	if (root)
-		goto out;
+	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
+		if (root->role.word == role.word)
+			goto out_read_unlock;
+	}
 
-	write_lock(&kvm->mmu_lock);
+	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 
 	/*
-	 * Recheck for an existing root after acquiring mmu_lock for write.  It
-	 * is possible a new usable root was created between dropping mmu_lock
-	 * (for read) and acquiring it for write.
+	 * Recheck for an existing root after acquiring the pages lock, another
+	 * vCPU may have raced ahead and created a new usable root.  Manually
+	 * walk the list of roots as the standard macros assume that the pages
+	 * lock is *not* held.  WARN if grabbing a reference to a usable root
+	 * fails, as the last reference to a root can only be put *after* the
+	 * root has been invalidated, which requires holding mmu_lock for write.
 	 */
-	root = kvm_tdp_mmu_try_get_root(vcpu);
-	if (root)
-		goto out_unlock;
+	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
+		if (root->role.word == role.word &&
+		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
+			goto out_spin_unlock;
+	}
 
 	root = tdp_mmu_alloc_sp(vcpu);
 	tdp_mmu_init_sp(root, NULL, 0, role);
@@ -280,14 +271,12 @@ int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
 	 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
 	 */
 	refcount_set(&root->tdp_mmu_root_count, 2);
-
-	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
-	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 
-out_unlock:
-	write_unlock(&kvm->mmu_lock);
-out:
+out_spin_unlock:
+	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+out_read_unlock:
+	read_unlock(&kvm->mmu_lock);
 	/*
 	 * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
 	 * and actually consuming the root if it's invalidated after dropping
-- 
cgit 


From 576a15de8d299d9d225b86504547ff6498bc2eeb Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 10 Jan 2024 18:00:48 -0800
Subject: KVM: x86/mmu: Free TDP MMU roots while holding mmy_lock for read

Free TDP MMU roots from vCPU context while holding mmu_lock for read, it
is completely legal to invoke kvm_tdp_mmu_put_root() as a reader.  This
eliminates the last mmu_lock writer in the TDP MMU's "fast zap" path
after requesting vCPUs to reload roots, i.e. allows KVM to zap invalidated
roots, free obsolete roots, and allocate new roots in parallel.

On large VMs, e.g. 100+ vCPUs, allowing the bulk of the "fast zap"
operation to run in parallel with freeing and allocating roots reduces the
worst case latency for a vCPU to reload a root from 2-3ms to <100us.

Link: https://lore.kernel.org/r/20240111020048.844847-9-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c9150080fbd2..e5e2af69e24d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3575,10 +3575,14 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 	if (WARN_ON_ONCE(!sp))
 		return;
 
-	if (is_tdp_mmu_page(sp))
+	if (is_tdp_mmu_page(sp)) {
+		lockdep_assert_held_read(&kvm->mmu_lock);
 		kvm_tdp_mmu_put_root(kvm, sp);
-	else if (!--sp->root_count && sp->role.invalid)
-		kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+	} else {
+		lockdep_assert_held_write(&kvm->mmu_lock);
+		if (!--sp->root_count && sp->role.invalid)
+			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+	}
 
 	*root_hpa = INVALID_PAGE;
 }
@@ -3587,6 +3591,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
 			ulong roots_to_free)
 {
+	bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct;
 	int i;
 	LIST_HEAD(invalid_list);
 	bool free_active_root;
@@ -3609,7 +3614,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
 			return;
 	}
 
-	write_lock(&kvm->mmu_lock);
+	if (is_tdp_mmu)
+		read_lock(&kvm->mmu_lock);
+	else
+		write_lock(&kvm->mmu_lock);
 
 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 		if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
@@ -3635,8 +3643,13 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
 		mmu->root.pgd = 0;
 	}
 
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
-	write_unlock(&kvm->mmu_lock);
+	if (is_tdp_mmu) {
+		read_unlock(&kvm->mmu_lock);
+		WARN_ON_ONCE(!list_empty(&invalid_list));
+	} else {
+		kvm_mmu_commit_zap_page(kvm, &invalid_list);
+		write_unlock(&kvm->mmu_lock);
+	}
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
 
-- 
cgit 


From a364c014a2c1ad6e011bc5fdb8afb9d4ba316956 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@google.com>
Date: Tue, 13 Feb 2024 11:23:40 -0800
Subject: kvm/x86: allocate the write-tracking metadata on-demand

The write-track is used externally only by the gpu/drm/i915 driver.
Currently, it is always enabled, if a kernel has been compiled with this
driver.

Enabling the write-track mechanism adds a two-byte overhead per page across
all memory slots. It isn't significant for regular VMs. However in gVisor,
where the entire process virtual address space is mapped into the VM, even
with a 39-bit address space, the overhead amounts to 256MB.

Rework the write-tracking mechanism to enable it on-demand in
kvm_page_track_register_notifier.

Here is Sean's comment about the locking scheme:

The only potential hiccup would be if taking slots_arch_lock would
deadlock, but it should be impossible for slots_arch_lock to be taken in
any other path that involves VFIO and/or KVMGT *and* can be coincident.
Except for kvm_arch_destroy_vm() (which deletes KVM's internal
memslots), slots_arch_lock is taken only through KVM ioctls(), and the
caller of kvm_page_track_register_notifier() *must* hold a reference to
the VM.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sean Christopherson <seanjc@google.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Andrei Vagin <avagin@google.com>
Link: https://lore.kernel.org/r/20240213192340.2023366-1-avagin@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm_host.h |  9 ++++++
 arch/x86/kvm/mmu/page_track.c   | 68 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b5b2d0fde579..7d33a2605ad5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1466,6 +1466,15 @@ struct kvm_arch {
 	 */
 	bool shadow_root_allocated;
 
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+	/*
+	 * If set, the VM has (or had) an external write tracking user, and
+	 * thus all write tracking metadata has been allocated, even if KVM
+	 * itself isn't using write tracking.
+	 */
+	bool external_write_tracking_enabled;
+#endif
+
 #if IS_ENABLED(CONFIG_HYPERV)
 	hpa_t	hv_root_tdp;
 	spinlock_t hv_root_tdp_lock;
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index c87da11f3a04..f6448284c18e 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -20,10 +20,23 @@
 #include "mmu_internal.h"
 #include "page_track.h"
 
+static bool kvm_external_write_tracking_enabled(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+	/*
+	 * Read external_write_tracking_enabled before related pointers.  Pairs
+	 * with the smp_store_release in kvm_page_track_write_tracking_enable().
+	 */
+	return smp_load_acquire(&kvm->arch.external_write_tracking_enabled);
+#else
+	return false;
+#endif
+}
+
 bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
 {
-	return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
-	       !tdp_enabled || kvm_shadow_root_allocated(kvm);
+	return kvm_external_write_tracking_enabled(kvm) ||
+	       kvm_shadow_root_allocated(kvm) || !tdp_enabled;
 }
 
 void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
@@ -153,6 +166,50 @@ int kvm_page_track_init(struct kvm *kvm)
 	return init_srcu_struct(&head->track_srcu);
 }
 
+static int kvm_enable_external_write_tracking(struct kvm *kvm)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *slot;
+	int r = 0, i, bkt;
+
+	mutex_lock(&kvm->slots_arch_lock);
+
+	/*
+	 * Check for *any* write tracking user (not just external users) under
+	 * lock.  This avoids unnecessary work, e.g. if KVM itself is using
+	 * write tracking, or if two external users raced when registering.
+	 */
+	if (kvm_page_track_write_tracking_enabled(kvm))
+		goto out_success;
+
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+		slots = __kvm_memslots(kvm, i);
+		kvm_for_each_memslot(slot, bkt, slots) {
+			/*
+			 * Intentionally do NOT free allocations on failure to
+			 * avoid having to track which allocations were made
+			 * now versus when the memslot was created.  The
+			 * metadata is guaranteed to be freed when the slot is
+			 * freed, and will be kept/used if userspace retries
+			 * the failed ioctl() instead of killing the VM.
+			 */
+			r = kvm_page_track_write_tracking_alloc(slot);
+			if (r)
+				goto out_unlock;
+		}
+	}
+
+out_success:
+	/*
+	 * Ensure that external_write_tracking_enabled becomes true strictly
+	 * after all the related pointers are set.
+	 */
+	smp_store_release(&kvm->arch.external_write_tracking_enabled, true);
+out_unlock:
+	mutex_unlock(&kvm->slots_arch_lock);
+	return r;
+}
+
 /*
  * register the notifier so that event interception for the tracked guest
  * pages can be received.
@@ -161,10 +218,17 @@ int kvm_page_track_register_notifier(struct kvm *kvm,
 				     struct kvm_page_track_notifier_node *n)
 {
 	struct kvm_page_track_notifier_head *head;
+	int r;
 
 	if (!kvm || kvm->mm != current->mm)
 		return -ESRCH;
 
+	if (!kvm_external_write_tracking_enabled(kvm)) {
+		r = kvm_enable_external_write_tracking(kvm);
+		if (r)
+			return r;
+	}
+
 	kvm_get_kvm(kvm);
 
 	head = &kvm->arch.track_notifier_head;
-- 
cgit