mm: separate vma->lock from vm_area_struct

vma->lock being part of the vm_area_struct causes performance regression during page faults because during contention its count and owner fields are constantly updated and having other parts of vm_area_struct used during page fault handling next to them causes constant cache line bouncing. Fix that by moving the lock outside of the vm_area_struct. All attempts to keep vma->lock inside vm_area_struct in a separate cache line still produce performance regression especially on NUMA machines. Smallest regression was achieved when lock is placed in the fourth cache line but that bloats vm_area_struct to 256 bytes. Considering performance and memory impact, separate lock looks like the best option. It increases memory footprint of each VMA but that can be optimized later if the new size causes issues. Note that after this change vma_init() does not allocate or initialize vma->lock anymore. A number of drivers allocate a pseudo VMA on the stack but they never use the VMA's lock, therefore it does not need to be allocated. The future drivers which might need the VMA lock should use vm_area_alloc()/vm_area_free() to allocate the VMA. Link: https://lkml.kernel.org/r/20230227173632.3292573-34-surenb@google.com Signed-off-by: Suren Baghdasaryan <surenb@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
author: Suren Baghdasaryan <surenb@google.com> 2023-02-27 09:36:32 -0800
committer: Andrew Morton <akpm@linux-foundation.org> 2023-04-05 20:03:02 -0700
commit: c7f8f31c00d187a2c71a241c7f2bd6aa102a4e6f (patch)
tree: ca026078b1bcf94138383d9ca000d43a484bcd8b /kernel/fork.c
parent: 0d2ebf9c3f7822e7ba3e4792ea3b6b19aa2da34a (diff)
1 files changed, 59 insertions, 14 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index f86f7e917954..c75088367bb4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -451,13 +451,49 @@ static struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+	vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+	if (!vma->vm_lock)
+		return false;
+
+	init_rwsem(&vma->vm_lock->lock);
+	vma->vm_lock_seq = -1;
+
+	return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+	kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
 
 	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-	if (vma)
-		vma_init(vma, mm);
+	if (!vma)
+		return NULL;
+
+	vma_init(vma, mm);
+	if (!vma_lock_alloc(vma)) {
+		kmem_cache_free(vm_area_cachep, vma);
+		return NULL;
+	}
+
 	return vma;
 }
 
@@ -465,24 +501,30 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 {
 	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 
-	if (new) {
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
-		/*
-		 * orig->shared.rb may be modified concurrently, but the clone
-		 * will be reinitialized.
-		 */
-		data_race(memcpy(new, orig, sizeof(*new)));
-		INIT_LIST_HEAD(&new->anon_vma_chain);
-		vma_init_lock(new);
-		dup_anon_vma_name(orig, new);
+	if (!new)
+		return NULL;
+
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+	/*
+	 * orig->shared.rb may be modified concurrently, but the clone
+	 * will be reinitialized.
+	 */
+	data_race(memcpy(new, orig, sizeof(*new)));
+	if (!vma_lock_alloc(new)) {
+		kmem_cache_free(vm_area_cachep, new);
+		return NULL;
 	}
+	INIT_LIST_HEAD(&new->anon_vma_chain);
+	dup_anon_vma_name(orig, new);
+
 	return new;
 }
 
 void __vm_area_free(struct vm_area_struct *vma)
 {
 	free_anon_vma_name(vma);
+	vma_lock_free(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 }
 
@@ -493,7 +535,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
 						  vm_rcu);
 
 	/* The vma should not be locked while being destroyed. */
-	VM_BUG_ON_VMA(rwsem_is_locked(&vma->lock), vma);
+	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
 	__vm_area_free(vma);
 }
 #endif
@@ -3152,6 +3194,9 @@ void __init proc_caches_init(void)
 			NULL);
 
 	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+	vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
 	mmap_init();
 	nsproxy_cache_init();
 }
author	Suren Baghdasaryan <surenb@google.com>	2023-02-27 09:36:32 -0800
committer	Andrew Morton <akpm@linux-foundation.org>	2023-04-05 20:03:02 -0700
commit	c7f8f31c00d187a2c71a241c7f2bd6aa102a4e6f (patch)
tree	ca026078b1bcf94138383d9ca000d43a484bcd8b /kernel/fork.c
parent	0d2ebf9c3f7822e7ba3e4792ea3b6b19aa2da34a (diff)