From 9bbdc0f324097f72b2354c2f8be4cdffd32679b6 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 22 Mar 2022 14:41:12 -0700 Subject: xarray: use kmem_cache_alloc_lru to allocate xa_node The workingset will add the xa_node to the shadow_nodes list. So the allocation of xa_node should be done by kmem_cache_alloc_lru(). Using xas_set_lru() to pass the list_lru which we want to insert xa_node into to set up the xa_node reclaim context correctly. Link: https://lkml.kernel.org/r/20220228122126.37293-9-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Alex Shi Cc: Anna Schumaker Cc: Chao Yu Cc: Dave Chinner Cc: Fam Zheng Cc: Jaegeuk Kim Cc: Kari Argillander Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Theodore Ts'o Cc: Trond Myklebust Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xiongchun Duan Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/xarray.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'lib/xarray.c') diff --git a/lib/xarray.c b/lib/xarray.c index 6f47f6375808..b95e92598b9c 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -302,7 +302,7 @@ bool xas_nomem(struct xa_state *xas, gfp_t gfp) } if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; - xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!xas->xa_alloc) return false; xas->xa_alloc->parent = NULL; @@ -334,10 +334,10 @@ static bool __xas_nomem(struct xa_state *xas, gfp_t gfp) gfp |= __GFP_ACCOUNT; if (gfpflags_allow_blocking(gfp)) { xas_unlock_type(xas, lock_type); - xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); xas_lock_type(xas, lock_type); } else { - xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + xas->xa_alloc = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); } if (!xas->xa_alloc) return false; @@ -371,7 +371,7 @@ static void *xas_alloc(struct xa_state *xas, unsigned int shift) if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; - node = kmem_cache_alloc(radix_tree_node_cachep, gfp); + node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!node) { xas_set_err(xas, -ENOMEM); return NULL; @@ -1014,7 +1014,7 @@ void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, void *sibling = NULL; struct xa_node *node; - node = kmem_cache_alloc(radix_tree_node_cachep, gfp); + node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp); if (!node) goto nomem; node->array = xas->xa; -- cgit From 3e3c658055c002900982513e289398a1aad4a488 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Mar 2022 19:25:11 -0400 Subject: XArray: Fix xas_create_range() when multi-order entry present If there is already an entry present that is of order >= XA_CHUNK_SHIFT when we call xas_create_range(), xas_create_range() will misinterpret that entry as a node and dereference xa_node->parent, generally leading to a crash that looks something like this: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 0 PID: 32 Comm: khugepaged Not tainted 5.17.0-rc8-syzkaller-00003-g56e337f2cf13 #0 RIP: 0010:xa_parent_locked include/linux/xarray.h:1207 [inline] RIP: 0010:xas_create_range+0x2d9/0x6e0 lib/xarray.c:725 It's deterministically reproducable once you know what the problem is, but producing it in a live kernel requires khugepaged to hit a race. While the problem has been present since xas_create_range() was introduced, I'm not aware of a way to hit it before the page cache was converted to use multi-index entries. Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache") Reported-by: syzbot+0d2b0bf32ca5cfd09f2e@syzkaller.appspotmail.com Signed-off-by: Matthew Wilcox (Oracle) --- lib/xarray.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'lib/xarray.c') diff --git a/lib/xarray.c b/lib/xarray.c index 6f47f6375808..757644617b9b 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -722,6 +722,8 @@ void xas_create_range(struct xa_state *xas) for (;;) { struct xa_node *node = xas->xa_node; + if (node->shift >= shift) + break; xas->xa_node = xa_parent_locked(xas->xa, node); xas->xa_offset = node->offset - 1; if (node->offset != 0) -- cgit From 3ed4bb77156da0bc732847c8c9df92454c1fbeea Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 31 Mar 2022 08:27:09 -0400 Subject: XArray: Update the LRU list in xas_split() When splitting a value entry, we may need to add the new nodes to the LRU list and remove the parent node from the LRU list. The WARN_ON checks in shadow_lru_isolate() catch this oversight. This bug was latent until we stopped splitting folios in shrink_page_list() with commit 820c4e2e6f51 ("mm/vmscan: Free non-shmem folios without splitting them"). That allows the creation of large shadow entries, and subsequently when trying to page in a small page, we will split the large shadow entry in __filemap_add_folio(). Fixes: 8fc75643c5e1 ("XArray: add xas_split") Reported-by: Hugh Dickins Signed-off-by: Matthew Wilcox (Oracle) --- lib/xarray.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'lib/xarray.c') diff --git a/lib/xarray.c b/lib/xarray.c index 757644617b9b..88ca87435e3d 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -1081,6 +1081,7 @@ void xas_split(struct xa_state *xas, void *entry, unsigned int order) xa_mk_node(child)); if (xa_is_value(curr)) values--; + xas_update(xas, child); } else { unsigned int canon = offset - xas->xa_sibs; @@ -1095,6 +1096,7 @@ void xas_split(struct xa_state *xas, void *entry, unsigned int order) } while (offset-- > xas->xa_offset); node->nr_values += values; + xas_update(xas, node); } EXPORT_SYMBOL_GPL(xas_split); #endif -- cgit From 63b1898fffcd8bd81905b95104ecc52b45a97e21 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 22 Apr 2022 13:23:12 -0400 Subject: XArray: Disallow sibling entries of nodes There is a race between xas_split() and xas_load() which can result in the wrong page being returned, and thus data corruption. Fortunately, it's hard to hit (syzbot took three months to find it) and often guarded with VM_BUG_ON(). The anatomy of this race is: thread A thread B order-9 page is stored at index 0x200 lookup of page at index 0x274 page split starts load of sibling entry at offset 9 stores nodes at offsets 8-15 load of entry at offset 8 The entry at offset 8 turns out to be a node, and so we descend into it, and load the page at index 0x234 instead of 0x274. This is hard to fix on the split side; we could replace the entire node that contains the order-9 page instead of replacing the eight entries. Fixing it on the lookup side is easier; just disallow sibling entries that point to nodes. This cannot ever be a useful thing as the descent would not know the correct offset to use within the new node. The test suite continues to pass, but I have not added a new test for this bug. Reported-by: syzbot+cf4cf13056f85dec2c40@syzkaller.appspotmail.com Tested-by: syzbot+cf4cf13056f85dec2c40@syzkaller.appspotmail.com Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache") Signed-off-by: Matthew Wilcox (Oracle) --- lib/xarray.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'lib/xarray.c') diff --git a/lib/xarray.c b/lib/xarray.c index 4acc88ea7c21..54e646e8e6ee 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -207,6 +207,8 @@ static void *xas_descend(struct xa_state *xas, struct xa_node *node) if (xa_is_sibling(entry)) { offset = xa_to_sibling(entry); entry = xa_entry(xas->xa, node, offset); + if (node->shift && xa_is_node(entry)) + entry = XA_RETRY_ENTRY; } xas->xa_offset = offset; -- cgit From 69a37a8ba1b408a1c7616494aa7018e4b3844cbe Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Jun 2022 15:18:34 -0400 Subject: mm/huge_memory: Fix xarray node memory leak If xas_split_alloc() fails to allocate the necessary nodes to complete the xarray entry split, it sets the xa_state to -ENOMEM, which xas_nomem() then interprets as "Please allocate more memory", not as "Please free any unnecessary memory" (which was the intended outcome). It's confusing to use xas_nomem() to free memory in this context, so call xas_destroy() instead. Reported-by: syzbot+9e27a75a8c24f3fe75c1@syzkaller.appspotmail.com Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache") Cc: stable@vger.kernel.org Signed-off-by: Matthew Wilcox (Oracle) --- lib/xarray.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'lib/xarray.c') diff --git a/lib/xarray.c b/lib/xarray.c index 54e646e8e6ee..ea9ce1f0b386 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -264,9 +264,10 @@ static void xa_node_free(struct xa_node *node) * xas_destroy() - Free any resources allocated during the XArray operation. * @xas: XArray operation state. * - * This function is now internal-only. + * Most users will not need to call this function; it is called for you + * by xas_nomem(). */ -static void xas_destroy(struct xa_state *xas) +void xas_destroy(struct xa_state *xas) { struct xa_node *next, *node = xas->xa_alloc; -- cgit