From c1ad12ee0efc07244be37f69311e6f7c4ac98e62 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 23 Oct 2023 13:01:54 +0200 Subject: kexec: fix KEXEC_FILE dependencies The cleanup for the CONFIG_KEXEC Kconfig logic accidentally changed the 'depends on CRYPTO=y' dependency to a plain 'depends on CRYPTO', which causes a link failure when all the crypto support is in a loadable module and kexec_file support is built-in: x86_64-linux-ld: vmlinux.o: in function `__x64_sys_kexec_file_load': (.text+0x32e30a): undefined reference to `crypto_alloc_shash' x86_64-linux-ld: (.text+0x32e58e): undefined reference to `crypto_shash_update' x86_64-linux-ld: (.text+0x32e6ee): undefined reference to `crypto_shash_final' Both s390 and x86 have this problem, while ppc64 and riscv have the correct dependency already. On riscv, the dependency is only used for the purgatory, not for the kexec_file code itself, which may be a bit surprising as it means that with CONFIG_CRYPTO=m, it is possible to enable KEXEC_FILE but then the purgatory code is silently left out. Move this into the common Kconfig.kexec file in a way that is correct everywhere, using the dependency on CRYPTO_SHA256=y only when the purgatory code is available. This requires reversing the dependency between ARCH_SUPPORTS_KEXEC_PURGATORY and KEXEC_FILE, but the effect remains the same, other than making riscv behave like the other ones. On s390, there is an additional dependency on CRYPTO_SHA256_S390, which should technically not be required but gives better performance. Remove this dependency here, noting that it was not present in the initial Kconfig code but was brought in without an explanation in commit 71406883fd357 ("s390/kexec_file: Add kexec_file_load system call"). [arnd@arndb.de: fix riscv build] Link: https://lkml.kernel.org/r/67ddd260-d424-4229-a815-e3fcfb864a77@app.fastmail.com Link: https://lkml.kernel.org/r/20231023110308.1202042-1-arnd@kernel.org Fixes: 6af5138083005 ("x86/kexec: refactor for kernel/Kconfig.kexec") Signed-off-by: Arnd Bergmann Reviewed-by: Eric DeVolder Tested-by: Eric DeVolder Cc: Albert Ou Cc: Alexander Gordeev Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Conor Dooley Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: Herbert Xu Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 4 ++-- arch/riscv/Kconfig | 4 +--- arch/s390/Kconfig | 4 ++-- arch/x86/Kconfig | 4 ++-- kernel/Kconfig.kexec | 1 + 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 6f105ee4f3cf..1f11a62809f2 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -608,10 +608,10 @@ config ARCH_SUPPORTS_KEXEC def_bool PPC_BOOK3S || PPC_E500 || (44x && !SMP) config ARCH_SUPPORTS_KEXEC_FILE - def_bool PPC64 && CRYPTO=y && CRYPTO_SHA256=y + def_bool PPC64 config ARCH_SUPPORTS_KEXEC_PURGATORY - def_bool KEXEC_FILE + def_bool y config ARCH_SELECTS_KEXEC_FILE def_bool y diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 24c1799e2ec4..cd4c9a204d08 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -702,9 +702,7 @@ config ARCH_SELECTS_KEXEC_FILE select KEXEC_ELF config ARCH_SUPPORTS_KEXEC_PURGATORY - def_bool KEXEC_FILE - depends on CRYPTO=y - depends on CRYPTO_SHA256=y + def_bool ARCH_SUPPORTS_KEXEC_FILE config ARCH_SUPPORTS_CRASH_DUMP def_bool y diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 3bec98d20283..d5d8f99d1f25 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -254,13 +254,13 @@ config ARCH_SUPPORTS_KEXEC def_bool y config ARCH_SUPPORTS_KEXEC_FILE - def_bool CRYPTO && CRYPTO_SHA256 && CRYPTO_SHA256_S390 + def_bool y config ARCH_SUPPORTS_KEXEC_SIG def_bool MODULE_SIG_FORMAT config ARCH_SUPPORTS_KEXEC_PURGATORY - def_bool KEXEC_FILE + def_bool y config ARCH_SUPPORTS_CRASH_DUMP def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3762f41bb092..1566748f16c4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2072,7 +2072,7 @@ config ARCH_SUPPORTS_KEXEC def_bool y config ARCH_SUPPORTS_KEXEC_FILE - def_bool X86_64 && CRYPTO && CRYPTO_SHA256 + def_bool X86_64 config ARCH_SELECTS_KEXEC_FILE def_bool y @@ -2080,7 +2080,7 @@ config ARCH_SELECTS_KEXEC_FILE select HAVE_IMA_KEXEC if IMA config ARCH_SUPPORTS_KEXEC_PURGATORY - def_bool KEXEC_FILE + def_bool y config ARCH_SUPPORTS_KEXEC_SIG def_bool y diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 2fd510256604..92120e396008 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -36,6 +36,7 @@ config KEXEC config KEXEC_FILE bool "Enable kexec file based system call" depends on ARCH_SUPPORTS_KEXEC_FILE + depends on CRYPTO_SHA256=y || !ARCH_SUPPORTS_KEXEC_PURGATORY select KEXEC_CORE help This is new version of kexec system call. This system call is -- cgit From e63bde3d9417f8318d6dd0d0fafa35ebf307aabd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 23 Oct 2023 13:01:55 +0200 Subject: kexec: select CRYPTO from KEXEC_FILE instead of depending on it All other users of crypto code use 'select' instead of 'depends on', so do the same thing with KEXEC_FILE for consistency. In practice this makes very little difference as kernels with kexec support are very likely to also include some other feature that already selects both crypto and crypto_sha256, but being consistent here helps for usability as well as to avoid potential circular dependencies. This reverts the dependency back to what it was originally before commit 74ca317c26a3f ("kexec: create a new config option CONFIG_KEXEC_FILE for new syscall"), which changed changed it with the comment "This should be safer as "select" is not recursive", but that appears to have been done in error, as "select" is indeed recursive, and there are no other dependencies that prevent CRYPTO_SHA256 from being selected here. Link: https://lkml.kernel.org/r/20231023110308.1202042-2-arnd@kernel.org Fixes: 74ca317c26a3f ("kexec: create a new config option CONFIG_KEXEC_FILE for new syscall") Signed-off-by: Arnd Bergmann Reviewed-by: Eric DeVolder Tested-by: Eric DeVolder Acked-by: Baoquan He Cc: Herbert Xu Cc: "David S. Miller" Cc: Albert Ou Cc: Alexander Gordeev Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Conor Dooley Cc: Dave Hansen Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 92120e396008..946dffa048b7 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -36,7 +36,8 @@ config KEXEC config KEXEC_FILE bool "Enable kexec file based system call" depends on ARCH_SUPPORTS_KEXEC_FILE - depends on CRYPTO_SHA256=y || !ARCH_SUPPORTS_KEXEC_PURGATORY + select CRYPTO + select CRYPTO_SHA256 select KEXEC_CORE help This is new version of kexec system call. This system call is -- cgit From b2325bf860faa2f304a7e188f00cf9f7dc9b5ee8 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Tue, 12 Dec 2023 16:26:59 -0700 Subject: kunit: kasan_test: disable fortify string checker on kmalloc_oob_memset Similar to commit 09c6304e38e4 ("kasan: test: fix compatibility with FORTIFY_SOURCE") the kernel is panicing in kmalloc_oob_memset_*. This is due to the `ptr` not being hidden from the optimizer which would disable the runtime fortify string checker. kernel BUG at lib/string_helpers.c:1048! Call Trace: [<00000000272502e2>] fortify_panic+0x2a/0x30 ([<00000000272502de>] fortify_panic+0x26/0x30) [<001bffff817045c4>] kmalloc_oob_memset_2+0x22c/0x230 [kasan_test] Hide the `ptr` variable from the optimizer to fix the kernel panic. Also define a memset_size variable and hide that as well. This cleans up the code and follows the same convention as other tests. [npache@redhat.com: address review comments from Andrey] Link: https://lkml.kernel.org/r/20231214164423.6202-1-npache@redhat.com Link: https://lkml.kernel.org/r/20231212232659.18839-1-npache@redhat.com Signed-off-by: Nico Pache Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/kasan_test.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 8281eb42464b..34515a106ca5 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -493,14 +493,17 @@ static void kmalloc_oob_memset_2(struct kunit *test) { char *ptr; size_t size = 128 - KASAN_GRANULE_SIZE; + size_t memset_size = 2; KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test); ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(size); - KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 1, 0, 2)); + OPTIMIZER_HIDE_VAR(memset_size); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 1, 0, memset_size)); kfree(ptr); } @@ -508,14 +511,17 @@ static void kmalloc_oob_memset_4(struct kunit *test) { char *ptr; size_t size = 128 - KASAN_GRANULE_SIZE; + size_t memset_size = 4; KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test); ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(size); - KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 3, 0, 4)); + OPTIMIZER_HIDE_VAR(memset_size); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 3, 0, memset_size)); kfree(ptr); } @@ -523,14 +529,17 @@ static void kmalloc_oob_memset_8(struct kunit *test) { char *ptr; size_t size = 128 - KASAN_GRANULE_SIZE; + size_t memset_size = 8; KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test); ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(size); - KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 7, 0, 8)); + OPTIMIZER_HIDE_VAR(memset_size); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 7, 0, memset_size)); kfree(ptr); } @@ -538,14 +547,17 @@ static void kmalloc_oob_memset_16(struct kunit *test) { char *ptr; size_t size = 128 - KASAN_GRANULE_SIZE; + size_t memset_size = 16; KASAN_TEST_NEEDS_CHECKED_MEMINTRINSICS(test); ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(size); - KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 15, 0, 16)); + OPTIMIZER_HIDE_VAR(memset_size); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 15, 0, memset_size)); kfree(ptr); } -- cgit From e2c27b803bb664748e090d99042ac128b3f88d92 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 13 Dec 2023 14:23:24 +0800 Subject: mm/filemap: avoid buffered read/write race to read inconsistent data The following concurrency may cause the data read to be inconsistent with the data on disk: cpu1 cpu2 ------------------------------|------------------------------ // Buffered write 2048 from 0 ext4_buffered_write_iter generic_perform_write copy_page_from_iter_atomic ext4_da_write_end ext4_da_do_write_end block_write_end __block_commit_write folio_mark_uptodate // Buffered read 4096 from 0 smp_wmb() ext4_file_read_iter set_bit(PG_uptodate, folio_flags) generic_file_read_iter i_size_write // 2048 filemap_read unlock_page(page) filemap_get_pages filemap_get_read_batch folio_test_uptodate(folio) ret = test_bit(PG_uptodate, folio_flags) if (ret) smp_rmb(); // Ensure that the data in page 0-2048 is up-to-date. // New buffered write 2048 from 2048 ext4_buffered_write_iter generic_perform_write copy_page_from_iter_atomic ext4_da_write_end ext4_da_do_write_end block_write_end __block_commit_write folio_mark_uptodate smp_wmb() set_bit(PG_uptodate, folio_flags) i_size_write // 4096 unlock_page(page) isize = i_size_read(inode) // 4096 // Read the latest isize 4096, but without smp_rmb(), there may be // Load-Load disorder resulting in the data in the 2048-4096 range // in the page is not up-to-date. copy_page_to_iter // copyout 4096 In the concurrency above, we read the updated i_size, but there is no read barrier to ensure that the data in the page is the same as the i_size at this point, so we may copy the unsynchronized page out. Hence adding the missing read memory barrier to fix this. This is a Load-Load reordering issue, which only occurs on some weak mem-ordering architectures (e.g. ARM64, ALPHA), but not on strong mem-ordering architectures (e.g. X86). And theoretically the problem doesn't only happen on ext4, filesystems that call filemap_read() but don't hold inode lock (e.g. btrfs, f2fs, ubifs ...) will have this problem, while filesystems with inode lock (e.g. xfs, nfs) won't have this problem. Link: https://lkml.kernel.org/r/20231213062324.739009-1-libaokun1@huawei.com Signed-off-by: Baokun Li Reviewed-by: Jan Kara Cc: Andreas Dilger Cc: Christoph Hellwig Cc: Dave Chinner Cc: Matthew Wilcox (Oracle) Cc: Ritesh Harjani (IBM) Cc: Theodore Ts'o Cc: yangerkun Cc: Yu Kuai Cc: Zhang Yi Cc: Signed-off-by: Andrew Morton --- mm/filemap.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index f1c8c278310f..ad5b4aa049a3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2607,6 +2607,15 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, goto put_folios; end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); + /* + * Pairs with a barrier in + * block_write_end()->mark_buffer_dirty() or other page + * dirtying routines like iomap_write_end() to ensure + * changes to page contents are visible before we see + * increased inode size. + */ + smp_rmb(); + /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: -- cgit From 4249f13c11be8b8b7bf93204185e150c3bdc968d Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 13 Dec 2023 12:50:57 -0800 Subject: maple_tree: do not preallocate nodes for slot stores mas_preallocate() defaults to requesting 1 node for preallocation and then ,depending on the type of store, will update the request variable. There isn't a check for a slot store type, so slot stores are preallocating the default 1 node. Slot stores do not require any additional nodes, so add a check for the slot store case that will bypass node_count_gfp(). Update the tests to reflect that slot stores do not require allocations. User visible effects of this bug include increased memory usage from the unneeded node that was allocated. Link: https://lkml.kernel.org/r/20231213205058.386589-1-sidhartha.kumar@oracle.com Fixes: 0b8bb544b1a7 ("maple_tree: update mas_preallocate() testing") Signed-off-by: Sidhartha Kumar Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Peng Zhang Cc: [6.6+] Signed-off-by: Andrew Morton --- lib/maple_tree.c | 11 +++++++++++ tools/testing/radix-tree/maple.c | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index bb24d84a4922..684689457d77 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5501,6 +5501,17 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) mas_wr_end_piv(&wr_mas); node_size = mas_wr_new_end(&wr_mas); + + /* Slot store, does not require additional nodes */ + if (node_size == wr_mas.node_end) { + /* reuse node */ + if (!mt_in_rcu(mas->tree)) + return 0; + /* shifting boundary */ + if (wr_mas.offset_end - mas->offset == 1) + return 0; + } + if (node_size >= mt_slots[wr_mas.type]) { /* Split, worst case for now. */ request = 1 + mas_mt_height(mas) * 2; diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index e5da1cad70ba..76a8990bb14e 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35538,7 +35538,7 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated != 1); + MT_BUG_ON(mt, allocated != 0); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); -- cgit From fc346d0a70a13d52fe1c4bc49516d83a42cd7c4c Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Thu, 14 Dec 2023 04:58:41 +0000 Subject: mm: migrate high-order folios in swap cache correctly Large folios occupy N consecutive entries in the swap cache instead of using multi-index entries like the page cache. However, if a large folio is re-added to the LRU list, it can be migrated. The migration code was not aware of the difference between the swap cache and the page cache and assumed that a single xas_store() would be sufficient. This leaves potentially many stale pointers to the now-migrated folio in the swap cache, which can lead to almost arbitrary data corruption in the future. This can also manifest as infinite loops with the RCU read lock held. [willy@infradead.org: modifications to the changelog & tweaked the fix] Fixes: 3417013e0d18 ("mm/migrate: Add folio_migrate_mapping()") Link: https://lkml.kernel.org/r/20231214045841.961776-1-willy@infradead.org Signed-off-by: Charan Teja Kalla Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Charan Teja Kalla Closes: https://lkml.kernel.org/r/1700569840-17327-1-git-send-email-quic_charante@quicinc.com Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Naoya Horiguchi Cc: Shakeel Butt Cc: Signed-off-by: Andrew Morton --- mm/migrate.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index 35a88334bb3c..397f2a6e34cb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -405,6 +405,7 @@ int folio_migrate_mapping(struct address_space *mapping, int dirty; int expected_count = folio_expected_refs(mapping, folio) + extra_count; long nr = folio_nr_pages(folio); + long entries, i; if (!mapping) { /* Anonymous page without mapping */ @@ -442,8 +443,10 @@ int folio_migrate_mapping(struct address_space *mapping, folio_set_swapcache(newfolio); newfolio->private = folio_get_private(folio); } + entries = nr; } else { VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); + entries = 1; } /* Move dirty while page refs frozen and newpage not yet exposed */ @@ -453,7 +456,11 @@ int folio_migrate_mapping(struct address_space *mapping, folio_set_dirty(newfolio); } - xas_store(&xas, newfolio); + /* Swap cache still stores N entries instead of a high-order entry */ + for (i = 0; i < entries; i++) { + xas_store(&xas, newfolio); + xas_next(&xas); + } /* * Drop cache reference from old page by unfreezing -- cgit From 0aac13add26d546ac74c89d2883b3a5f0fbea039 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 14 Dec 2023 15:19:30 +0500 Subject: selftests: secretmem: floor the memory size to the multiple of page_size The "locked-in-memory size" limit per process can be non-multiple of page_size. The mmap() fails if we try to allocate locked-in-memory with same size as the allowed limit if it isn't multiple of the page_size because mmap() rounds off the memory size to be allocated to next multiple of page_size. Fix this by flooring the length to be allocated with mmap() to the previous multiple of the page_size. This was getting triggered on KernelCI regularly because of different ulimit settings which wasn't multiple of the page_size. Find logs here: https://linux.kernelci.org/test/plan/id/657654bd8e81e654fae13532/ The bug in was present from the time test was first added. Link: https://lkml.kernel.org/r/20231214101931.1155586-1-usama.anjum@collabora.com Fixes: 76fe17ef588a ("secretmem: test: add basic selftest for memfd_secret(2)") Signed-off-by: Muhammad Usama Anjum Reported-by: "kernelci.org bot" Closes: https://linux.kernelci.org/test/plan/id/657654bd8e81e654fae13532/ Cc: "James E.J. Bottomley" Cc: Mike Rapoport (IBM) Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/memfd_secret.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c index 957b9e18c729..9b298f6a04b3 100644 --- a/tools/testing/selftests/mm/memfd_secret.c +++ b/tools/testing/selftests/mm/memfd_secret.c @@ -62,6 +62,9 @@ static void test_mlock_limit(int fd) char *mem; len = mlock_limit_cur; + if (len % page_size != 0) + len = (len/page_size) * page_size; + mem = mmap(NULL, len, prot, mode, fd, 0); if (mem == MAP_FAILED) { fail("unable to mmap secret memory\n"); -- cgit From 376907f3a0b34a17e80417825f8cc1c40fcba81b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 18 Dec 2023 13:58:35 +0000 Subject: mm/memory-failure: pass the folio and the page to collect_procs() Patch series "Three memory-failure fixes". I've been looking at the memory-failure code and I believe I have found three bugs that need fixing -- one going all the way back to 2010! I'll have more patches later to use folios more extensively but didn't want these bugfixes to get caught up in that. This patch (of 3): Both collect_procs_anon() and collect_procs_file() iterate over the VMA interval trees looking for a single pgoff, so it is wrong to look for the pgoff of the head page as is currently done. However, it is also wrong to look at page->mapping of the precise page as this is invalid for tail pages. Clear up the confusion by passing both the folio and the precise page to collect_procs(). Link: https://lkml.kernel.org/r/20231218135837.3310403-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231218135837.3310403-2-willy@infradead.org Fixes: 415c64c1453a ("mm/memory-failure: split thp earlier in memory error handling") Signed-off-by: Matthew Wilcox (Oracle) Cc: Dan Williams Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 660c21859118..6953bda11e6e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -595,10 +595,9 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early) /* * Collect processes when the error hit an anonymous page. */ -static void collect_procs_anon(struct page *page, struct list_head *to_kill, - int force_early) +static void collect_procs_anon(struct folio *folio, struct page *page, + struct list_head *to_kill, int force_early) { - struct folio *folio = page_folio(page); struct vm_area_struct *vma; struct task_struct *tsk; struct anon_vma *av; @@ -633,12 +632,12 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, /* * Collect processes when the error hit a file mapped page. */ -static void collect_procs_file(struct page *page, struct list_head *to_kill, - int force_early) +static void collect_procs_file(struct folio *folio, struct page *page, + struct list_head *to_kill, int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; pgoff_t pgoff; i_mmap_lock_read(mapping); @@ -704,17 +703,17 @@ static void collect_procs_fsdax(struct page *page, /* * Collect the processes who have the corrupted page mapped to kill. */ -static void collect_procs(struct page *page, struct list_head *tokill, - int force_early) +static void collect_procs(struct folio *folio, struct page *page, + struct list_head *tokill, int force_early) { - if (!page->mapping) + if (!folio->mapping) return; if (unlikely(PageKsm(page))) collect_procs_ksm(page, tokill, force_early); else if (PageAnon(page)) - collect_procs_anon(page, tokill, force_early); + collect_procs_anon(folio, page, tokill, force_early); else - collect_procs_file(page, tokill, force_early); + collect_procs_file(folio, page, tokill, force_early); } struct hwpoison_walk { @@ -1602,7 +1601,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * mapped in dirty form. This has to be done before try_to_unmap, * because ttu takes the rmap data structures down. */ - collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); + collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); if (PageHuge(hpage) && !PageAnon(hpage)) { /* @@ -1772,7 +1771,7 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags, * SIGBUS (i.e. MF_MUST_KILL) */ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; - collect_procs(&folio->page, &to_kill, true); + collect_procs(folio, &folio->page, &to_kill, true); unmap_and_kill(&to_kill, pfn, folio->mapping, folio->index, flags); unlock: -- cgit From c79c5a0a00a9457718056b588f312baadf44e471 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 18 Dec 2023 13:58:36 +0000 Subject: mm/memory-failure: check the mapcount of the precise page A process may map only some of the pages in a folio, and might be missed if it maps the poisoned page but not the head page. Or it might be unnecessarily hit if it maps the head page, but not the poisoned page. Link: https://lkml.kernel.org/r/20231218135837.3310403-3-willy@infradead.org Fixes: 7af446a841a2 ("HWPOISON, hugetlb: enable error handling path for hugepage") Signed-off-by: Matthew Wilcox (Oracle) Cc: Dan Williams Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6953bda11e6e..82e15baabb48 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1570,7 +1570,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. */ - if (!page_mapped(hpage)) + if (!page_mapped(p)) return true; if (PageSwapCache(p)) { @@ -1621,10 +1621,10 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, try_to_unmap(folio, ttu); } - unmap_success = !page_mapped(hpage); + unmap_success = !page_mapped(p); if (!unmap_success) pr_err("%#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(hpage)); + pfn, page_mapcount(p)); /* * try_to_unmap() might put mlocked page in lru cache, so call -- cgit From 39ebd6dce62d8cfe3864e16148927a139f11bc9a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 18 Dec 2023 13:58:37 +0000 Subject: mm/memory-failure: cast index to loff_t before shifting it On 32-bit systems, we'll lose the top bits of index because arithmetic will be performed in unsigned long instead of unsigned long long. This affects files over 4GB in size. Link: https://lkml.kernel.org/r/20231218135837.3310403-4-willy@infradead.org Fixes: 6100e34b2526 ("mm, memory_failure: Teach memory_failure() about dev_pagemap pages") Signed-off-by: Matthew Wilcox (Oracle) Cc: Dan Williams Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 82e15baabb48..455093f73a70 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1704,7 +1704,7 @@ static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn, * mapping being torn down is communicated in siginfo, see * kill_proc() */ - loff_t start = (index << PAGE_SHIFT) & ~(size - 1); + loff_t start = ((loff_t)index << PAGE_SHIFT) & ~(size - 1); unmap_mapping_range(mapping, start, size, 0); } -- cgit From 1803d0c5ee1a3bbee23db2336e21add067824f02 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 18 Dec 2023 14:03:28 +0000 Subject: mailmap: add an old address for Naoya Horiguchi This address now bounces, remap it to a current address. Link: https://lkml.kernel.org/r/20231218140328.3313474-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Dan Williams Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 19eb49e55836..6fb41ef38e86 100644 --- a/.mailmap +++ b/.mailmap @@ -429,6 +429,7 @@ Muna Sinada Murali Nalajala Mythri P K Nadia Yvette Chambers William Lee Irwin III +Naoya Horiguchi Nathan Chancellor Neeraj Upadhyay Neil Armstrong -- cgit