From 780f6a9afe8b0e303406a39f6968cf1daa6c3d51 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 4 Jan 2023 13:20:52 -0800 Subject: lib: zstd: Fix -Wstringop-overflow warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following -Wstringop-overflow warning when building with GCC 11+: lib/zstd/decompress/huf_decompress.c: In function ‘HUF_readDTableX2_wksp’: lib/zstd/decompress/huf_decompress.c:700:5: warning: ‘HUF_fillDTableX2.constprop’ accessing 624 bytes in a region of size 52 [-Wstringop-overflow=] 700 | HUF_fillDTableX2(dt, maxTableLog, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 701 | wksp->sortedSymbol, sizeOfSort, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 702 | wksp->rankStart0, wksp->rankVal, maxW, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 703 | tableLog+1, | ~~~~~~~~~~~ 704 | wksp->calleeWksp, sizeof(wksp->calleeWksp) / sizeof(U32)); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ lib/zstd/decompress/huf_decompress.c:700:5: note: referencing argument 6 of type ‘U32 (*)[13]’ {aka ‘unsigned int (*)[13]’} lib/zstd/decompress/huf_decompress.c:571:13: note: in a call to function ‘HUF_fillDTableX2.constprop’ 571 | static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, | ^~~~~~~~~~~~~~~~ by using pointer notation instead of array notation. This is one of the last remaining warnings to be fixed before globally enabling -Wstringop-overflow. Co-developed-by: Gustavo A. R. Silva Signed-off-by: Gustavo A. R. Silva Cc: Nick Terrell Signed-off-by: Kees Cook Signed-off-by: Nick Terrell --- lib/zstd/decompress/huf_decompress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c index 89b269a641c7..60958afebc41 100644 --- a/lib/zstd/decompress/huf_decompress.c +++ b/lib/zstd/decompress/huf_decompress.c @@ -985,7 +985,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, const sortedSymbol_t* sortedList, - const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, + const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, const U32 nbBitsBaseline) { U32* const rankVal = rankValOrigin[0]; -- cgit From 038505c41f0aad26ef101f4f7f6e111531c3914f Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Wed, 15 Feb 2023 15:19:17 -0800 Subject: lib: zstd: Backport fix for in-place decompression Backport the relevant part of upstream commit 5b266196 [0]. This fixes in-place decompression for x86-64 kernel decompression. It uses a bound of 131072 + (uncompressed_size >> 8), which can be violated after upstream commit 6a7ede3d [1], as zstd can use part of the output buffer as temporary storage, and without this patch needs a bound of ~262144. The fix is for zstd to detect that the input and output buffers overlap, so that zstd knows it can't use the overlapping portion of the output buffer as tempoary storage. If the margin is not large enough, this will ensure that zstd will fail the decompression, rather than overwriting part of the input data, and causing corruption. This fix has been landed upstream and is in release v1.5.4. That commit also adds unit and fuzz tests to verify that the margin we use is respected, and correct. That means that the fix is well tested upstream. 
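To make the quoted bound concrete, here is a minimal sketch of the headroom an in-place decompressor reserves beyond the uncompressed image; the helper name is an illustrative assumption, not a kernel API:

#include <linux/types.h>

/* Illustrative only: in-place decompression headroom, per the bound above. */
static size_t zstd_inplace_margin(size_t uncompressed_size)
{
        /* 128 KiB plus uncompressed_size / 256 */
        return 131072 + (uncompressed_size >> 8);
}

With at least this much slack between the compressed payload and the end of the output buffer, the write pointer cannot overtake unread input; the overlap check added below makes zstd fail cleanly rather than corrupt its own input if the margin is violated.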
I have not been able to reproduce the potential bug in x86-64 kernel decompression locally, nor have I recieved reports of failures to decompress the kernel. It is possible that compression saves enough space to make it very hard for the issue to appear. I've boot tested the zstd compressed kernel on x86-64 and i386 with this patch, which uses in-place decompression, and sanity tested zstd compression in btrfs / squashfs to make sure that we don't see any issues, but other uses of zstd shouldn't be affected, because they don't use in-place decompression. Thanks to Vasily Gorbik for debugging a related issue on s390, which was triggered by the same commit, but was a bug in how __decompress() was called [2]. And to Sasha Levin for the CC alerting me of the issue. [0] https://github.com/facebook/zstd/commit/5b266196a41e6a15e21bd4f0eeab43b938db1d90 [1] https://github.com/facebook/zstd/commit/6a7ede3dfccbf3e0a5928b4224a039c260dcff72 [2] https://lore.kernel.org/r/patch-1.thread-41c676.git-41c676c2d153.your-ad-here.call-01675030179-ext-9637@work.hours CC: Vasily Gorbik CC: Heiko Carstens CC: Sasha Levin CC: Yann Collet Signed-off-by: Nick Terrell --- lib/zstd/decompress/zstd_decompress.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c index b9b935a9f5c0..6b3177c94711 100644 --- a/lib/zstd/decompress/zstd_decompress.c +++ b/lib/zstd/decompress/zstd_decompress.c @@ -798,7 +798,7 @@ static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity, if (srcSize == 0) return 0; RETURN_ERROR(dstBuffer_null, ""); } - ZSTD_memcpy(dst, src, srcSize); + ZSTD_memmove(dst, src, srcSize); return srcSize; } @@ -858,6 +858,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, /* Loop on each block */ while (1) { + BYTE* oBlockEnd = oend; size_t decodedSize; blockProperties_t blockProperties; size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSrcSize, &blockProperties); @@ -867,16 +868,34 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, remainingSrcSize -= ZSTD_blockHeaderSize; RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong, ""); + if (ip >= op && ip < oBlockEnd) { + /* We are decompressing in-place. Limit the output pointer so that we + * don't overwrite the block that we are currently reading. This will + * fail decompression if the input & output pointers aren't spaced + * far enough apart. + * + * This is important to set, even when the pointers are far enough + * apart, because ZSTD_decompressBlock_internal() can decide to store + * literals in the output buffer, after the block it is decompressing. + * Since we don't want anything to overwrite our input, we have to tell + * ZSTD_decompressBlock_internal to never write past ip. + * + * See ZSTD_allocateLiteralsBuffer() for reference. + */ + oBlockEnd = op + (ip - op); + } + switch(blockProperties.blockType) { case bt_compressed: - decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1, not_streaming); + decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); break; case bt_raw : + /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. 
*/ decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, cBlockSize); break; case bt_rle : - decodedSize = ZSTD_setRleBlock(op, (size_t)(oend-op), *ip, blockProperties.origSize); + decodedSize = ZSTD_setRleBlock(op, (size_t)(oBlockEnd-op), *ip, blockProperties.origSize); break; case bt_reserved : default: -- cgit From 6906598f1ce93761716d780b6e3f171e13f0f4ce Mon Sep 17 00:00:00 2001 From: Jonathan Neuschäfer Date: Sun, 29 Jan 2023 14:14:36 +0100 Subject: zstd: Fix definition of assert() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit assert(x) should emit a warning if x is false. WARN_ON(x) emits a warning if x is true. Thus, assert(x) should be defined as WARN_ON(!x) rather than WARN_ON(x). Signed-off-by: Jonathan Neuschäfer Signed-off-by: Nick Terrell --- lib/zstd/common/zstd_deps.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h index 7a5bf44839c9..f06df065dec0 100644 --- a/lib/zstd/common/zstd_deps.h +++ b/lib/zstd/common/zstd_deps.h @@ -84,7 +84,7 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) { #include -#define assert(x) WARN_ON((x)) +#define assert(x) WARN_ON(!(x)) #endif /* ZSTD_DEPS_ASSERT */ #endif /* ZSTD_DEPS_NEED_ASSERT */ -- cgit From 85c37208b0cb178084600dd01d9f97a32b6a21ea Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 3 Mar 2023 11:50:55 -0500 Subject: dyndbg: remove unused 'base' arg from __ddebug_add_module() The 'base' parameter to __ddebug_add_module() is no longer in use after: Commit b7b4eebdba7b ("dyndbg: gather __dyndbg[] state into struct _ddebug_info"). Cc: Jim Cromie Cc: Greg Kroah-Hartman Tested-by: Jim Cromie Reviewed-by: Vincenzo Palazzo Signed-off-by: Jason Baron Signed-off-by: Luis Chamberlain --- lib/dynamic_debug.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 009f2ead09c1..8136e5236b7b 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -1223,8 +1223,7 @@ static void ddebug_attach_module_classes(struct ddebug_table *dt, * Allocate a new ddebug_table for the given module * and add it to the global list. */ -static int __ddebug_add_module(struct _ddebug_info *di, unsigned int base, - const char *modname) +static int __ddebug_add_module(struct _ddebug_info *di, const char *modname) { struct ddebug_table *dt; @@ -1265,7 +1264,7 @@ static int __ddebug_add_module(struct _ddebug_info *di, unsigned int base, int ddebug_add_module(struct _ddebug_info *di, const char *modname) { - return __ddebug_add_module(di, 0, modname); + return __ddebug_add_module(di, modname); } /* helper for ddebug_dyndbg_(boot|module)_param_cb */ @@ -1408,7 +1407,7 @@ static int __init dynamic_debug_init(void) mod_ct++; di.num_descs = mod_sites; di.descs = iter_mod_start; - ret = __ddebug_add_module(&di, i - mod_sites, modname); + ret = __ddebug_add_module(&di, modname); if (ret) goto out_err; @@ -1419,7 +1418,7 @@ static int __init dynamic_debug_init(void) } di.num_descs = mod_sites; di.descs = iter_mod_start; - ret = __ddebug_add_module(&di, i - mod_sites, modname); + ret = __ddebug_add_module(&di, modname); if (ret) goto out_err; -- cgit From 7deabd67498869640c937c9bd83472574b7dea0b Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 3 Mar 2023 11:50:56 -0500 Subject: dyndbg: use the module notifier callbacks Bring dynamic debug in line with other subsystems by using the module notifier callbacks. 
This results in a net decrease in core module code. Additionally, Jim Cromie has a new dynamic debug classmap feature, which requires that jump labels be initialized prior to dynamic debug. Specifically, the new feature toggles a jump label from the existing dynamic_debug_setup() function. However, this does not currently work properly, because jump labels are initialized via the 'module_notify_list' notifier chain, which is invoked after the current call to dynamic_debug_setup(). Thus, this patch ensures that jump labels are initialized prior to dynamic debug by setting the dynamic debug notifier priority to 0, while jump labels have the higher priority of 1. Tested by Jim using his new test case, and I've verfied the correct printing via: # modprobe test_dynamic_debug dyndbg. Link: https://lore.kernel.org/lkml/20230113193016.749791-21-jim.cromie@gmail.com/ Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202302190427.9iIK2NfJ-lkp@intel.com/ Tested-by: Jim Cromie Reviewed-by: Vincenzo Palazzo Cc: Peter Zijlstra CC: Jim Cromie Cc: Luis Chamberlain Cc: Greg Kroah-Hartman Signed-off-by: Jason Baron Signed-off-by: Luis Chamberlain --- lib/dynamic_debug.c | 50 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 8136e5236b7b..fdd6d9800a70 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -1223,7 +1223,7 @@ static void ddebug_attach_module_classes(struct ddebug_table *dt, * Allocate a new ddebug_table for the given module * and add it to the global list. */ -static int __ddebug_add_module(struct _ddebug_info *di, const char *modname) +static int ddebug_add_module(struct _ddebug_info *di, const char *modname) { struct ddebug_table *dt; @@ -1262,11 +1262,6 @@ static int __ddebug_add_module(struct _ddebug_info *di, const char *modname) return 0; } -int ddebug_add_module(struct _ddebug_info *di, const char *modname) -{ - return __ddebug_add_module(di, modname); -} - /* helper for ddebug_dyndbg_(boot|module)_param_cb */ static int ddebug_dyndbg_param_cb(char *param, char *val, const char *modname, int on_err) @@ -1313,11 +1308,13 @@ static void ddebug_table_free(struct ddebug_table *dt) kfree(dt); } +#ifdef CONFIG_MODULES + /* * Called in response to a module being unloaded. Removes * any ddebug_table's which point at the module. 
*/ -int ddebug_remove_module(const char *mod_name) +static int ddebug_remove_module(const char *mod_name) { struct ddebug_table *dt, *nextdt; int ret = -ENOENT; @@ -1336,6 +1333,33 @@ int ddebug_remove_module(const char *mod_name) return ret; } +static int ddebug_module_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + struct module *mod = data; + int ret = 0; + + switch (val) { + case MODULE_STATE_COMING: + ret = ddebug_add_module(&mod->dyndbg_info, mod->name); + if (ret) + WARN(1, "Failed to allocate memory: dyndbg may not work properly.\n"); + break; + case MODULE_STATE_GOING: + ddebug_remove_module(mod->name); + break; + } + + return notifier_from_errno(ret); +} + +static struct notifier_block ddebug_module_nb = { + .notifier_call = ddebug_module_notify, + .priority = 0, /* dynamic debug depends on jump label */ +}; + +#endif /* CONFIG_MODULES */ + static void ddebug_remove_all_tables(void) { mutex_lock(&ddebug_lock); @@ -1387,6 +1411,14 @@ static int __init dynamic_debug_init(void) .num_classes = __stop___dyndbg_classes - __start___dyndbg_classes, }; +#ifdef CONFIG_MODULES + ret = register_module_notifier(&ddebug_module_nb); + if (ret) { + pr_warn("Failed to register dynamic debug module notifier\n"); + return ret; + } +#endif /* CONFIG_MODULES */ + if (&__start___dyndbg == &__stop___dyndbg) { if (IS_ENABLED(CONFIG_DYNAMIC_DEBUG)) { pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); @@ -1407,7 +1439,7 @@ static int __init dynamic_debug_init(void) mod_ct++; di.num_descs = mod_sites; di.descs = iter_mod_start; - ret = __ddebug_add_module(&di, modname); + ret = ddebug_add_module(&di, modname); if (ret) goto out_err; @@ -1418,7 +1450,7 @@ static int __init dynamic_debug_init(void) } di.num_descs = mod_sites; di.descs = iter_mod_start; - ret = __ddebug_add_module(&di, modname); + ret = ddebug_add_module(&di, modname); if (ret) goto out_err; -- cgit From efb5b62d72719fd1df1f927542c58d1e21d69e19 Mon Sep 17 00:00:00 2001 From: Nick Alcock Date: Wed, 8 Mar 2023 12:12:29 +0000 Subject: lib: packing: remove MODULE_LICENSE in non-modules Since commit 8b41fc4454e ("kbuild: create modules.builtin without Makefile.modbuiltin or tristate.conf"), MODULE_LICENSE declarations are used to identify modules. As a consequence, uses of the macro in non-modules will cause modprobe to misidentify their containing object file as a module when it is not (false positives), and modprobe might succeed rather than failing with a suitable error message. So remove it in the files in this commit, none of which can be built as modules. Signed-off-by: Nick Alcock Suggested-by: Luis Chamberlain Cc: Hitomi Hasegawa Cc: Vladimir Oltean Link: https://lore.kernel.org/r/20230308121230.5354-1-nick.alcock@oracle.com Signed-off-by: Jakub Kicinski --- lib/packing.c | 1 - 1 file changed, 1 deletion(-) (limited to 'lib') diff --git a/lib/packing.c b/lib/packing.c index a96169237ae6..3f656167c17e 100644 --- a/lib/packing.c +++ b/lib/packing.c @@ -198,5 +198,4 @@ int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, } EXPORT_SYMBOL(packing); -MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("Generic bitfield packing and unpacking"); -- cgit From 887d85a0736ff346cbfe5efaf51cf20c7ca195a3 Mon Sep 17 00:00:00 2001 From: Rae Moar Date: Wed, 8 Mar 2023 20:39:50 +0000 Subject: kunit: fix bug in debugfs logs of parameterized tests Fix bug in debugfs logs that causes individual parameterized results to not appear because the log is reinitialized (cleared) when each parameter is run. 
Ensure these results appear in the debugfs logs, increase log size to allow for the size of parameterized results. As a result, append lines to the log directly rather than using an intermediate variable that can cause stack size warnings due to the increased log size. Here is the debugfs log of ext4_inode_test which uses parameterized tests before the fix: KTAP version 1 # Subtest: ext4_inode_test 1..1 # Totals: pass:16 fail:0 skip:0 total:16 ok 1 ext4_inode_test As you can see, this log does not include any of the individual parametrized results. After (in combination with the next two fixes to remove extra empty line and ensure KTAP valid format): KTAP version 1 1..1 KTAP version 1 # Subtest: ext4_inode_test 1..1 KTAP version 1 # Subtest: inode_test_xtimestamp_decoding ok 1 1901-12-13 Lower bound of 32bit < 0 timestamp, no extra bits ... (the rest of the individual parameterized tests) ok 16 2446-05-10 Upper bound of 32bit >=0 timestamp. All extra # inode_test_xtimestamp_decoding: pass:16 fail:0 skip:0 total:16 ok 1 inode_test_xtimestamp_decoding # Totals: pass:16 fail:0 skip:0 total:16 ok 1 ext4_inode_test Signed-off-by: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/test.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/kunit/test.c b/lib/kunit/test.c index c9e15bb60058..c4d6304edd61 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -114,22 +114,27 @@ static void kunit_print_test_stats(struct kunit *test, */ void kunit_log_append(char *log, const char *fmt, ...) { - char line[KUNIT_LOG_SIZE]; va_list args; - int len_left; + int len, log_len, len_left; if (!log) return; - len_left = KUNIT_LOG_SIZE - strlen(log) - 1; + log_len = strlen(log); + len_left = KUNIT_LOG_SIZE - log_len - 1; if (len_left <= 0) return; + /* Evaluate length of line to add to log */ va_start(args, fmt); - vsnprintf(line, sizeof(line), fmt, args); + len = vsnprintf(NULL, 0, fmt, args) + 1; + va_end(args); + + /* Print formatted line to the log */ + va_start(args, fmt); + vsnprintf(log + log_len, min(len, len_left), fmt, args); va_end(args); - strncat(log, line, len_left); } EXPORT_SYMBOL_GPL(kunit_log_append); @@ -437,7 +442,6 @@ static void kunit_run_case_catch_errors(struct kunit_suite *suite, struct kunit_try_catch_context context; struct kunit_try_catch *try_catch; - kunit_init_test(test, test_case->name, test_case->log); try_catch = &test->try_catch; kunit_try_catch_init(try_catch, @@ -533,6 +537,8 @@ int kunit_run_tests(struct kunit_suite *suite) struct kunit_result_stats param_stats = { 0 }; test_case->status = KUNIT_SKIPPED; + kunit_init_test(&test, test_case->name, test_case->log); + if (!test_case->generate_params) { /* Non-parameterised test. */ kunit_run_case_catch_errors(suite, test_case, &test); -- cgit From f9a301c3317daa921375da0aec82462ddf019928 Mon Sep 17 00:00:00 2001 From: Rae Moar Date: Wed, 8 Mar 2023 20:39:51 +0000 Subject: kunit: fix bug in the order of lines in debugfs logs Fix bug in debugfs logs that causes an incorrect order of lines in the debugfs log. Currently, the test counts lines that show the number of tests passed, failed, and skipped, as well as any suite diagnostic lines, appear prior to the individual results, which is a bug. Ensure the order of printing for the debugfs log is correct. Additionally, add a KTAP header to so the debugfs logs can be valid KTAP. 
This is an example of a log prior to these fixes: KTAP version 1 # Subtest: kunit_status 1..2 # kunit_status: pass:2 fail:0 skip:0 total:2 # Totals: pass:2 fail:0 skip:0 total:2 ok 1 kunit_status_set_failure_test ok 2 kunit_status_mark_skipped_test ok 1 kunit_status Note the two lines with stats are out of order. This is the same debugfs log after the fixes (in combination with the third patch to remove the extra line): KTAP version 1 1..1 KTAP version 1 # Subtest: kunit_status 1..2 ok 1 kunit_status_set_failure_test ok 2 kunit_status_mark_skipped_test # kunit_status: pass:2 fail:0 skip:0 total:2 # Totals: pass:2 fail:0 skip:0 total:2 ok 1 kunit_status Signed-off-by: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/debugfs.c | 14 ++++++++++++-- lib/kunit/test.c | 21 ++++++++++++++------- 2 files changed, 26 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/lib/kunit/debugfs.c b/lib/kunit/debugfs.c index de0ee2e03ed6..b08bb1fba106 100644 --- a/lib/kunit/debugfs.c +++ b/lib/kunit/debugfs.c @@ -55,14 +55,24 @@ static int debugfs_print_results(struct seq_file *seq, void *v) enum kunit_status success = kunit_suite_has_succeeded(suite); struct kunit_case *test_case; - if (!suite || !suite->log) + if (!suite) return 0; - seq_printf(seq, "%s", suite->log); + /* Print KTAP header so the debugfs log can be parsed as valid KTAP. */ + seq_puts(seq, "KTAP version 1\n"); + seq_puts(seq, "1..1\n"); + + /* Print suite header because it is not stored in the test logs. */ + seq_puts(seq, KUNIT_SUBTEST_INDENT "KTAP version 1\n"); + seq_printf(seq, KUNIT_SUBTEST_INDENT "# Subtest: %s\n", suite->name); + seq_printf(seq, KUNIT_SUBTEST_INDENT "1..%zd\n", kunit_suite_num_test_cases(suite)); kunit_suite_for_each_test_case(suite, test_case) debugfs_print_result(seq, suite, test_case); + if (suite->log) + seq_printf(seq, "%s", suite->log); + seq_printf(seq, "%s %d %s\n", kunit_status_to_ok_not_ok(success), 1, suite->name); return 0; diff --git a/lib/kunit/test.c b/lib/kunit/test.c index c4d6304edd61..811fcc376d2f 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -152,10 +152,18 @@ EXPORT_SYMBOL_GPL(kunit_suite_num_test_cases); static void kunit_print_suite_start(struct kunit_suite *suite) { - kunit_log(KERN_INFO, suite, KUNIT_SUBTEST_INDENT "KTAP version 1\n"); - kunit_log(KERN_INFO, suite, KUNIT_SUBTEST_INDENT "# Subtest: %s", + /* + * We do not log the test suite header as doing so would + * mean debugfs display would consist of the test suite + * header prior to individual test results. + * Hence directly printk the suite status, and we will + * separately seq_printf() the suite header for the debugfs + * representation. + */ + pr_info(KUNIT_SUBTEST_INDENT "KTAP version 1\n"); + pr_info(KUNIT_SUBTEST_INDENT "# Subtest: %s\n", suite->name); - kunit_log(KERN_INFO, suite, KUNIT_SUBTEST_INDENT "1..%zd", + pr_info(KUNIT_SUBTEST_INDENT "1..%zd\n", kunit_suite_num_test_cases(suite)); } @@ -172,10 +180,9 @@ static void kunit_print_ok_not_ok(void *test_or_suite, /* * We do not log the test suite results as doing so would - * mean debugfs display would consist of the test suite - * description and status prior to individual test results. - * Hence directly printk the suite status, and we will - * separately seq_printf() the suite status for the debugfs + * mean debugfs display would consist of an incorrect test + * number. Hence directly printk the suite result, and we will + * separately seq_printf() the suite results for the debugfs * representation. 
*/ if (suite) -- cgit From 2c6a96dad5797e57b4cf04101d6c8d5c7a571603 Mon Sep 17 00:00:00 2001 From: Rae Moar Date: Wed, 8 Mar 2023 20:39:52 +0000 Subject: kunit: fix bug of extra newline characters in debugfs logs Fix bug of the extra newline characters in debugfs logs. When a line is added to debugfs with a newline character at the end, an extra line appears in the debugfs log. This is due to a discrepancy between how the lines are printed and how they are added to the logs. Remove this discrepancy by checking if a newline character is present before adding a newline character. This should closely match the printk behavior. Add kunit_log_newline_test to provide test coverage for this issue. (Also, move kunit_log_test above suite definition to remove the unnecessary declaration prior to the suite definition) As an example, say we add these two lines to the log: kunit_log(..., "KTAP version 1\n"); kunit_log(..., "1..1"); The debugfs log before this fix: KTAP version 1 1..1 The debugfs log after this fix: KTAP version 1 1..1 Signed-off-by: Rae Moar Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/kunit-test.c | 35 +++++++++++++++++++++++------------ lib/kunit/test.c | 18 ++++++++++++++++++ 2 files changed, 41 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c index 4df0335d0d06..b63595d3e241 100644 --- a/lib/kunit/kunit-test.c +++ b/lib/kunit/kunit-test.c @@ -443,18 +443,6 @@ static struct kunit_suite kunit_resource_test_suite = { .test_cases = kunit_resource_test_cases, }; -static void kunit_log_test(struct kunit *test); - -static struct kunit_case kunit_log_test_cases[] = { - KUNIT_CASE(kunit_log_test), - {} -}; - -static struct kunit_suite kunit_log_test_suite = { - .name = "kunit-log-test", - .test_cases = kunit_log_test_cases, -}; - static void kunit_log_test(struct kunit *test) { struct kunit_suite suite; @@ -481,6 +469,29 @@ static void kunit_log_test(struct kunit *test) #endif } +static void kunit_log_newline_test(struct kunit *test) +{ + kunit_info(test, "Add newline\n"); + if (test->log) { + KUNIT_ASSERT_NOT_NULL_MSG(test, strstr(test->log, "Add newline\n"), + "Missing log line, full log:\n%s", test->log); + KUNIT_EXPECT_NULL(test, strstr(test->log, "Add newline\n\n")); + } else { + kunit_skip(test, "only useful when debugfs is enabled"); + } +} + +static struct kunit_case kunit_log_test_cases[] = { + KUNIT_CASE(kunit_log_test), + KUNIT_CASE(kunit_log_newline_test), + {} +}; + +static struct kunit_suite kunit_log_test_suite = { + .name = "kunit-log-test", + .test_cases = kunit_log_test_cases, +}; + static void kunit_status_set_failure_test(struct kunit *test) { struct kunit fake; diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 811fcc376d2f..e2910b261112 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -108,6 +108,22 @@ static void kunit_print_test_stats(struct kunit *test, stats.total); } +/** + * kunit_log_newline() - Add newline to the end of log if one is not + * already present. + * @log: The log to add the newline to. + */ +static void kunit_log_newline(char *log) +{ + int log_len, len_left; + + log_len = strlen(log); + len_left = KUNIT_LOG_SIZE - log_len - 1; + + if (log_len > 0 && log[log_len - 1] != '\n') + strncat(log, "\n", len_left); +} + /* * Append formatted message to log, size of which is limited to * KUNIT_LOG_SIZE bytes (including null terminating byte). @@ -135,6 +151,8 @@ void kunit_log_append(char *log, const char *fmt, ...) 
vsnprintf(log + log_len, min(len, len_left), fmt, args); va_end(args); + /* Add newline to end of log if not already present. */ + kunit_log_newline(log); } EXPORT_SYMBOL_GPL(kunit_log_append); -- cgit From 984063339e9ec9b6d9b011169d1f330a505a7571 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 11 Mar 2023 03:14:46 +0000 Subject: kobject: define common logging prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All log messages start with the prefix "kobject: ". Deduplicate this by using the pr_fmt() facility. This makes the very long log strings shorter. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20230311-kobject-warning-v1-1-1ebba4f71fb5@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- lib/kobject.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index 6e2f0bee3560..09c81ffb8b33 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -10,6 +10,8 @@ * about using the kobject interface. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -127,7 +129,7 @@ static int fill_kobj_path(const struct kobject *kobj, char *path, int length) *(path + --length) = '/'; } - pr_debug("kobject: '%s' (%p): %s: path = '%s'\n", kobject_name(kobj), + pr_debug("'%s' (%p): %s: path = '%s'\n", kobject_name(kobj), kobj, __func__, path); return 0; @@ -223,7 +225,7 @@ static int kobject_add_internal(struct kobject *kobj) kobj->parent = parent; } - pr_debug("kobject: '%s' (%p): %s: parent: '%s', set: '%s'\n", + pr_debug("'%s' (%p): %s: parent: '%s', set: '%s'\n", kobject_name(kobj), kobj, __func__, parent ? kobject_name(parent) : "", kobj->kset ? kobject_name(&kobj->kset->kobj) : ""); @@ -359,7 +361,7 @@ static __printf(3, 0) int kobject_add_varg(struct kobject *kobj, retval = kobject_set_name_vargs(kobj, fmt, vargs); if (retval) { - pr_err("kobject: can not set name properly!\n"); + pr_err("can not set name properly!\n"); return retval; } kobj->parent = parent; @@ -588,7 +590,7 @@ static void __kobject_del(struct kobject *kobj) /* send "remove" if the caller did not do it but sent "add" */ if (kobj->state_add_uevent_sent && !kobj->state_remove_uevent_sent) { - pr_debug("kobject: '%s' (%p): auto cleanup 'remove' event\n", + pr_debug("'%s' (%p): auto cleanup 'remove' event\n", kobject_name(kobj), kobj); kobject_uevent(kobj, KOBJ_REMOVE); } @@ -658,16 +660,16 @@ static void kobject_cleanup(struct kobject *kobj) const struct kobj_type *t = get_ktype(kobj); const char *name = kobj->name; - pr_debug("kobject: '%s' (%p): %s, parent %p\n", + pr_debug("'%s' (%p): %s, parent %p\n", kobject_name(kobj), kobj, __func__, kobj->parent); if (t && !t->release) - pr_debug("kobject: '%s' (%p): does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n", + pr_debug("'%s' (%p): does not have a release() function, it is broken and must be fixed. 
See Documentation/core-api/kobject.rst.\n", kobject_name(kobj), kobj); /* remove from sysfs if the caller did not do it */ if (kobj->state_in_sysfs) { - pr_debug("kobject: '%s' (%p): auto cleanup kobject_del\n", + pr_debug("'%s' (%p): auto cleanup kobject_del\n", kobject_name(kobj), kobj); __kobject_del(kobj); } else { @@ -676,14 +678,14 @@ static void kobject_cleanup(struct kobject *kobj) } if (t && t->release) { - pr_debug("kobject: '%s' (%p): calling ktype release\n", + pr_debug("'%s' (%p): calling ktype release\n", kobject_name(kobj), kobj); t->release(kobj); } /* free name if we allocated it */ if (name) { - pr_debug("kobject: '%s': free name\n", name); + pr_debug("'%s': free name\n", name); kfree_const(name); } @@ -703,8 +705,8 @@ static void kobject_release(struct kref *kref) struct kobject *kobj = container_of(kref, struct kobject, kref); #ifdef CONFIG_DEBUG_KOBJECT_RELEASE unsigned long delay = HZ + HZ * get_random_u32_below(4); - pr_info("kobject: '%s' (%p): %s, parent %p (delayed %ld)\n", - kobject_name(kobj), kobj, __func__, kobj->parent, delay); + pr_info("'%s' (%p): %s, parent %p (delayed %ld)\n", + kobject_name(kobj), kobj, __func__, kobj->parent, delay); INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup); schedule_delayed_work(&kobj->release, delay); @@ -733,7 +735,7 @@ EXPORT_SYMBOL(kobject_put); static void dynamic_kobj_release(struct kobject *kobj) { - pr_debug("kobject: (%p): %s\n", kobj, __func__); + pr_debug("(%p): %s\n", kobj, __func__); kfree(kobj); } @@ -910,7 +912,7 @@ EXPORT_SYMBOL_GPL(kset_find_obj); static void kset_release(struct kobject *kobj) { struct kset *kset = container_of(kobj, struct kset, kobj); - pr_debug("kobject: '%s' (%p): %s\n", + pr_debug("'%s' (%p): %s\n", kobject_name(kobj), kobj, __func__); kfree(kset); } -- cgit From 64414da25baf3acf31350f75fdd10514bd8390b1 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Sat, 11 Mar 2023 03:14:47 +0000 Subject: kobject: align stacktrace levels to logging message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without an explicit level the stacktraces are printed at a default level. If this level does not match the one from the logging level it may happen that the stacktrace is shown without the message or vice versa. Both these cases are confusing, so make sure the user always sees both, the message and the stacktrace. 
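As a minimal illustration of the pattern the patch applies (the message text is invented for the example), the backtrace is emitted at the same level as the message so a console log-level filter shows or hides them together:

#include <linux/printk.h>

static void report_problem(const void *obj)
{
        pr_err("kobject (%p): something is seriously wrong\n", obj);
        dump_stack_lvl(KERN_ERR);       /* was dump_stack(), printed at the default level */
}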
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20230311-kobject-warning-v1-2-1ebba4f71fb5@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- lib/kobject.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index 09c81ffb8b33..f79a434e1231 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -340,7 +340,7 @@ void kobject_init(struct kobject *kobj, const struct kobj_type *ktype) /* do not error out as sometimes we can recover */ pr_err("kobject (%p): tried to init an initialized object, something is seriously wrong.\n", kobj); - dump_stack(); + dump_stack_lvl(KERN_ERR); } kobject_init_internal(kobj); @@ -349,7 +349,7 @@ void kobject_init(struct kobject *kobj, const struct kobj_type *ktype) error: pr_err("kobject (%p): %s\n", kobj, err_str); - dump_stack(); + dump_stack_lvl(KERN_ERR); } EXPORT_SYMBOL(kobject_init); @@ -413,7 +413,7 @@ int kobject_add(struct kobject *kobj, struct kobject *parent, if (!kobj->state_initialized) { pr_err("kobject '%s' (%p): tried to add an uninitialized object, something is seriously wrong.\n", kobject_name(kobj), kobj); - dump_stack(); + dump_stack_lvl(KERN_ERR); return -EINVAL; } va_start(args, fmt); -- cgit From 1470afefc3c42df5d1662f87d079b46651bdc95b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 15 Mar 2023 17:31:02 -0700 Subject: cpumask: introduce for_each_cpu_or Equivalent of for_each_cpu_and, except it ORs the two masks together so it iterates all the CPUs present in either mask. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- lib/find_bit.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'lib') diff --git a/lib/find_bit.c b/lib/find_bit.c index c10920e66788..32f99e9a670e 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -182,6 +182,15 @@ unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned l EXPORT_SYMBOL(_find_next_andnot_bit); #endif +#ifndef find_next_or_bit +unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, + unsigned long nbits, unsigned long start) +{ + return FIND_NEXT_BIT(addr1[idx] | addr2[idx], /* nop */, nbits, start); +} +EXPORT_SYMBOL(_find_next_or_bit); +#endif + #ifndef find_next_zero_bit unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start) -- cgit From 8b57b11cca88f397035a95b9e12b03511847b0e8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 15 Mar 2023 17:31:02 -0700 Subject: pcpcntrs: fix dying cpu summation race In commit f689054aace2 ("percpu_counter: add percpu_counter_sum_all interface") a race condition between a cpu dying and percpu_counter_sum() iterating online CPUs was identified. The solution was to iterate all possible CPUs for summation via percpu_counter_sum_all(). We recently had a percpu_counter_sum() call in XFS trip over this same race condition and it fired a debug assert because the filesystem was unmounting and the counter *should* be zero just before we destroy it. That was reported here: https://lore.kernel.org/linux-kernel/20230314090649.326642-1-yebin@huaweicloud.com/ likely as a result of running generic/648 which exercises filesystems in the presence of CPU online/offline events. The solution to use percpu_counter_sum_all() is an awful one. 
We use percpu counters and percpu_counter_sum() for accurate and reliable threshold detection for space management, so a summation race condition during these operations can result in overcommit of available space and that may result in filesystem shutdowns. As percpu_counter_sum_all() iterates all possible CPUs rather than just those online or even those present, the mask can include CPUs that aren't even installed in the machine, or in the case of machines that can hot-plug CPU capable nodes, even have physical sockets present in the machine. Fundamentally, this race condition is caused by the CPU being offlined being removed from the cpu_online_mask before the notifier that cleans up per-cpu state is run. Hence percpu_counter_sum() will not sum the count for a cpu currently being taken offline, regardless of whether the notifier has run or not. This is the root cause of the bug. The percpu counter notifier iterates all the registered counters, locks the counter and moves the percpu count to the global sum. This is serialised against other operations that move the percpu counter to the global sum as well as percpu_counter_sum() operations that sum the percpu counts while holding the counter lock. Hence the notifier is safe to run concurrently with sum operations, and the only thing we actually need to care about is that percpu_counter_sum() iterates dying CPUs. That's trivial to do, and when there are no CPUs dying, it has no addition overhead except for a cpumask_or() operation. This change makes percpu_counter_sum() always do the right thing in the presence of CPU hot unplug events and makes percpu_counter_sum_all() unnecessary. This, in turn, means that filesystems like XFS, ext4, and btrfs don't have to work out when they should use percpu_counter_sum() vs percpu_counter_sum_all() in their space accounting algorithms Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- lib/percpu_counter.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index dba56c5c1837..0e096311e0c0 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -131,7 +131,7 @@ static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc, raw_spin_lock_irqsave(&fbc->lock, flags); ret = fbc->count; - for_each_cpu(cpu, cpu_mask) { + for_each_cpu_or(cpu, cpu_online_mask, cpu_mask) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } @@ -141,11 +141,20 @@ static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc, /* * Add up all the per-cpu counts, return the result. This is a more accurate - * but much slower version of percpu_counter_read_positive() + * but much slower version of percpu_counter_read_positive(). + * + * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums + * from CPUs that are in the process of being taken offline. Dying cpus have + * been removed from the online mask, but may not have had the hotplug dead + * notifier called to fold the percpu count back into the global counter sum. + * By including dying CPUs in the iteration mask, we avoid this race condition + * so __percpu_counter_sum() just does the right thing when CPUs are being taken + * offline. 
*/ s64 __percpu_counter_sum(struct percpu_counter *fbc) { - return __percpu_counter_sum_mask(fbc, cpu_online_mask); + + return __percpu_counter_sum_mask(fbc, cpu_dying_mask); } EXPORT_SYMBOL(__percpu_counter_sum); -- cgit From e9b60c7f97130795c7aa81a649ae4b93a172a277 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 15 Mar 2023 17:31:03 -0700 Subject: pcpcntr: remove percpu_counter_sum_all() percpu_counter_sum_all() is now redundant as the race condition it was invented to handle is now dealt with by percpu_counter_sum() directly and all users of percpu_counter_sum_all() have been removed. Remove it. This effectively reverts the changes made in f689054aace2 ("percpu_counter: add percpu_counter_sum_all interface") except for the cpumask iteration that fixes percpu_counter_sum() made earlier in this series. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- lib/percpu_counter.c | 40 +++++++++++----------------------------- 1 file changed, 11 insertions(+), 29 deletions(-) (limited to 'lib') diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 0e096311e0c0..5004463c4f9f 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -122,23 +122,6 @@ void percpu_counter_sync(struct percpu_counter *fbc) } EXPORT_SYMBOL(percpu_counter_sync); -static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc, - const struct cpumask *cpu_mask) -{ - s64 ret; - int cpu; - unsigned long flags; - - raw_spin_lock_irqsave(&fbc->lock, flags); - ret = fbc->count; - for_each_cpu_or(cpu, cpu_online_mask, cpu_mask) { - s32 *pcount = per_cpu_ptr(fbc->counters, cpu); - ret += *pcount; - } - raw_spin_unlock_irqrestore(&fbc->lock, flags); - return ret; -} - /* * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive(). @@ -153,22 +136,21 @@ static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc, */ s64 __percpu_counter_sum(struct percpu_counter *fbc) { + s64 ret; + int cpu; + unsigned long flags; - return __percpu_counter_sum_mask(fbc, cpu_dying_mask); + raw_spin_lock_irqsave(&fbc->lock, flags); + ret = fbc->count; + for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { + s32 *pcount = per_cpu_ptr(fbc->counters, cpu); + ret += *pcount; + } + raw_spin_unlock_irqrestore(&fbc->lock, flags); + return ret; } EXPORT_SYMBOL(__percpu_counter_sum); -/* - * This is slower version of percpu_counter_sum as it traverses all possible - * cpus. Use this only in the cases where accurate data is needed in the - * presense of CPUs getting offlined. - */ -s64 percpu_counter_sum_all(struct percpu_counter *fbc) -{ - return __percpu_counter_sum_mask(fbc, cpu_possible_mask); -} -EXPORT_SYMBOL(percpu_counter_sum_all); - int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, struct lock_class_key *key) { -- cgit From 7ce93729091dff24e6a1c3578b58b36f4b1cf10a Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 10 Mar 2023 16:27:28 -0500 Subject: dyndbg: cleanup dynamic usage in ib_srp.c Currently, in dynamic_debug.h we only provide DEFINE_DYNAMIC_DEBUG_METADATA() and DYNAMIC_DEBUG_BRANCH() definitions if CONFIG_DYNAMIC_CORE is enabled. Thus, drivers such as infiniband srp (see: drivers/infiniband/ulp/srp/ib_srp.c) must provide their own definitions for !CONFIG_DYNAMIC_CORE. Thus, let's move this !CONFIG_DYNAMIC_CORE case into dynamic_debug.h. 
However, the dynamic debug interfaces should really only be defined if CONFIG_DYNAMIC_DEBUG is set or CONFIG_DYNAMIC_CORE is set along with DYNAMIC_DEBUG_MODULE, (see: Documentation/admin-guide/dynamic-debug-howto.rst). Thus, the undefined case becomes: !((CONFIG_DYNAMIC_DEBUG || (CONFIG_DYNAMIC_CORE && DYNAMIC_DEBUG_MODULE)). With those changes in place, we can remove the !CONFIG_DYNAMIC_CORE case from ib_srp.c This change was prompted by a build breakeage in ib_srp.c stemming from the inclusion of dynamic_debug.h unconditionally in module.h, due to commit 7deabd674988 ("dyndbg: use the module notifier callbacks"). In that case, if we have CONFIG_DYNAMIC_CORE=y and CONFIG_DYNAMIC_DEBUG=n then the definitions for DEFINE_DYNAMIC_DEBUG_METADATA() and DYNAMIC_DEBUG_BRANCH() are defined once in ib_srp.c and then again in the dynamic_debug.h. This had been working prior to the above referenced commit because dynamic_debug.h was only pulled into ib_srp.c conditinally via printk.h if CONFIG_DYNAMIC_DEBUG was set. Also, the exported functions in lib/dynamic_debug.c itself may not have a prototype if CONFIG_DYNAMIC_DEBUG=n and CONFIG_DYNAMIC_CORE=y. This would trigger the -Wmissing-prototypes warning. The exported functions are behind (include/linux/dynamic_debug.h): if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) Thus, by adding -DDYNAMIC_CONFIG_MODULE to the lib/Makefile we can ensure that the exported functions have a prototype in all cases, since lib/dynamic_debug.c is built whenever CONFIG_DYNAMIC_DEBUG_CORE=y. Fixes: 7deabd674988 ("dyndbg: use the module notifier callbacks") Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202303071444.sIbZTDCy-lkp@intel.com/ Signed-off-by: Jason Baron [mcgrof: adjust commit log, and remove urldefense from URL] Signed-off-by: Luis Chamberlain --- lib/Makefile | 3 +++ 1 file changed, 3 insertions(+) (limited to 'lib') diff --git a/lib/Makefile b/lib/Makefile index baf2821f7a00..7afcd85f78f6 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -231,6 +231,9 @@ lib-$(CONFIG_GENERIC_BUG) += bug.o obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o obj-$(CONFIG_DYNAMIC_DEBUG_CORE) += dynamic_debug.o +#ensure exported functions have prototypes +CFLAGS_dynamic_debug.o := -DDYNAMIC_DEBUG_MODULE + obj-$(CONFIG_SYMBOLIC_ERRNAME) += errname.o obj-$(CONFIG_NLATTR) += nlattr.o -- cgit From 322a7ce7a62f0593160bb80f5fba52d64967b92f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 13 Mar 2023 13:50:39 +0100 Subject: s390: enable DEBUG_FORCE_FUNCTION_ALIGN_64B Allow to enforce 64 byte function alignment like it is possible for a couple of other architectures. This may or may not be helpful for debugging performance problems, as described with the Kconfig option. Since the kernel works also with 64 byte function alignment there is no reason for not allowing to enforce this function alignment. 
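For illustration only (a hypothetical helper, not part of the patch), the property the option enforces can be spot-checked like this:

#include <linux/align.h>
#include <linux/types.h>

/* True if a function entry point honours the forced 64-byte alignment. */
static inline bool entry_is_64b_aligned(const void *func)
{
        return IS_ALIGNED((unsigned long)func, 64);
}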
Signed-off-by: Heiko Carstens --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c8b379e2e9ad..59d8f3080cba 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -502,7 +502,7 @@ config SECTION_MISMATCH_WARN_ONLY config DEBUG_FORCE_FUNCTION_ALIGN_64B bool "Force all function address 64B aligned" - depends on EXPERT && (X86_64 || ARM64 || PPC32 || PPC64 || ARC) + depends on EXPERT && (X86_64 || ARM64 || PPC32 || PPC64 || ARC || S390) select FUNCTION_ALIGNMENT_64B help There are cases that a commit from one domain changes the function -- cgit From aff69273af61f5d1c8fb401d6f19148d11629b41 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 10 Mar 2023 19:07:50 +0000 Subject: vdso: Improve cmd_vdso_check to check all dynamic relocations The actual intention is that no dynamic relocation exists in the VDSO. For this the VDSO build validates that the resulting .so file does not have any relocations which are specified via $(ARCH_REL_TYPE_ABS) per architecture, which is fragile as e.g. ARM64 lacks an entry for R_AARCH64_RELATIVE. Aside of that ARCH_REL_TYPE_ABS is a misnomer as it checks for relative relocations too. However, some GNU ld ports produce unneeded R_*_NONE relocation entries. If a port fails to determine the exact .rel[a].dyn size, the trailing zeros become R_*_NONE relocations. E.g. ld's powerpc port recently fixed https://sourceware.org/bugzilla/show_bug.cgi?id=29540). R_*_NONE are generally a no-op in the dynamic loaders. So just ignore them. Remove the ARCH_REL_TYPE_ABS defines and just validate that the resulting .so file does not contain any R_* relocation entries except R_*_NONE. Signed-off-by: Fangrui Song Signed-off-by: Thomas Gleixner Tested-by: Vincenzo Frascino # for aarch64 Reviewed-by: Christophe Leroy Reviewed-by: Vincenzo Frascino # for vDSO, aarch64 Acked-by: Michael Ellerman (powerpc) Link: https://lore.kernel.org/r/20230310190750.3323802-1-maskray@google.com --- lib/vdso/Makefile | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/lib/vdso/Makefile b/lib/vdso/Makefile index e814061d6aa0..9f031eafc465 100644 --- a/lib/vdso/Makefile +++ b/lib/vdso/Makefile @@ -5,18 +5,13 @@ GENERIC_VDSO_DIR := $(dir $(GENERIC_VDSO_MK_PATH)) c-gettimeofday-$(CONFIG_GENERIC_GETTIMEOFDAY) := $(addprefix $(GENERIC_VDSO_DIR), gettimeofday.c) -# This cmd checks that the vdso library does not contain absolute relocation +# This cmd checks that the vdso library does not contain dynamic relocations. # It has to be called after the linking of the vdso library and requires it # as a parameter. # -# $(ARCH_REL_TYPE_ABS) is defined in the arch specific makefile and corresponds -# to the absolute relocation types printed by "objdump -R" and accepted by the -# dynamic linker. -ifndef ARCH_REL_TYPE_ABS -$(error ARCH_REL_TYPE_ABS is not set) -endif - +# As a workaround for some GNU ld ports which produce unneeded R_*_NONE +# dynamic relocations, ignore R_*_NONE. quiet_cmd_vdso_check = VDSOCHK $@ - cmd_vdso_check = if $(OBJDUMP) -R $@ | grep -E -h "$(ARCH_REL_TYPE_ABS)"; \ + cmd_vdso_check = if $(READELF) -rW $@ | grep -v _NONE | grep -q " R_\w*_"; \ then (echo >&2 "$@: dynamic relocations are not supported"; \ rm -f $@; /bin/false); fi -- cgit From 0fa99fdfe1b38da396d0b2d1496a823bcd0ebea0 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Tue, 7 Mar 2023 13:02:46 -0500 Subject: maple_tree: fix mas_skip_node() end slot detection Patch series "Fix mas_skip_node() for mas_empty_area()", v2. mas_empty_area() was incorrectly returning an error when there was room. The issue was tracked down to mas_skip_node() using the incorrect end-of-slot count. Instead of using the nodes hard limit, the limit of data should be used. mas_skip_node() was also setting the min and max to that of the child node, which was unnecessary. Within these limits being set, there was also a bug that corrupted the maple state's max if the offset was set to the maximum node pivot. The bug was without consequence unless there was a sufficient gap in the next child node which would cause an error to be returned. This patch set fixes these errors by removing the limit setting from mas_skip_node() and uses the mas_data_end() for slot limits, and adds tests for all failures discovered. This patch (of 2): mas_skip_node() is used to move the maple state to the node with a higher limit. It does this by walking up the tree and increasing the slot count. Since slot count may not be able to be increased, it may need to walk up multiple times to find room to walk right to a higher limit node. The limit of slots that was being used was the node limit and not the last location of data in the node. This would cause the maple state to be shifted outside actual data and enter an error state, thus returning -EBUSY. The result of the incorrect error state means that mas_awalk() would return an error instead of finding the allocation space. The fix is to use mas_data_end() in mas_skip_node() to detect the nodes data end point and continue walking the tree up until it is safe to move to a node with a higher limit. The walk up the tree also sets the maple state limits so remove the buggy code from mas_skip_node(). Setting the limits had the unfortunate side effect of triggering another bug if the parent node was full and the there was no suitable gap in the second last child, but room in the next child. mas_skip_node() may also be passed a maple state in an error state from mas_anode_descend() when no allocations are available. Return on such an error state immediately. Link: https://lkml.kernel.org/r/20230307180247.2220303-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230307180247.2220303-2-Liam.Howlett@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. 
Howlett Reported-by: Snild Dolkow Link: https://lore.kernel.org/linux-mm/cb8dc31a-fef2-1d09-f133-e9f7b9f9e77a@sony.com/ Tested-by: Snild Dolkow Cc: Peng Zhang Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 646297cae5d1..9e2735cbc2b4 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5099,35 +5099,21 @@ static inline bool mas_rewind_node(struct ma_state *mas) */ static inline bool mas_skip_node(struct ma_state *mas) { - unsigned char slot, slot_count; - unsigned long *pivots; - enum maple_type mt; + if (mas_is_err(mas)) + return false; - mt = mte_node_type(mas->node); - slot_count = mt_slots[mt] - 1; do { if (mte_is_root(mas->node)) { - slot = mas->offset; - if (slot > slot_count) { + if (mas->offset >= mas_data_end(mas)) { mas_set_err(mas, -EBUSY); return false; } } else { mas_ascend(mas); - slot = mas->offset; - mt = mte_node_type(mas->node); - slot_count = mt_slots[mt] - 1; } - } while (slot > slot_count); - - mas->offset = ++slot; - pivots = ma_pivots(mas_mn(mas), mt); - if (slot > 0) - mas->min = pivots[slot - 1] + 1; - - if (slot <= slot_count) - mas->max = pivots[slot]; + } while (mas->offset >= mas_data_end(mas)); + mas->offset++; return true; } -- cgit From 4bd6dded6318dc8e2514d74868c1f8fb38b61a60 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 7 Mar 2023 13:02:47 -0500 Subject: test_maple_tree: add more testing for mas_empty_area() Test robust filling of an entire area of the tree, then test one beyond. This is to test the walking back up the tree at the end of nodes and error condition. Test inspired by the reproducer code provided by Snild Dolkow. The last test in the function tests for the case of a corrupted maple state caused by the incorrect limits set during mas_skip_node(). There needs to be a gap in the second last child and last child, but the search must rule out the second last child's gap. This would avoid correcting the maple state to the correct max limit and return an error. Link: https://lkml.kernel.org/r/20230307180247.2220303-3-Liam.Howlett@oracle.com Cc: Snild Dolkow Link: https://lore.kernel.org/linux-mm/cb8dc31a-fef2-1d09-f133-e9f7b9f9e77a@sony.com/ Fixes: e15e06a83923 ("lib/test_maple_tree: add testing for maple tree") Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'lib') diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 3d19b1f78d71..f1db333270e9 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -2670,6 +2670,49 @@ static noinline void check_empty_area_window(struct maple_tree *mt) rcu_read_unlock(); } +static noinline void check_empty_area_fill(struct maple_tree *mt) +{ + const unsigned long max = 0x25D78000; + unsigned long size; + int loop, shift; + MA_STATE(mas, mt, 0, 0); + + mt_set_non_kernel(99999); + for (shift = 12; shift <= 16; shift++) { + loop = 5000; + size = 1 << shift; + while (loop--) { + mas_set(&mas, 0); + mas_lock(&mas); + MT_BUG_ON(mt, mas_empty_area(&mas, 0, max, size) != 0); + MT_BUG_ON(mt, mas.last != mas.index + size - 1); + mas_store_gfp(&mas, (void *)size, GFP_KERNEL); + mas_unlock(&mas); + mas_reset(&mas); + } + } + + /* No space left. 
*/ + size = 0x1000; + rcu_read_lock(); + MT_BUG_ON(mt, mas_empty_area(&mas, 0, max, size) != -EBUSY); + rcu_read_unlock(); + + /* Fill a depth 3 node to the maximum */ + for (unsigned long i = 629440511; i <= 629440800; i += 6) + mtree_store_range(mt, i, i + 5, (void *)i, GFP_KERNEL); + /* Make space in the second-last depth 4 node */ + mtree_erase(mt, 631668735); + /* Make space in the last depth 4 node */ + mtree_erase(mt, 629506047); + mas_reset(&mas); + /* Search from just after the gap in the second-last depth 4 */ + rcu_read_lock(); + MT_BUG_ON(mt, mas_empty_area(&mas, 629506048, 690000000, 0x5000) != 0); + rcu_read_unlock(); + mt_set_non_kernel(0); +} + static DEFINE_MTREE(tree); static int maple_tree_seed(void) { @@ -2926,6 +2969,11 @@ static int maple_tree_seed(void) check_empty_area_window(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_empty_area_fill(&tree); + mtree_destroy(&tree); + + #if defined(BENCH) skip: #endif -- cgit From 13684e966d46283e0e89b6a4941596dc52b18bf3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 15 Mar 2023 15:28:17 +0100 Subject: lib: dhry: fix unstable smp_processor_id(_) usage When running the in-kernel Dhrystone benchmark with CONFIG_DEBUG_PREEMPT=y: BUG: using smp_processor_id() in preemptible [00000000] code: bash/938 Fix this by not using smp_processor_id() directly, but instead wrapping the whole benchmark inside a get_cpu()/put_cpu() pair. This makes sure the whole benchmark is run on the same CPU core, and the reported values are consistent. Link: https://lkml.kernel.org/r/b0d29932bb24ad82cea7f821e295c898e9657be0.1678890070.git.geert+renesas@glider.be Fixes: d5528cc16893f1f6 ("lib: add Dhrystone benchmark test") Signed-off-by: Geert Uytterhoeven Reported-by: Tobias Klausmann Link: https://bugzilla.kernel.org/show_bug.cgi?id=217179 Signed-off-by: Andrew Morton --- lib/dhry_run.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/dhry_run.c b/lib/dhry_run.c index f9d33efa6d09..f15ac666e9d3 100644 --- a/lib/dhry_run.c +++ b/lib/dhry_run.c @@ -31,6 +31,7 @@ MODULE_PARM_DESC(iterations, static void dhry_benchmark(void) { + unsigned int cpu = get_cpu(); int i, n; if (iterations > 0) { @@ -45,9 +46,10 @@ static void dhry_benchmark(void) } report: + put_cpu(); if (n >= 0) - pr_info("CPU%u: Dhrystones per Second: %d (%d DMIPS)\n", - smp_processor_id(), n, n / DHRY_VAX); + pr_info("CPU%u: Dhrystones per Second: %d (%d DMIPS)\n", cpu, + n, n / DHRY_VAX); else if (n == -EAGAIN) pr_err("Please increase the number of iterations\n"); else -- cgit From c52198601695851622f361d3f16456e9fc857629 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 20 Mar 2023 17:55:13 -0700 Subject: locking/csd_lock: Add Kconfig option for csd_debug default The csd_debug kernel parameter works well, but is inconvenient in cases where it is more closely associated with boot loaders or automation than with a particular kernel version or release. Thererfore, provide a new CSD_LOCK_WAIT_DEBUG_DEFAULT Kconfig option that defaults csd_debug to 1 when selected and 0 otherwise, with this latter being the default. Signed-off-by: Paul E. 
McKenney Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juergen Gross Link: https://lore.kernel.org/r/20230321005516.50558-1-paulmck@kernel.org --- lib/Kconfig.debug | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c8b379e2e9ad..e1b160a0474d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1480,6 +1480,15 @@ config CSD_LOCK_WAIT_DEBUG include the IPI handler function currently executing (if any) and relevant stack traces. +config CSD_LOCK_WAIT_DEBUG_DEFAULT + bool "Default csd_lock_wait() debugging on at boot time" + depends on CSD_LOCK_WAIT_DEBUG + depends on 64BIT + default n + help + This option causes the csdlock_debug= kernel boot parameter to + default to 1 (basic debugging) instead of 0 (no debugging). + endmenu # lock debugging config TRACE_IRQFLAGS -- cgit From 4e0473f1060aa49621d40a113afde24818101d37 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 8 Feb 2023 07:51:02 +0200 Subject: lib: cpu_rmap: Avoid use after free on rmap->obj array entries When calling irq_set_affinity_notifier() with NULL at the notify argument, it will cause freeing of the glue pointer in the corresponding array entry but will leave the pointer in the array. A subsequent call to free_irq_cpu_rmap() will try to free this entry again leading to possible use after free. Fix that by setting NULL to the array entry and checking that we have non-zero at the array entry when iterating over the array in free_irq_cpu_rmap(). The current code does not suffer from this since there are no cases where irq_set_affinity_notifier(irq, NULL) (note the NULL passed for the notify arg) is called, followed by a call to free_irq_cpu_rmap() so we don't hit and issue. Subsequent patches in this series excersize this flow, hence the required fix. Cc: Thomas Gleixner Signed-off-by: Eli Cohen Signed-off-by: Saeed Mahameed Reviewed-by: Jacob Keller --- lib/cpu_rmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c index f08d9c56f712..e77f12bb3c77 100644 --- a/lib/cpu_rmap.c +++ b/lib/cpu_rmap.c @@ -232,7 +232,8 @@ void free_irq_cpu_rmap(struct cpu_rmap *rmap) for (index = 0; index < rmap->used; index++) { glue = rmap->obj[index]; - irq_set_affinity_notifier(glue->notify.irq, NULL); + if (glue) + irq_set_affinity_notifier(glue->notify.irq, NULL); } cpu_rmap_put(rmap); @@ -268,6 +269,7 @@ static void irq_cpu_rmap_release(struct kref *ref) container_of(ref, struct irq_glue, notify.kref); cpu_rmap_put(glue->rmap); + glue->rmap->obj[glue->index] = NULL; kfree(glue); } @@ -297,6 +299,7 @@ int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq) rc = irq_set_affinity_notifier(irq, &glue->notify); if (rc) { cpu_rmap_put(glue->rmap); + rmap->obj[glue->index] = NULL; kfree(glue); } return rc; -- cgit From 9821d8d4628e630ab56f47a8e6b878a2576e069b Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Tue, 14 Feb 2023 09:29:46 +0200 Subject: lib: cpu_rmap: Use allocator for rmap entries Use a proper allocator for rmap entries using a naive for loop. The allocator relies on whether an entry is NULL to be considered free. Remove the used field of rmap which is not needed. Also, avoid crashing the kernel if an entry is not available. 
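A rough caller-side sketch (not part of this patch, names are generic): with the used field gone, cpu_rmap_add() can fail, so its return value has to be checked before the index is used:

	index = cpu_rmap_add(rmap, obj);	/* may now return -ENOSPC */
	if (index < 0)
		return index;			/* no free entry available */
	/* the slot is released later by putting NULL back into rmap->obj[index] */

irq_cpu_rmap_add() in the diff below is converted to exactly this pattern.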
Cc: Thomas Gleixner Signed-off-by: Eli Cohen Signed-off-by: Saeed Mahameed Reviewed-by: Jacob Keller --- lib/cpu_rmap.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c index e77f12bb3c77..5d4bf7a8b926 100644 --- a/lib/cpu_rmap.c +++ b/lib/cpu_rmap.c @@ -128,19 +128,31 @@ debug_print_rmap(const struct cpu_rmap *rmap, const char *prefix) } #endif +static int get_free_index(struct cpu_rmap *rmap) +{ + int i; + + for (i = 0; i < rmap->size; i++) + if (!rmap->obj[i]) + return i; + + return -ENOSPC; +} + /** * cpu_rmap_add - add object to a rmap * @rmap: CPU rmap allocated with alloc_cpu_rmap() * @obj: Object to add to rmap * - * Return index of object. + * Return index of object or -ENOSPC if no free entry was found */ int cpu_rmap_add(struct cpu_rmap *rmap, void *obj) { - u16 index; + int index = get_free_index(rmap); + + if (index < 0) + return index; - BUG_ON(rmap->used >= rmap->size); - index = rmap->used++; rmap->obj[index] = obj; return index; } @@ -230,7 +242,7 @@ void free_irq_cpu_rmap(struct cpu_rmap *rmap) if (!rmap) return; - for (index = 0; index < rmap->used; index++) { + for (index = 0; index < rmap->size; index++) { glue = rmap->obj[index]; if (glue) irq_set_affinity_notifier(glue->notify.irq, NULL); @@ -295,13 +307,22 @@ int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq) glue->notify.release = irq_cpu_rmap_release; glue->rmap = rmap; cpu_rmap_get(rmap); - glue->index = cpu_rmap_add(rmap, glue); + rc = cpu_rmap_add(rmap, glue); + if (rc < 0) + goto err_add; + + glue->index = rc; rc = irq_set_affinity_notifier(irq, &glue->notify); - if (rc) { - cpu_rmap_put(glue->rmap); - rmap->obj[glue->index] = NULL; - kfree(glue); - } + if (rc) + goto err_set; + + return rc; + +err_set: + rmap->obj[glue->index] = NULL; +err_add: + cpu_rmap_put(glue->rmap); + kfree(glue); return rc; } EXPORT_SYMBOL(irq_cpu_rmap_add); -- cgit From 71f0a2478605c100358a9f9e174849fa643bf8a7 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Tue, 14 Feb 2023 11:05:46 +0200 Subject: lib: cpu_rmap: Add irq_cpu_rmap_remove to complement irq_cpu_rmap_add Add a function to complement irq_cpu_rmap_add(). It removes the irq from the reverse mapping by setting the notifier to NULL. The function calls irq_set_affinity_notifier() with NULL at the notify argument which then cancel any pending notifier work and decrement reference on the notifier. When ref count reaches zero, the glue pointer is kfree and the rmap entry is set to NULL serving both to avoid second attempt to release it and also making the rmap entry available for subsequent mapping. It should be noted the drivers usually creates the reverse mapping at initialization time and remove it at unload time so we do not expect failures in allocating rmap due to kref holding the glue entry. 
Cc: Thomas Gleixner Signed-off-by: Eli Cohen Signed-off-by: Saeed Mahameed Reviewed-by: Jacob Keller --- lib/cpu_rmap.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'lib') diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c index 5d4bf7a8b926..73c1636b927b 100644 --- a/lib/cpu_rmap.c +++ b/lib/cpu_rmap.c @@ -285,6 +285,17 @@ static void irq_cpu_rmap_release(struct kref *ref) kfree(glue); } +/** + * irq_cpu_rmap_remove - remove an IRQ from a CPU affinity reverse-map + * @rmap: The reverse-map + * @irq: The IRQ number + */ +int irq_cpu_rmap_remove(struct cpu_rmap *rmap, int irq) +{ + return irq_set_affinity_notifier(irq, NULL); +} +EXPORT_SYMBOL(irq_cpu_rmap_remove); + /** * irq_cpu_rmap_add - add an IRQ to a CPU affinity reverse-map * @rmap: The reverse-map -- cgit From ee1ee6db07795d9637bc5e8993a8ddcf886541ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Mar 2023 21:55:31 +0100 Subject: atomics: Provide rcuref - scalable reference counting atomic_t based reference counting, including refcount_t, uses atomic_inc_not_zero() for acquiring a reference. atomic_inc_not_zero() is implemented with a atomic_try_cmpxchg() loop. High contention of the reference count leads to retry loops and scales badly. There is nothing to improve on this implementation as the semantics have to be preserved. Provide rcuref as a scalable alternative solution which is suitable for RCU managed objects. Similar to refcount_t it comes with overflow and underflow detection and mitigation. rcuref treats the underlying atomic_t as an unsigned integer and partitions this space into zones: 0x00000000 - 0x7FFFFFFF valid zone (1 .. (INT_MAX + 1) references) 0x80000000 - 0xBFFFFFFF saturation zone 0xC0000000 - 0xFFFFFFFE dead zone 0xFFFFFFFF no reference rcuref_get() unconditionally increments the reference count with atomic_add_negative_relaxed(). rcuref_put() unconditionally decrements the reference count with atomic_add_negative_release(). This unconditional increment avoids the inc_not_zero() problem, but requires a more complex implementation on the put() side when the count drops from 0 to -1. When this transition is detected then it is attempted to mark the reference count dead, by setting it to the midpoint of the dead zone with a single atomic_cmpxchg_release() operation. This operation can fail due to a concurrent rcuref_get() elevating the reference count from -1 to 0 again. If the unconditional increment in rcuref_get() hits a reference count which is marked dead (or saturated) it will detect it after the fact and bring back the reference count to the midpoint of the respective zone. The zones provide enough tolerance which makes it practically impossible to escape from a zone. The racy implementation of rcuref_put() requires to protect rcuref_put() against a grace period ending in order to prevent a subtle use after free. As RCU is the only mechanism which allows to protect against that, it is not possible to fully replace the atomic_inc_not_zero() based implementation of refcount_t with this scheme. The final drop is slightly more expensive than the atomic_dec_return() counterpart, but that's not the case which this is optimized for. The optimization is on the high frequeunt get()/put() pairs and their scalability. The performance of an uncontended rcuref_get()/put() pair where the put() is not dropping the last reference is still on par with the plain atomic operations, while at the same time providing overflow and underflow detection and mitigation. 
The performance of rcuref compared to plain atomic_inc_not_zero() and atomic_dec_return() based reference counting under contention: - Micro benchmark: All CPUs running a increment/decrement loop on an elevated reference count, which means the 0 to -1 transition never happens. The performance gain depends on microarchitecture and the number of CPUs and has been observed in the range of 1.3X to 4.7X - Conversion of dst_entry::__refcnt to rcuref and testing with the localhost memtier/memcached benchmark. That benchmark shows the reference count contention prominently. The performance gain depends on microarchitecture and the number of CPUs and has been observed in the range of 1.1X to 2.6X over the previous fix for the false sharing issue vs. struct dst_entry::__refcnt. When memtier is run over a real 1Gb network connection, there is a small gain on top of the false sharing fix. The two changes combined result in a 2%-5% total gain for that networked test. Reported-by: Wangyang Guo Reported-by: Arjan Van De Ven Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230323102800.158429195@linutronix.de --- lib/Makefile | 2 +- lib/rcuref.c | 281 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 282 insertions(+), 1 deletion(-) create mode 100644 lib/rcuref.c (limited to 'lib') diff --git a/lib/Makefile b/lib/Makefile index baf2821f7a00..31a3a257fd49 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -47,7 +47,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ list_sort.o uuid.o iov_iter.o clz_ctz.o \ bsearch.o find_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o rhashtable.o base64.o \ - once.o refcount.o usercopy.o errseq.o bucket_locks.o \ + once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \ generic-radix-tree.o obj-$(CONFIG_STRING_SELFTEST) += test_string.o obj-y += string_helpers.o diff --git a/lib/rcuref.c b/lib/rcuref.c new file mode 100644 index 000000000000..5ec00a4a64d1 --- /dev/null +++ b/lib/rcuref.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * rcuref - A scalable reference count implementation for RCU managed objects + * + * rcuref is provided to replace open coded reference count implementations + * based on atomic_t. It protects explicitely RCU managed objects which can + * be visible even after the last reference has been dropped and the object + * is heading towards destruction. + * + * A common usage pattern is: + * + * get() + * rcu_read_lock(); + * p = get_ptr(); + * if (p && !atomic_inc_not_zero(&p->refcnt)) + * p = NULL; + * rcu_read_unlock(); + * return p; + * + * put() + * if (!atomic_dec_return(&->refcnt)) { + * remove_ptr(p); + * kfree_rcu((p, rcu); + * } + * + * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has + * O(N^2) behaviour under contention with N concurrent operations. + * + * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales + * better under contention. + * + * Why not refcount? + * ================= + * + * In principle it should be possible to make refcount use the rcuref + * scheme, but the destruction race described below cannot be prevented + * unless the protected object is RCU managed. + * + * Theory of operation + * =================== + * + * rcuref uses an unsigned integer reference counter. 
As long as the + * counter value is greater than or equal to RCUREF_ONEREF and not larger + * than RCUREF_MAXREF the reference is alive: + * + * ONEREF MAXREF SATURATED RELEASED DEAD NOREF + * 0 0x7FFFFFFF 0x8000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF + * <---valid --------> <-------saturation zone-------> <-----dead zone-----> + * + * The get() and put() operations do unconditional increments and + * decrements. The result is checked after the operation. This optimizes + * for the fast path. + * + * If the reference count is saturated or dead, then the increments and + * decrements are not harmful as the reference count still stays in the + * respective zones and is always set back to STATURATED resp. DEAD. The + * zones have room for 2^28 racing operations in each direction, which + * makes it practically impossible to escape the zones. + * + * Once the last reference is dropped the reference count becomes + * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The + * slowpath then tries to set the reference count from RCUREF_NOREF to + * RCUREF_DEAD via a cmpxchg(). This opens a small window where a + * concurrent rcuref_get() can acquire the reference count and bring it + * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD. + * + * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in + * DEAD + 1, which is inside the dead zone. If that happens the reference + * count is put back to DEAD. + * + * The actual race is possible due to the unconditional increment and + * decrements in rcuref_get() and rcuref_put(): + * + * T1 T2 + * get() put() + * if (atomic_add_negative(-1, &ref->refcnt)) + * succeeds-> atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); + * + * atomic_add_negative(1, &ref->refcnt); <- Elevates refcount to DEAD + 1 + * + * As the result of T1's add is negative, the get() goes into the slow path + * and observes refcnt being in the dead zone which makes the operation fail. + * + * Possible critical states: + * + * Context Counter References Operation + * T1 0 1 init() + * T2 1 2 get() + * T1 0 1 put() + * T2 -1 0 put() tries to mark dead + * T1 0 1 get() + * T2 0 1 put() mark dead fails + * T1 -1 0 put() tries to mark dead + * T1 DEAD 0 put() mark dead succeeds + * T2 DEAD+1 0 get() fails and puts it back to DEAD + * + * Of course there are more complex scenarios, but the above illustrates + * the working principle. The rest is left to the imagination of the + * reader. + * + * Deconstruction race + * =================== + * + * The release operation must be protected by prohibiting a grace period in + * order to prevent a possible use after free: + * + * T1 T2 + * put() get() + * // ref->refcnt = ONEREF + * if (!atomic_add_negative(-1, &ref->refcnt)) + * return false; <- Not taken + * + * // ref->refcnt == NOREF + * --> preemption + * // Elevates ref->refcnt to ONEREF + * if (!atomic_add_negative(1, &ref->refcnt)) + * return true; <- taken + * + * if (put(&p->ref)) { <-- Succeeds + * remove_pointer(p); + * kfree_rcu(p, rcu); + * } + * + * RCU grace period ends, object is freed + * + * atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); <- UAF + * + * This is prevented by disabling preemption around the put() operation as + * that's in most kernel configurations cheaper than a rcu_read_lock() / + * rcu_read_unlock() pair and in many cases even a NOOP. In any case it + * prevents the grace period which keeps the object alive until all put() + * operations complete. 
+ * + * Saturation protection + * ===================== + * + * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX). + * Once this is exceedded the reference count becomes stale by setting it + * to RCUREF_SATURATED, which will cause a memory leak, but it prevents + * wrap arounds which obviously cause worse problems than a memory + * leak. When saturation is reached a warning is emitted. + * + * Race conditions + * =============== + * + * All reference count increment/decrement operations are unconditional and + * only verified after the fact. This optimizes for the good case and takes + * the occasional race vs. a dead or already saturated refcount into + * account. The saturation and dead zones are large enough to accomodate + * for that. + * + * Memory ordering + * =============== + * + * Memory ordering rules are slightly relaxed wrt regular atomic_t functions + * and provide only what is strictly required for refcounts. + * + * The increments are fully relaxed; these will not provide ordering. The + * rationale is that whatever is used to obtain the object to increase the + * reference count on will provide the ordering. For locked data + * structures, its the lock acquire, for RCU/lockless data structures its + * the dependent load. + * + * rcuref_get() provides a control dependency ordering future stores which + * ensures that the object is not modified when acquiring a reference + * fails. + * + * rcuref_put() provides release order, i.e. all prior loads and stores + * will be issued before. It also provides a control dependency ordering + * against the subsequent destruction of the object. + * + * If rcuref_put() successfully dropped the last reference and marked the + * object DEAD it also provides acquire ordering. + */ + +#include +#include + +/** + * rcuref_get_slowpath - Slowpath of rcuref_get() + * @ref: Pointer to the reference count + * + * Invoked when the reference count is outside of the valid zone. + * + * Return: + * False if the reference count was already marked dead + * + * True if the reference count is saturated, which prevents the + * object from being deconstructed ever. + */ +bool rcuref_get_slowpath(rcuref_t *ref) +{ + unsigned int cnt = atomic_read(&ref->refcnt); + + /* + * If the reference count was already marked dead, undo the + * increment so it stays in the middle of the dead zone and return + * fail. + */ + if (cnt >= RCUREF_RELEASED) { + atomic_set(&ref->refcnt, RCUREF_DEAD); + return false; + } + + /* + * If it was saturated, warn and mark it so. In case the increment + * was already on a saturated value restore the saturation + * marker. This keeps it in the middle of the saturation zone and + * prevents the reference count from overflowing. This leaks the + * object memory, but prevents the obvious reference count overflow + * damage. + */ + if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory")) + atomic_set(&ref->refcnt, RCUREF_SATURATED); + return true; +} +EXPORT_SYMBOL_GPL(rcuref_get_slowpath); + +/** + * rcuref_put_slowpath - Slowpath of __rcuref_put() + * @ref: Pointer to the reference count + * + * Invoked when the reference count is outside of the valid zone. + * + * Return: + * True if this was the last reference with no future references + * possible. This signals the caller that it can safely schedule the + * object, which is protected by the reference counter, for + * deconstruction. + * + * False if there are still active references or the put() raced + * with a concurrent get()/put() pair. 
Caller is not allowed to + * deconstruct the protected object. + */ +bool rcuref_put_slowpath(rcuref_t *ref) +{ + unsigned int cnt = atomic_read(&ref->refcnt); + + /* Did this drop the last reference? */ + if (likely(cnt == RCUREF_NOREF)) { + /* + * Carefully try to set the reference count to RCUREF_DEAD. + * + * This can fail if a concurrent get() operation has + * elevated it again or the corresponding put() even marked + * it dead already. Both are valid situations and do not + * require a retry. If this fails the caller is not + * allowed to deconstruct the object. + */ + if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF) + return false; + + /* + * The caller can safely schedule the object for + * deconstruction. Provide acquire ordering. + */ + smp_acquire__after_ctrl_dep(); + return true; + } + + /* + * If the reference count was already in the dead zone, then this + * put() operation is imbalanced. Warn, put the reference count back to + * DEAD and tell the caller to not deconstruct the object. + */ + if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) { + atomic_set(&ref->refcnt, RCUREF_DEAD); + return false; + } + + /* + * This is a put() operation on a saturated refcount. Restore the + * mean saturation value and tell the caller to not deconstruct the + * object. + */ + if (cnt > RCUREF_MAXREF) + atomic_set(&ref->refcnt, RCUREF_SATURATED); + return false; +} +EXPORT_SYMBOL_GPL(rcuref_put_slowpath); -- cgit From 35260cf545226c3b21d52a9d21083f7ff999969c Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Sun, 29 Jan 2023 11:10:09 +0800 Subject: Kconfig.debug: fix SCHED_DEBUG dependency The path for SCHED_DEBUG is /sys/kernel/debug/sched. So, SCHED_DEBUG should depend on DEBUG_FS, not PROC_FS. Link: https://lkml.kernel.org/r/202301291110098787982@zte.com.cn Signed-off-by: ye xingchen Cc: Dan Williams Cc: Geert Uytterhoeven Cc: Josh Poimboeuf Cc: Kees Cook Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Rasmus Villemoes Cc: Vlastimil Babka Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c8b379e2e9ad..3cc5d239964a 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1143,7 +1143,7 @@ menu "Scheduler Debugging" config SCHED_DEBUG bool "Collect scheduler debugging info" - depends on DEBUG_KERNEL && PROC_FS + depends on DEBUG_KERNEL && DEBUG_FS default y help If you say Y here, the /sys/kernel/debug/sched file will be provided -- cgit From f478b9987cc8236b412d9f2afc958d3e15a7cf85 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 21 Mar 2023 14:35:08 +0800 Subject: lib/Kconfig.debug: correct help info of LOCKDEP_STACK_TRACE_HASH_BITS We can see the following definition in kernel/locking/lockdep_internals.h: #define STACK_TRACE_HASH_SIZE (1 << CONFIG_LOCKDEP_STACK_TRACE_HASH_BITS) CONFIG_LOCKDEP_STACK_TRACE_HASH_BITS is related with STACK_TRACE_HASH_SIZE instead of MAX_STACK_TRACE_ENTRIES, fix it. 
Link: https://lkml.kernel.org/r/1679380508-20830-1-git-send-email-yangtiezhu@loongson.cn Fixes: 5dc33592e955 ("lockdep: Allow tuning tracing capacity constants.") Signed-off-by: Tiezhu Yang Cc: Dmitry Vyukov Cc: Tetsuo Handa Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3cc5d239964a..39d1d93164bd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1392,7 +1392,7 @@ config LOCKDEP_STACK_TRACE_HASH_BITS range 10 30 default 14 help - Try increasing this value if you need large MAX_STACK_TRACE_ENTRIES. + Try increasing this value if you need large STACK_TRACE_HASH_SIZE. config LOCKDEP_CIRCULAR_QUEUE_BITS int "Bitsize for elements in circular_queue struct" -- cgit From 76d0de5729c0569c4071e7f21fcab394e502f03a Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 2 Feb 2023 00:56:01 +0900 Subject: fprobe: Pass entry_data to handlers Pass the private entry_data to the entry and exit handlers so that they can share the context data, something like saved function arguments etc. User must specify the private entry_data size by @entry_data_size field before registering the fprobe. Link: https://lkml.kernel.org/r/167526696173.433354.17408372048319432574.stgit@mhiramat.roam.corp.google.com Cc: Florent Revest Cc: Mark Rutland Cc: Will Deacon Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- lib/test_fprobe.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/test_fprobe.c b/lib/test_fprobe.c index 1fb56cf5e5ce..e4f65d114ed2 100644 --- a/lib/test_fprobe.c +++ b/lib/test_fprobe.c @@ -30,7 +30,8 @@ static noinline u32 fprobe_selftest_target2(u32 value) return (value / div_factor) + 1; } -static notrace void fp_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs) +static notrace void fp_entry_handler(struct fprobe *fp, unsigned long ip, + struct pt_regs *regs, void *data) { KUNIT_EXPECT_FALSE(current_test, preemptible()); /* This can be called on the fprobe_selftest_target and the fprobe_selftest_target2 */ @@ -39,7 +40,8 @@ static notrace void fp_entry_handler(struct fprobe *fp, unsigned long ip, struct entry_val = (rand1 / div_factor); } -static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs) +static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip, + struct pt_regs *regs, void *data) { unsigned long ret = regs_return_value(regs); -- cgit From 34cabf8fd18f31c773b489d4113fbf6cb5b964c9 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 2 Feb 2023 00:56:10 +0900 Subject: lib/test_fprobe: Add private entry_data testcases Add test cases for checking whether private entry_data is correctly passed or not. 
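Stated as a sketch of the user-visible pattern the new test exercises (the handler bodies and the probed symbol are illustrative only, not taken from the test):

	static void my_entry(struct fprobe *fp, unsigned long ip,
			     struct pt_regs *regs, void *data)
	{
		*(unsigned long *)data = ip;	/* stash context for the exit side */
	}

	static void my_exit(struct fprobe *fp, unsigned long ip,
			    struct pt_regs *regs, void *data)
	{
		WARN_ON(*(unsigned long *)data != ip);
	}

	static struct fprobe fp = {
		.entry_handler   = my_entry,
		.exit_handler    = my_exit,
		.entry_data_size = sizeof(unsigned long),	/* size of *data per call */
	};

	static int __init my_init(void)
	{
		/* "some_function" is a placeholder for the traced symbol */
		return register_fprobe(&fp, "some_function", NULL);
	}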
Link: https://lkml.kernel.org/r/167526697074.433354.17790288501657876219.stgit@mhiramat.roam.corp.google.com Cc: Florent Revest Cc: Mark Rutland Cc: Will Deacon Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- lib/test_fprobe.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'lib') diff --git a/lib/test_fprobe.c b/lib/test_fprobe.c index e4f65d114ed2..6c7ef5acea21 100644 --- a/lib/test_fprobe.c +++ b/lib/test_fprobe.c @@ -38,6 +38,12 @@ static notrace void fp_entry_handler(struct fprobe *fp, unsigned long ip, if (ip != target_ip) KUNIT_EXPECT_EQ(current_test, ip, target2_ip); entry_val = (rand1 / div_factor); + if (fp->entry_data_size) { + KUNIT_EXPECT_NOT_NULL(current_test, data); + if (data) + *(u32 *)data = entry_val; + } else + KUNIT_EXPECT_NULL(current_test, data); } static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip, @@ -53,6 +59,12 @@ static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip, KUNIT_EXPECT_EQ(current_test, ret, (rand1 / div_factor)); KUNIT_EXPECT_EQ(current_test, entry_val, (rand1 / div_factor)); exit_val = entry_val + div_factor; + if (fp->entry_data_size) { + KUNIT_EXPECT_NOT_NULL(current_test, data); + if (data) + KUNIT_EXPECT_EQ(current_test, *(u32 *)data, entry_val); + } else + KUNIT_EXPECT_NULL(current_test, data); } /* Test entry only (no rethook) */ @@ -134,6 +146,23 @@ static void test_fprobe_syms(struct kunit *test) KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp)); } +/* Test private entry_data */ +static void test_fprobe_data(struct kunit *test) +{ + struct fprobe fp = { + .entry_handler = fp_entry_handler, + .exit_handler = fp_exit_handler, + .entry_data_size = sizeof(u32), + }; + + current_test = test; + KUNIT_EXPECT_EQ(test, 0, register_fprobe(&fp, "fprobe_selftest_target", NULL)); + + target(rand1); + + KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp)); +} + static unsigned long get_ftrace_location(void *func) { unsigned long size, addr = (unsigned long)func; @@ -159,6 +188,7 @@ static struct kunit_case fprobe_testcases[] = { KUNIT_CASE(test_fprobe_entry), KUNIT_CASE(test_fprobe), KUNIT_CASE(test_fprobe_syms), + KUNIT_CASE(test_fprobe_data), {} }; -- cgit From 7e7ef1bfe5522faab6f245ced7b6749e9ac410d8 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 2 Feb 2023 00:56:28 +0900 Subject: lib/test_fprobe: Add a test case for nr_maxactive Add a test case for nr_maxactive. If the number of active functions is more than nr_maxactive, it must be skipped. 
Link: https://lkml.kernel.org/r/167526698856.433354.4430007340787176666.stgit@mhiramat.roam.corp.google.com Cc: Florent Revest Cc: Mark Rutland Cc: Will Deacon Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- lib/test_fprobe.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'lib') diff --git a/lib/test_fprobe.c b/lib/test_fprobe.c index 6c7ef5acea21..4b37d7022f35 100644 --- a/lib/test_fprobe.c +++ b/lib/test_fprobe.c @@ -17,8 +17,10 @@ static u32 rand1, entry_val, exit_val; /* Use indirect calls to avoid inlining the target functions */ static u32 (*target)(u32 value); static u32 (*target2)(u32 value); +static u32 (*target_nest)(u32 value, u32 (*nest)(u32)); static unsigned long target_ip; static unsigned long target2_ip; +static unsigned long target_nest_ip; static noinline u32 fprobe_selftest_target(u32 value) { @@ -30,6 +32,11 @@ static noinline u32 fprobe_selftest_target2(u32 value) return (value / div_factor) + 1; } +static noinline u32 fprobe_selftest_nest_target(u32 value, u32 (*nest)(u32)) +{ + return nest(value + 2); +} + static notrace void fp_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs, void *data) { @@ -67,6 +74,19 @@ static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip, KUNIT_EXPECT_NULL(current_test, data); } +static notrace void nest_entry_handler(struct fprobe *fp, unsigned long ip, + struct pt_regs *regs, void *data) +{ + KUNIT_EXPECT_FALSE(current_test, preemptible()); +} + +static notrace void nest_exit_handler(struct fprobe *fp, unsigned long ip, + struct pt_regs *regs, void *data) +{ + KUNIT_EXPECT_FALSE(current_test, preemptible()); + KUNIT_EXPECT_EQ(current_test, ip, target_nest_ip); +} + /* Test entry only (no rethook) */ static void test_fprobe_entry(struct kunit *test) { @@ -163,6 +183,25 @@ static void test_fprobe_data(struct kunit *test) KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp)); } +/* Test nr_maxactive */ +static void test_fprobe_nest(struct kunit *test) +{ + static const char *syms[] = {"fprobe_selftest_target", "fprobe_selftest_nest_target"}; + struct fprobe fp = { + .entry_handler = nest_entry_handler, + .exit_handler = nest_exit_handler, + .nr_maxactive = 1, + }; + + current_test = test; + KUNIT_EXPECT_EQ(test, 0, register_fprobe_syms(&fp, syms, 2)); + + target_nest(rand1, target); + KUNIT_EXPECT_EQ(test, 1, fp.nmissed); + + KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp)); +} + static unsigned long get_ftrace_location(void *func) { unsigned long size, addr = (unsigned long)func; @@ -178,8 +217,10 @@ static int fprobe_test_init(struct kunit *test) rand1 = get_random_u32_above(div_factor); target = fprobe_selftest_target; target2 = fprobe_selftest_target2; + target_nest = fprobe_selftest_nest_target; target_ip = get_ftrace_location(target); target2_ip = get_ftrace_location(target2); + target_nest_ip = get_ftrace_location(target_nest); return 0; } @@ -189,6 +230,7 @@ static struct kunit_case fprobe_testcases[] = { KUNIT_CASE(test_fprobe), KUNIT_CASE(test_fprobe_syms), KUNIT_CASE(test_fprobe_data), + KUNIT_CASE(test_fprobe_nest), {} }; -- cgit From 39d954200bf6ad503c722e44d0be80c7b826fa42 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 2 Feb 2023 00:56:38 +0900 Subject: fprobe: Skip exit_handler if entry_handler returns !0 Skip hooking function return and calling exit_handler if the entry_handler() returns !0. 
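As a hedged illustration of what this enables (the predicate and handler name are made up), an entry handler can now decide per call whether the return side is traced at all:

	static int my_entry(struct fprobe *fp, unsigned long ip,
			    struct pt_regs *regs, void *data)
	{
		if (!interesting(regs))		/* hypothetical filter */
			return 1;		/* non-zero: no rethook, exit_handler skipped */
		return 0;			/* zero: exit_handler runs as before */
	}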
Link: https://lkml.kernel.org/r/167526699798.433354.10998365726830117303.stgit@mhiramat.roam.corp.google.com Cc: Florent Revest Cc: Mark Rutland Cc: Will Deacon Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- lib/test_fprobe.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/test_fprobe.c b/lib/test_fprobe.c index 4b37d7022f35..9fa2ac9eda83 100644 --- a/lib/test_fprobe.c +++ b/lib/test_fprobe.c @@ -37,7 +37,7 @@ static noinline u32 fprobe_selftest_nest_target(u32 value, u32 (*nest)(u32)) return nest(value + 2); } -static notrace void fp_entry_handler(struct fprobe *fp, unsigned long ip, +static notrace int fp_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs, void *data) { KUNIT_EXPECT_FALSE(current_test, preemptible()); @@ -51,6 +51,8 @@ static notrace void fp_entry_handler(struct fprobe *fp, unsigned long ip, *(u32 *)data = entry_val; } else KUNIT_EXPECT_NULL(current_test, data); + + return 0; } static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip, @@ -74,10 +76,11 @@ static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip, KUNIT_EXPECT_NULL(current_test, data); } -static notrace void nest_entry_handler(struct fprobe *fp, unsigned long ip, +static notrace int nest_entry_handler(struct fprobe *fp, unsigned long ip, struct pt_regs *regs, void *data) { KUNIT_EXPECT_FALSE(current_test, preemptible()); + return 0; } static notrace void nest_exit_handler(struct fprobe *fp, unsigned long ip, -- cgit From 87de2163a36b3ffb0d2e0df8e903dc7e30566548 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 2 Feb 2023 00:56:46 +0900 Subject: lib/test_fprobe: Add a testcase for skipping exit_handler Add a testcase for skipping exit_handler if entry_handler returns !0. 
Link: https://lkml.kernel.org/r/167526700658.433354.12922388040490848613.stgit@mhiramat.roam.corp.google.com Cc: Florent Revest Cc: Mark Rutland Cc: Will Deacon Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- lib/test_fprobe.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test_fprobe.c b/lib/test_fprobe.c index 9fa2ac9eda83..0fe5273e960b 100644 --- a/lib/test_fprobe.c +++ b/lib/test_fprobe.c @@ -21,6 +21,7 @@ static u32 (*target_nest)(u32 value, u32 (*nest)(u32)); static unsigned long target_ip; static unsigned long target2_ip; static unsigned long target_nest_ip; +static int entry_return_value; static noinline u32 fprobe_selftest_target(u32 value) { @@ -52,7 +53,7 @@ static notrace int fp_entry_handler(struct fprobe *fp, unsigned long ip, } else KUNIT_EXPECT_NULL(current_test, data); - return 0; + return entry_return_value; } static notrace void fp_exit_handler(struct fprobe *fp, unsigned long ip, @@ -205,6 +206,28 @@ static void test_fprobe_nest(struct kunit *test) KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp)); } +static void test_fprobe_skip(struct kunit *test) +{ + struct fprobe fp = { + .entry_handler = fp_entry_handler, + .exit_handler = fp_exit_handler, + }; + + current_test = test; + KUNIT_EXPECT_EQ(test, 0, register_fprobe(&fp, "fprobe_selftest_target", NULL)); + + entry_return_value = 1; + entry_val = 0; + exit_val = 0; + target(rand1); + KUNIT_EXPECT_NE(test, 0, entry_val); + KUNIT_EXPECT_EQ(test, 0, exit_val); + KUNIT_EXPECT_EQ(test, 0, fp.nmissed); + entry_return_value = 0; + + KUNIT_EXPECT_EQ(test, 0, unregister_fprobe(&fp)); +} + static unsigned long get_ftrace_location(void *func) { unsigned long size, addr = (unsigned long)func; @@ -234,6 +257,7 @@ static struct kunit_case fprobe_testcases[] = { KUNIT_CASE(test_fprobe_syms), KUNIT_CASE(test_fprobe_data), KUNIT_CASE(test_fprobe_nest), + KUNIT_CASE(test_fprobe_skip), {} }; -- cgit From 2655421ae69fa479df1575cb2630af9131d28939 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 3 Feb 2023 17:18:36 +1000 Subject: lazy tlb: shoot lazies, non-refcounting lazy tlb mm reference handling scheme On big systems, the mm refcount can become highly contented when doing a lot of context switching with threaded applications. user<->idle switch is one of the important cases. Abandoning lazy tlb entirely slows this switching down quite a bit in the common uncontended case, so that is not viable. Implement a scheme where lazy tlb mm references do not contribute to the refcount, instead they get explicitly removed when the refcount reaches zero. The final mmdrop() sends IPIs to all CPUs in the mm_cpumask and they switch away from this mm to init_mm if it was being used as the lazy tlb mm. Enabling the shoot lazies option therefore requires that the arch ensures that mm_cpumask contains all CPUs that could possibly be using mm. A DEBUG_VM option IPIs every CPU in the system after this to ensure there are no references remaining before the mm is freed. Shootdown IPIs cost could be an issue, but they have not been observed to be a serious problem with this scheme, because short-lived processes tend not to migrate CPUs much, therefore they don't get much chance to leave lazy tlb mm references on remote CPUs. There are a lot of options to reduce them if necessary, described in comments. 
The near-worst-case can be benchmarked with will-it-scale: context_switch1_threads -t $(($(nproc) / 2)) This will create nproc threads (nproc / 2 switching pairs) all sharing the same mm that spread over all CPUs so each CPU does thread->idle->thread switching. [ Rik came up with basically the same idea a few years ago, so credit to him for that. ] Link: https://lore.kernel.org/linux-mm/20230118080011.2258375-1-npiggin@gmail.com/ Link: https://lore.kernel.org/all/20180728215357.3249-11-riel@surriel.com/ Link: https://lkml.kernel.org/r/20230203071837.1136453-5-npiggin@gmail.com Signed-off-by: Nicholas Piggin Acked-by: Linus Torvalds Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Michael Ellerman Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 39d1d93164bd..5cd8183bb4c1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -791,6 +791,16 @@ config DEBUG_VM If unsure, say N. +config DEBUG_VM_SHOOT_LAZIES + bool "Debug MMU_LAZY_TLB_SHOOTDOWN implementation" + depends on DEBUG_VM + depends on MMU_LAZY_TLB_SHOOTDOWN + help + Enable additional IPIs that ensure lazy tlb mm references are removed + before the mm is freed. + + If unsure, say N. + config DEBUG_VM_MAPLE_TREE bool "Debug VM maple trees" depends on DEBUG_VM -- cgit From 4c85c0be3d7a9a7ffe48bfe0954eacc0ba9d3c75 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Mon, 30 Jan 2023 13:25:13 +0900 Subject: mm, printk: introduce new format %pGt for page_type %pGp format is used to display 'flags' field of a struct page. However, some page flags (i.e. PG_buddy, see page-flags.h for more details) are stored in page_type field. To display human-readable output of page_type, introduce %pGt format. It is important to note the meaning of bits are different in page_type. if page_type is 0xffffffff, no flags are set. Setting PG_buddy (0x00000080) flag results in a page_type of 0xffffff7f. Clearing a bit actually means setting a flag. Bits in page_type are inverted when displaying type names. Only values for which page_type_has_type() returns true are considered as page_type, to avoid confusion with mapcount values. if it returns false, only raw values are displayed and not page type names. 
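A small caller-side sketch (not part of this patch): with the new specifier a page_type can be printed directly from a struct page, e.g.

	/* page is a struct page *; page_type shares a union with _mapcount */
	pr_alert("page_type: %pGt\n", &page->page_type);

which for a buddy page (page_type 0xffffff7f, i.e. ~PG_buddy) would render something like "0xffffff7f(buddy)".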
Link: https://lkml.kernel.org/r/20230130042514.2418-3-42.hyeyoo@gmail.com Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Petr Mladek [vsprintf part] Cc: Andy Shevchenko Cc: David Hildenbrand Cc: Joe Perches Cc: John Ogness Cc: Matthew Wilcox Cc: Sergey Senozhatsky Cc: Steven Rostedt (Google) Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/test_printf.c | 26 ++++++++++++++++++++++++++ lib/vsprintf.c | 21 +++++++++++++++++++++ 2 files changed, 47 insertions(+) (limited to 'lib') diff --git a/lib/test_printf.c b/lib/test_printf.c index 46b4e6c414a3..7677ebccf3c3 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -642,12 +642,26 @@ page_flags_test(int section, int node, int zone, int last_cpupid, test(cmp_buf, "%pGp", &flags); } +static void __init page_type_test(unsigned int page_type, const char *name, + char *cmp_buf) +{ + unsigned long size; + + size = scnprintf(cmp_buf, BUF_SIZE, "%#x(", page_type); + if (page_type_has_type(page_type)) + size += scnprintf(cmp_buf + size, BUF_SIZE - size, "%s", name); + + snprintf(cmp_buf + size, BUF_SIZE - size, ")"); + test(cmp_buf, "%pGt", &page_type); +} + static void __init flags(void) { unsigned long flags; char *cmp_buffer; gfp_t gfp; + unsigned int page_type; cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); if (!cmp_buffer) @@ -687,6 +701,18 @@ flags(void) gfp |= __GFP_HIGH; test(cmp_buffer, "%pGg", &gfp); + page_type = ~0; + page_type_test(page_type, "", cmp_buffer); + + page_type = 10; + page_type_test(page_type, "", cmp_buffer); + + page_type = ~PG_buddy; + page_type_test(page_type, "buddy", cmp_buffer); + + page_type = ~(PG_table | PG_buddy); + page_type_test(page_type, "table|buddy", cmp_buffer); + kfree(cmp_buffer); } diff --git a/lib/vsprintf.c b/lib/vsprintf.c index be71a03c936a..fbe320b5e89f 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -2052,6 +2052,25 @@ char *format_page_flags(char *buf, char *end, unsigned long flags) return buf; } +static +char *format_page_type(char *buf, char *end, unsigned int page_type) +{ + buf = number(buf, end, page_type, default_flag_spec); + + if (buf < end) + *buf = '('; + buf++; + + if (page_type_has_type(page_type)) + buf = format_flags(buf, end, ~page_type, pagetype_names); + + if (buf < end) + *buf = ')'; + buf++; + + return buf; +} + static noinline_for_stack char *flags_string(char *buf, char *end, void *flags_ptr, struct printf_spec spec, const char *fmt) @@ -2065,6 +2084,8 @@ char *flags_string(char *buf, char *end, void *flags_ptr, switch (fmt[1]) { case 'p': return format_page_flags(buf, end, *(unsigned long *)flags_ptr); + case 't': + return format_page_type(buf, end, *(unsigned int *)flags_ptr); case 'v': flags = *(unsigned long *)flags_ptr; names = vmaflag_names; -- cgit From 8e00b2dffd822b34d8d1c627dc19f0743f9f5ac6 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 6 Mar 2023 12:13:21 +0100 Subject: lib/stackdepot: kmsan: mark API outputs as initialized KMSAN does not instrument stackdepot and may treat memory allocated by it as uninitialized. This is not a problem for KMSAN itself, because its functions calling stackdepot API are also not instrumented. But other kernel features (e.g. netdev tracker) may access stack depot from instrumented code, which will lead to false positives, unless we explicitly mark stackdepot outputs as initialized. 
Link: https://lkml.kernel.org/r/20230306111322.205724-1-glider@google.com Signed-off-by: Alexander Potapenko Reported-by: syzbot Reviewed-by: Dmitry Vyukov Suggested-by: Dmitry Vyukov Reviewed-by: Andrey Konovalov Cc: Marco Elver Signed-off-by: Andrew Morton --- lib/stackdepot.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'lib') diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 036da8e295d1..2f5aa851834e 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -306,6 +307,11 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) stack->handle.extra = 0; memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); pool_offset += required_size; + /* + * Let KMSAN know the stored stack record is initialized. This shall + * prevent false positive reports if instrumented code accesses it. + */ + kmsan_unpoison_memory(stack, required_size); return stack; } @@ -465,6 +471,12 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle, struct stack_record *stack; *entries = NULL; + /* + * Let KMSAN know *entries is initialized. This shall prevent false + * positive reports if instrumented code accesses it. + */ + kmsan_unpoison_memory(entries, sizeof(*entries)); + if (!handle) return 0; -- cgit From 5c63a7c32a94a7e2fecdd6754a6ff47cd4226ee1 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Thu, 2 Mar 2023 02:10:35 +0100 Subject: maple_tree: export symbol mas_preallocate() Fix missing EXPORT_SYMBOL_GPL() statement for mas_preallocate(). It isn't actually used by anything yet, but mas_preallocate() is part of the maple tree's 'Advanced API'. All other functions of this API are exported already. Link: https://lkml.kernel.org/r/20230302011035.4928-1-dakr@redhat.com Signed-off-by: Danilo Krummrich Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 9e2735cbc2b4..ae37a167e25d 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5726,6 +5726,7 @@ int mas_preallocate(struct ma_state *mas, gfp_t gfp) mas_reset(mas); return ret; } +EXPORT_SYMBOL_GPL(mas_preallocate); /* * mas_destroy() - destroy a maple state. -- cgit From de4f5fed3f231a8ff4790bf52975f847b95b85ea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 29 Mar 2023 08:52:15 -0600 Subject: iov_iter: add iter_iovec() helper This returns a pointer to the current iovec entry in the iterator. Only useful with ITER_IOVEC right now, but it prepares us to treat ITER_UBUF and ITER_IOVEC identically for the first segment. Rename struct iov_iter->iov to iov_iter->__iov to find any potentially troublesome spots, and also to prevent anyone from adding new code that accesses iter->iov directly. 
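The helper itself lands in include/linux/uio.h and so is outside this 'lib'-limited diff; judging from the call sites below, it is presumably something of this shape:

	/* assumed shape, reconstructed from the usage in iov_iter.c */
	static inline const struct iovec *iter_iov(const struct iov_iter *iter)
	{
		return iter->__iov;
	}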
Signed-off-by: Jens Axboe --- lib/iov_iter.c | 56 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 26 deletions(-) (limited to 'lib') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 274014e4eafe..87488c4aad3f 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -126,13 +126,13 @@ __out: \ iterate_buf(i, n, base, len, off, \ i->ubuf, (I)) \ } else if (likely(iter_is_iovec(i))) { \ - const struct iovec *iov = i->iov; \ + const struct iovec *iov = iter_iov(i); \ void __user *base; \ size_t len; \ iterate_iovec(i, n, base, len, off, \ iov, (I)) \ - i->nr_segs -= iov - i->iov; \ - i->iov = iov; \ + i->nr_segs -= iov - iter_iov(i); \ + i->__iov = iov; \ } else if (iov_iter_is_bvec(i)) { \ const struct bio_vec *bvec = i->bvec; \ void *base; \ @@ -355,7 +355,7 @@ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) size_t skip; size -= count; - for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { + for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; @@ -398,7 +398,7 @@ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) size_t skip; size -= count; - for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { + for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; @@ -425,7 +425,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, .nofault = false, .user_backed = true, .data_source = direction, - .iov = iov, + .__iov = iov, .nr_segs = nr_segs, .iov_offset = 0, .count = count @@ -876,14 +876,14 @@ static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) i->count -= size; size += i->iov_offset; // from beginning of current segment - for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) { + for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) { if (likely(size < iov->iov_len)) break; size -= iov->iov_len; } i->iov_offset = size; - i->nr_segs -= iov - i->iov; - i->iov = iov; + i->nr_segs -= iov - iter_iov(i); + i->__iov = iov; } void iov_iter_advance(struct iov_iter *i, size_t size) @@ -958,12 +958,12 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) unroll -= n; } } else { /* same logics for iovec and kvec */ - const struct iovec *iov = i->iov; + const struct iovec *iov = iter_iov(i); while (1) { size_t n = (--iov)->iov_len; i->nr_segs++; if (unroll <= n) { - i->iov = iov; + i->__iov = iov; i->iov_offset = n - unroll; return; } @@ -980,7 +980,7 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) { if (i->nr_segs > 1) { if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) - return min(i->count, i->iov->iov_len - i->iov_offset); + return min(i->count, iter_iov(i)->iov_len - i->iov_offset); if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); } @@ -1095,13 +1095,14 @@ static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, unsigned k; for (k = 0; k < i->nr_segs; k++, skip = 0) { - size_t len = i->iov[k].iov_len - skip; + const struct iovec *iov = iter_iov(i) + k; + size_t len = iov->iov_len - skip; if (len > size) len = size; if (len & len_mask) return false; - if ((unsigned long)(i->iov[k].iov_base + skip) & addr_mask) + if ((unsigned long)(iov->iov_base + skip) & addr_mask) return false; size -= len; @@ -1194,9 +1195,10 @@ static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) unsigned k; for (k = 0; k < i->nr_segs; k++, skip = 0) { - 
size_t len = i->iov[k].iov_len - skip; + const struct iovec *iov = iter_iov(i) + k; + size_t len = iov->iov_len - skip; if (len) { - res |= (unsigned long)i->iov[k].iov_base + skip; + res |= (unsigned long)iov->iov_base + skip; if (len > size) len = size; res |= len; @@ -1273,14 +1275,15 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) return ~0U; for (k = 0; k < i->nr_segs; k++) { - if (i->iov[k].iov_len) { - unsigned long base = (unsigned long)i->iov[k].iov_base; + const struct iovec *iov = iter_iov(i) + k; + if (iov->iov_len) { + unsigned long base = (unsigned long)iov->iov_base; if (v) // if not the first one res |= base | v; // this start | previous end - v = base + i->iov[k].iov_len; - if (size <= i->iov[k].iov_len) + v = base + iov->iov_len; + if (size <= iov->iov_len) break; - size -= i->iov[k].iov_len; + size -= iov->iov_len; } } return res; @@ -1396,13 +1399,14 @@ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) return (unsigned long)i->ubuf + i->iov_offset; for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { - size_t len = i->iov[k].iov_len - skip; + const struct iovec *iov = iter_iov(i) + k; + size_t len = iov->iov_len - skip; if (unlikely(!len)) continue; if (*size > len) *size = len; - return (unsigned long)i->iov[k].iov_base + skip; + return (unsigned long)iov->iov_base + skip; } BUG(); // if it had been empty, we wouldn't get called } @@ -1614,7 +1618,7 @@ static int iov_npages(const struct iov_iter *i, int maxpages) const struct iovec *p; int npages = 0; - for (p = i->iov; size; skip = 0, p++) { + for (p = iter_iov(i); size; skip = 0, p++) { unsigned offs = offset_in_page(p->iov_base + skip); size_t len = min(p->iov_len - skip, size); @@ -1691,7 +1695,7 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) flags); else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) /* iovec and kvec have identical layout */ - return new->iov = kmemdup(new->iov, + return new->__iov = kmemdup(new->__iov, new->nr_segs * sizeof(struct iovec), flags); return NULL; @@ -1918,7 +1922,7 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) if (iov_iter_is_bvec(i)) i->bvec -= state->nr_segs - i->nr_segs; else - i->iov -= state->nr_segs - i->nr_segs; + i->__iov -= state->nr_segs - i->nr_segs; i->nr_segs = state->nr_segs; } -- cgit From e03ad4ee2783e41afc90cc7848468aef10741c0e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Mar 2023 14:35:49 -0600 Subject: iov_iter: convert import_single_range() to ITER_UBUF Since we're just importing a single vector, we don't have to turn it into an ITER_IOVEC. Instead turn it into an ITER_UBUF, which is cheaper to iterate. 
Signed-off-by: Jens Axboe --- lib/iov_iter.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 87488c4aad3f..f411bda1171f 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1870,9 +1870,7 @@ int import_single_range(int rw, void __user *buf, size_t len, if (unlikely(!access_ok(buf, len))) return -EFAULT; - iov->iov_base = buf; - iov->iov_len = len; - iov_iter_init(i, rw, iov, 1, len); + iov_iter_ubuf(i, rw, buf, len); return 0; } EXPORT_SYMBOL(import_single_range); -- cgit From 3b2deb0e46da9798b694cf50bd8bea1b26dcc789 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Mar 2023 14:37:19 -0600 Subject: iov_iter: import single vector iovecs as ITER_UBUF Add a special case to __import_iovec(), which imports a single segment iovec as an ITER_UBUF rather than an ITER_IOVEC. ITER_UBUF is cheaper to iterate than ITER_IOVEC, and for a single segment iovec, there's no point in using a segmented iterator. Signed-off-by: Jens Axboe --- lib/iov_iter.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'lib') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f411bda1171f..3e6c9bcfa612 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1784,6 +1784,30 @@ struct iovec *iovec_from_user(const struct iovec __user *uvec, return iov; } +/* + * Single segment iovec supplied by the user, import it as ITER_UBUF. + */ +static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec, + struct iovec **iovp, struct iov_iter *i, + bool compat) +{ + struct iovec *iov = *iovp; + ssize_t ret; + + if (compat) + ret = copy_compat_iovec_from_user(iov, uvec, 1); + else + ret = copy_iovec_from_user(iov, uvec, 1); + if (unlikely(ret)) + return ret; + + ret = import_ubuf(type, iov->iov_base, iov->iov_len, i); + if (unlikely(ret)) + return ret; + *iovp = NULL; + return i->count; +} + ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat) @@ -1792,6 +1816,9 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned long seg; struct iovec *iov; + if (nr_segs == 1) + return __import_iovec_ubuf(type, uvec, iovp, i, compat); + iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat); if (IS_ERR(iov)) { *iovp = NULL; -- cgit From c616fb0cbae8af5f3f837f54c625700992dcd78d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 24 Mar 2023 17:59:38 +0800 Subject: crypto: lib/utils - Move utilities into new header The utilities have historically resided in algapi.h as they were first used internally before being exported. Move them into a new header file so external users don't see internal API details. Signed-off-by: Herbert Xu --- lib/crypto/utils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/crypto/utils.c b/lib/crypto/utils.c index 53230ab1b195..c852c7151b0a 100644 --- a/lib/crypto/utils.c +++ b/lib/crypto/utils.c @@ -6,7 +6,7 @@ */ #include -#include +#include #include /* -- cgit From 57b4f760f94da13104cd596d5fb1643ae09c4d49 Mon Sep 17 00:00:00 2001 From: Sadiya Kazi Date: Fri, 31 Mar 2023 06:45:29 +0000 Subject: list: test: Test the klist structure Add KUnit tests to the klist linked-list structure. These perform testing for different variations of node add and node delete in the klist data structure (). Limitation: Since we use a static global variable, and if multiple instances of this test are run concurrently, the test may fail. 
Signed-off-by: Sadiya Kazi Reviewed-by: Brendan Higgins Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/list-test.c | 300 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 299 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/list-test.c b/lib/list-test.c index d374cf5d1a57..0cc27de9cec8 100644 --- a/lib/list-test.c +++ b/lib/list-test.c @@ -8,6 +8,7 @@ #include #include +#include struct list_test_struct { int data; @@ -1199,6 +1200,303 @@ static struct kunit_suite hlist_test_module = { .test_cases = hlist_test_cases, }; -kunit_test_suites(&list_test_module, &hlist_test_module); + +struct klist_test_struct { + int data; + struct klist klist; + struct klist_node klist_node; +}; + +static int node_count; +static struct klist_node *last_node; + +static void check_node(struct klist_node *node_ptr) +{ + node_count++; + last_node = node_ptr; +} + +static void check_delete_node(struct klist_node *node_ptr) +{ + node_count--; + last_node = node_ptr; +} + +static void klist_test_add_tail(struct kunit *test) +{ + struct klist_node a, b; + struct klist mylist; + struct klist_iter i; + + node_count = 0; + klist_init(&mylist, &check_node, NULL); + + klist_add_tail(&a, &mylist); + KUNIT_EXPECT_EQ(test, node_count, 1); + KUNIT_EXPECT_PTR_EQ(test, last_node, &a); + + klist_add_tail(&b, &mylist); + KUNIT_EXPECT_EQ(test, node_count, 2); + KUNIT_EXPECT_PTR_EQ(test, last_node, &b); + + /* should be [list] -> a -> b */ + klist_iter_init(&mylist, &i); + + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &a); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &b); + KUNIT_EXPECT_NULL(test, klist_next(&i)); + + klist_iter_exit(&i); + +} + +static void klist_test_add_head(struct kunit *test) +{ + struct klist_node a, b; + struct klist mylist; + struct klist_iter i; + + node_count = 0; + klist_init(&mylist, &check_node, NULL); + + klist_add_head(&a, &mylist); + KUNIT_EXPECT_EQ(test, node_count, 1); + KUNIT_EXPECT_PTR_EQ(test, last_node, &a); + + klist_add_head(&b, &mylist); + KUNIT_EXPECT_EQ(test, node_count, 2); + KUNIT_EXPECT_PTR_EQ(test, last_node, &b); + + /* should be [list] -> b -> a */ + klist_iter_init(&mylist, &i); + + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &b); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &a); + KUNIT_EXPECT_NULL(test, klist_next(&i)); + + klist_iter_exit(&i); + +} + +static void klist_test_add_behind(struct kunit *test) +{ + struct klist_node a, b, c, d; + struct klist mylist; + struct klist_iter i; + + node_count = 0; + klist_init(&mylist, &check_node, NULL); + + klist_add_head(&a, &mylist); + klist_add_head(&b, &mylist); + + klist_add_behind(&c, &a); + KUNIT_EXPECT_EQ(test, node_count, 3); + KUNIT_EXPECT_PTR_EQ(test, last_node, &c); + + klist_add_behind(&d, &b); + KUNIT_EXPECT_EQ(test, node_count, 4); + KUNIT_EXPECT_PTR_EQ(test, last_node, &d); + + klist_iter_init(&mylist, &i); + + /* should be [list] -> b -> d -> a -> c*/ + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &b); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &d); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &a); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &c); + KUNIT_EXPECT_NULL(test, klist_next(&i)); + + klist_iter_exit(&i); + +} + +static void klist_test_add_before(struct kunit *test) +{ + struct klist_node a, b, c, d; + struct klist mylist; + struct klist_iter i; + + node_count = 0; + klist_init(&mylist, &check_node, NULL); + + klist_add_head(&a, &mylist); + klist_add_head(&b, &mylist); + klist_add_before(&c, &a); + KUNIT_EXPECT_EQ(test, node_count, 3); + KUNIT_EXPECT_PTR_EQ(test, last_node, 
&c); + + klist_add_before(&d, &b); + KUNIT_EXPECT_EQ(test, node_count, 4); + KUNIT_EXPECT_PTR_EQ(test, last_node, &d); + + klist_iter_init(&mylist, &i); + + /* should be [list] -> b -> d -> a -> c*/ + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &d); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &b); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &c); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &a); + KUNIT_EXPECT_NULL(test, klist_next(&i)); + + klist_iter_exit(&i); + +} + +/* + * Verify that klist_del() delays the deletion of a node until there + * are no other references to it + */ +static void klist_test_del_refcount_greater_than_zero(struct kunit *test) +{ + struct klist_node a, b, c, d; + struct klist mylist; + struct klist_iter i; + + node_count = 0; + klist_init(&mylist, &check_node, &check_delete_node); + + /* Add nodes a,b,c,d to the list*/ + klist_add_tail(&a, &mylist); + klist_add_tail(&b, &mylist); + klist_add_tail(&c, &mylist); + klist_add_tail(&d, &mylist); + + klist_iter_init(&mylist, &i); + + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &a); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &b); + /* Advance the iterator to point to node c*/ + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &c); + + /* Try to delete node c while there is a reference to it*/ + klist_del(&c); + + /* + * Verify that node c is still attached to the list even after being + * deleted. Since the iterator still points to c, the reference count is not + * decreased to 0 + */ + KUNIT_EXPECT_TRUE(test, klist_node_attached(&c)); + + /* Check that node c has not been removed yet*/ + KUNIT_EXPECT_EQ(test, node_count, 4); + KUNIT_EXPECT_PTR_EQ(test, last_node, &d); + + klist_iter_exit(&i); + + /* + * Since the iterator is no longer pointing to node c, node c is removed + * from the list + */ + KUNIT_EXPECT_EQ(test, node_count, 3); + KUNIT_EXPECT_PTR_EQ(test, last_node, &c); + +} + +/* + * Verify that klist_del() deletes a node immediately when there are no + * other references to it. 
+ */ +static void klist_test_del_refcount_zero(struct kunit *test) +{ + struct klist_node a, b, c, d; + struct klist mylist; + struct klist_iter i; + + node_count = 0; + klist_init(&mylist, &check_node, &check_delete_node); + + /* Add nodes a,b,c,d to the list*/ + klist_add_tail(&a, &mylist); + klist_add_tail(&b, &mylist); + klist_add_tail(&c, &mylist); + klist_add_tail(&d, &mylist); + /* Delete node c*/ + klist_del(&c); + + /* Check that node c is deleted from the list*/ + KUNIT_EXPECT_EQ(test, node_count, 3); + KUNIT_EXPECT_PTR_EQ(test, last_node, &c); + + /* Should be [list] -> a -> b -> d*/ + klist_iter_init(&mylist, &i); + + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &a); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &b); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &d); + KUNIT_EXPECT_NULL(test, klist_next(&i)); + + klist_iter_exit(&i); + +} + +static void klist_test_remove(struct kunit *test) +{ + /* This test doesn't check correctness under concurrent access */ + struct klist_node a, b, c, d; + struct klist mylist; + struct klist_iter i; + + node_count = 0; + klist_init(&mylist, &check_node, &check_delete_node); + + /* Add nodes a,b,c,d to the list*/ + klist_add_tail(&a, &mylist); + klist_add_tail(&b, &mylist); + klist_add_tail(&c, &mylist); + klist_add_tail(&d, &mylist); + /* Delete node c*/ + klist_remove(&c); + + /* Check the nodes in the list*/ + KUNIT_EXPECT_EQ(test, node_count, 3); + KUNIT_EXPECT_PTR_EQ(test, last_node, &c); + + /* should be [list] -> a -> b -> d*/ + klist_iter_init(&mylist, &i); + + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &a); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &b); + KUNIT_EXPECT_PTR_EQ(test, klist_next(&i), &d); + KUNIT_EXPECT_NULL(test, klist_next(&i)); + + klist_iter_exit(&i); + +} + +static void klist_test_node_attached(struct kunit *test) +{ + struct klist_node a = {}; + struct klist mylist; + + klist_init(&mylist, NULL, NULL); + + KUNIT_EXPECT_FALSE(test, klist_node_attached(&a)); + klist_add_head(&a, &mylist); + KUNIT_EXPECT_TRUE(test, klist_node_attached(&a)); + klist_del(&a); + KUNIT_EXPECT_FALSE(test, klist_node_attached(&a)); + +} + +static struct kunit_case klist_test_cases[] = { + KUNIT_CASE(klist_test_add_tail), + KUNIT_CASE(klist_test_add_head), + KUNIT_CASE(klist_test_add_behind), + KUNIT_CASE(klist_test_add_before), + KUNIT_CASE(klist_test_del_refcount_greater_than_zero), + KUNIT_CASE(klist_test_del_refcount_zero), + KUNIT_CASE(klist_test_remove), + KUNIT_CASE(klist_test_node_attached), + {}, +}; + +static struct kunit_suite klist_test_module = { + .name = "klist", + .test_cases = klist_test_cases, +}; + +kunit_test_suites(&list_test_module, &hlist_test_module, &klist_test_module); MODULE_LICENSE("GPL v2"); -- cgit From 48e1a66fecb4e8b64cf2a0a8978c048990181d94 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 27 Mar 2023 17:27:21 +0300 Subject: lib/vsprintf: Use isodigit() for the octal number check Use isodigit() to test the octal number instead of homegrown approach. 
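For reference, a minimal sketch of the equivalence (assuming isodigit() tests for the octal digits '0'..'7', as the replacement below implies; the two helpers are made up for illustration only):

	static inline bool octal_digit_old(char c)
	{
		return isdigit(c) && c <= '7';	/* open-coded check being removed */
	}

	static inline bool octal_digit_new(char c)
	{
		return isodigit(c);		/* same predicate via the ctype helper */
	}
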
Signed-off-by: Andy Shevchenko Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20230327142721.48378-1-andriy.shevchenko@linux.intel.com --- lib/vsprintf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/vsprintf.c b/lib/vsprintf.c index be71a03c936a..426418253fd4 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -3621,7 +3621,7 @@ int vsscanf(const char *buf, const char *fmt, va_list args) if (!digit || (base == 16 && !isxdigit(digit)) || (base == 10 && !isdigit(digit)) - || (base == 8 && (!isdigit(digit) || digit > '7')) + || (base == 8 && !isodigit(digit)) || (base == 0 && !isdigit(digit))) break; -- cgit From c779b97281d52faac253e9afb004537e50ada4e8 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 1 Feb 2023 16:08:11 +0100 Subject: lib/test_vmalloc.c: Rename kvfree_rcu() to kvfree_rcu_mightsleep() The kvfree_rcu() macro's single-argument form is deprecated. Therefore switch to the new kvfree_rcu_mightsleep() variant. The goal is to avoid accidental use of the single-argument forms, which can introduce functionality bugs in atomic contexts and latency bugs in non-atomic contexts. Acked-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) --- lib/test_vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index de4ee0d50906..cd2bdba6d3ed 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -334,7 +334,7 @@ kvfree_rcu_1_arg_vmalloc_test(void) return -1; p->array[0] = 'a'; - kvfree_rcu(p); + kvfree_rcu_mightsleep(p); } return 0; -- cgit From a42077b787680cbc365a96446b30f32399fa3f6f Mon Sep 17 00:00:00 2001 From: Rae Moar Date: Mon, 3 Apr 2023 20:19:30 +0000 Subject: kunit: add tests for using current KUnit test field Create test suite called "kunit_current" to add test coverage for the use of current->kunit_test, which returns the current KUnit test. Add two test cases: - kunit_current_test to test current->kunit_test and the method kunit_get_current_test(), which utilizes current->kunit_test. - kunit_current_fail_test to test the method kunit_fail_current_test(), which utilizes current->kunit_test. Signed-off-by: Rae Moar Reviewed-by: Daniel Latypov Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/kunit-test.c | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c index b63595d3e241..42e44caa1bdd 100644 --- a/lib/kunit/kunit-test.c +++ b/lib/kunit/kunit-test.c @@ -6,6 +6,7 @@ * Author: Brendan Higgins */ #include +#include #include "try-catch-impl.h" @@ -532,7 +533,46 @@ static struct kunit_suite kunit_status_test_suite = { .test_cases = kunit_status_test_cases, }; +static void kunit_current_test(struct kunit *test) +{ + /* Check results of both current->kunit_test and + * kunit_get_current_test() are equivalent to current test. + */ + KUNIT_EXPECT_PTR_EQ(test, test, current->kunit_test); + KUNIT_EXPECT_PTR_EQ(test, test, kunit_get_current_test()); +} + +static void kunit_current_fail_test(struct kunit *test) +{ + struct kunit fake; + + kunit_init_test(&fake, "fake test", NULL); + KUNIT_EXPECT_EQ(test, fake.status, KUNIT_SUCCESS); + + /* Set current->kunit_test to fake test. 
*/ + current->kunit_test = &fake; + + kunit_fail_current_test("This should make `fake` test fail."); + KUNIT_EXPECT_EQ(test, fake.status, (enum kunit_status)KUNIT_FAILURE); + kunit_cleanup(&fake); + + /* Reset current->kunit_test to current test. */ + current->kunit_test = test; +} + +static struct kunit_case kunit_current_test_cases[] = { + KUNIT_CASE(kunit_current_test), + KUNIT_CASE(kunit_current_fail_test), + {} +}; + +static struct kunit_suite kunit_current_test_suite = { + .name = "kunit_current", + .test_cases = kunit_current_test_cases, +}; + kunit_test_suites(&kunit_try_catch_test_suite, &kunit_resource_test_suite, - &kunit_log_test_suite, &kunit_status_test_suite); + &kunit_log_test_suite, &kunit_status_test_suite, + &kunit_current_test_suite); MODULE_LICENSE("GPL v2"); -- cgit From fcbfe8121a45152a3cfbe1c28c96a3b611b7347d Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Thu, 23 Mar 2023 17:33:52 +0100 Subject: Kconfig: introduce HAS_IOPORT option and select it as necessary We introduce a new HAS_IOPORT Kconfig option to indicate support for I/O Port access. In a future patch HAS_IOPORT=n will disable compilation of the I/O accessor functions inb()/outb() and friends on architectures which can not meaningfully support legacy I/O spaces such as s390. The following architectures do not select HAS_IOPORT: * ARC * C-SKY * Hexagon * Nios II * OpenRISC * s390 * User-Mode Linux * Xtensa All other architectures select HAS_IOPORT at least conditionally. The "depends on" relations on HAS_IOPORT in drivers as well as ifdefs for HAS_IOPORT specific sections will be added in subsequent patches on a per subsystem basis. Co-developed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann Acked-by: Johannes Berg # for ARCH=um Acked-by: Geert Uytterhoeven Signed-off-by: Niklas Schnelle Signed-off-by: Arnd Bergmann --- lib/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'lib') diff --git a/lib/Kconfig b/lib/Kconfig index ce2abffb9ed8..5c2da561c516 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -92,6 +92,7 @@ config ARCH_USE_SYM_ANNOTATIONS config INDIRECT_PIO bool "Access I/O in non-MMIO mode" depends on ARM64 + depends on HAS_IOPORT help On some platforms where no separate I/O space exists, there are I/O hosts which can not be accessed in MMIO mode. Using the logical PIO @@ -509,6 +510,9 @@ config HAS_IOMEM depends on !NO_IOMEM default y +config HAS_IOPORT + bool + config HAS_IOPORT_MAP bool depends on HAS_IOMEM && !NO_IOPORT_MAP -- cgit From 39d0bd86c499ecd6abae42a9b7112056c5560691 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 27 Feb 2023 09:36:00 -0800 Subject: maple_tree: be more cautious about dead nodes Patch series "Fix VMA tree modification under mmap read lock". Syzbot reported a BUG_ON in mm/mmap.c which was found to be caused by an inconsistency between threads walking the VMA maple tree. The inconsistency is caused by the page fault handler modifying the maple tree while holding the mmap_lock for read. This only happens for stack VMAs. We had thought this was safe as it only modifies a single pivot in the tree. Unfortunately, syzbot constructed a test case where the stack had no guard page and grew the stack to abut the next VMA. This causes us to delete the NULL entry between the two VMAs and rewrite the node. We considered several options for fixing this, including dropping the mmap_lock, then reacquiring it for write; and relaxing the definition of the tree to permit a zero-length NULL entry in the node. 
We decided the best option was to backport some of the RCU patches from -next, which solve the problem by allocating a new node and RCU-freeing the old node. Since the problem exists in 6.1, we preferred a solution which is similar to the one we intended to merge next merge window. These patches have been in -next since next-20230301, and have received intensive testing in Android as part of the RCU page fault patchset. They were also sent as part of the "Per-VMA locks" v4 patch series. Patches 1 to 7 are bug fixes for RCU mode of the tree and patch 8 enables RCU mode for the tree. Performance v6.3-rc3 vs patched v6.3-rc3: Running these changes through mmtests showed there was a 15-20% performance decrease in will-it-scale/brk1-processes. This tests creating and inserting a single VMA repeatedly through the brk interface and isn't representative of any real world applications. This patch (of 8): ma_pivots() and ma_data_end() may be called with a dead node. Ensure to that the node isn't dead before using the returned values. This is necessary for RCU mode of the maple tree. Link: https://lkml.kernel.org/r/20230327185532.2354250-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230227173632.3292573-1-surenb@google.com Link: https://lkml.kernel.org/r/20230227173632.3292573-2-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam Howlett Signed-off-by: Suren Baghdasaryan Cc: Andy Lutomirski Cc: Arjun Roy Cc: Axel Rasmussen Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: David Rientjes Cc: Eric Dumazet Cc: freak07 Cc: Greg Thelen Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes Cc: Johannes Weiner Cc: Kent Overstreet Cc: Laurent Dufour Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Minchan Kim Cc: Paul E. McKenney Cc: Peter Oskolkov Cc: Peter Xu Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Sebastian Andrzej Siewior Cc: Shakeel Butt Cc: Soheil Hassas Yeganeh Cc: Song Liu Cc: Vlastimil Babka Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 52 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 9e2735cbc2b4..095b9cb1f4f1 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -544,6 +544,7 @@ static inline bool ma_dead_node(const struct maple_node *node) return (parent == node); } + /* * mte_dead_node() - check if the @enode is dead. 
* @enode: The encoded maple node @@ -625,6 +626,8 @@ static inline unsigned int mas_alloc_req(const struct ma_state *mas) * @node - the maple node * @type - the node type * + * In the event of a dead node, this array may be %NULL + * * Return: A pointer to the maple node pivots */ static inline unsigned long *ma_pivots(struct maple_node *node, @@ -1096,8 +1099,11 @@ static int mas_ascend(struct ma_state *mas) a_type = mas_parent_enum(mas, p_enode); a_node = mte_parent(p_enode); a_slot = mte_parent_slot(p_enode); - pivots = ma_pivots(a_node, a_type); a_enode = mt_mk_node(a_node, a_type); + pivots = ma_pivots(a_node, a_type); + + if (unlikely(ma_dead_node(a_node))) + return 1; if (!set_min && a_slot) { set_min = true; @@ -1401,6 +1407,9 @@ static inline unsigned char ma_data_end(struct maple_node *node, { unsigned char offset; + if (!pivots) + return 0; + if (type == maple_arange_64) return ma_meta_end(node, type); @@ -1436,6 +1445,9 @@ static inline unsigned char mas_data_end(struct ma_state *mas) return ma_meta_end(node, type); pivots = ma_pivots(node, type); + if (unlikely(ma_dead_node(node))) + return 0; + offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); @@ -4505,6 +4517,9 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min) node = mas_mn(mas); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); + if (unlikely(ma_dead_node(node))) + return 1; + mas->max = pivots[offset]; if (offset) mas->min = pivots[offset - 1] + 1; @@ -4526,6 +4541,9 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min) slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); offset = ma_data_end(node, mt, pivots, mas->max); + if (unlikely(ma_dead_node(node))) + return 1; + if (offset) mas->min = pivots[offset - 1] + 1; @@ -4574,6 +4592,7 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, struct maple_enode *enode; int level = 0; unsigned char offset; + unsigned char node_end; enum maple_type mt; void __rcu **slots; @@ -4597,7 +4616,11 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); - } while (unlikely(offset == ma_data_end(node, mt, pivots, mas->max))); + node_end = ma_data_end(node, mt, pivots, mas->max); + if (unlikely(ma_dead_node(node))) + return 1; + + } while (unlikely(offset == node_end)); slots = ma_slots(node, mt); pivot = mas_safe_pivot(mas, pivots, ++offset, mt); @@ -4613,6 +4636,9 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, mt = mte_node_type(mas->node); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); + if (unlikely(ma_dead_node(node))) + return 1; + offset = 0; pivot = pivots[0]; } @@ -4659,11 +4685,14 @@ static inline void *mas_next_nentry(struct ma_state *mas, return NULL; } - pivots = ma_pivots(node, type); slots = ma_slots(node, type); - mas->index = mas_safe_min(mas, pivots, mas->offset); + pivots = ma_pivots(node, type); count = ma_data_end(node, type, pivots, mas->max); - if (ma_dead_node(node)) + if (unlikely(ma_dead_node(node))) + return NULL; + + mas->index = mas_safe_min(mas, pivots, mas->offset); + if (unlikely(ma_dead_node(node))) return NULL; if (mas->index > max) @@ -4817,6 +4846,11 @@ retry: slots = ma_slots(mn, mt); pivots = ma_pivots(mn, mt); + if (unlikely(ma_dead_node(mn))) { + mas_rewalk(mas, index); + goto retry; + } + if (offset == mt_pivots[mt]) pivot = mas->max; else @@ -6617,11 +6651,11 @@ static 
inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn, while (likely(!ma_is_leaf(mt))) { MT_BUG_ON(mas->tree, mte_dead_node(mas->node)); slots = ma_slots(mn, mt); - pivots = ma_pivots(mn, mt); - max = pivots[0]; entry = mas_slot(mas, slots, 0); + pivots = ma_pivots(mn, mt); if (unlikely(ma_dead_node(mn))) return NULL; + max = pivots[0]; mas->node = entry; mn = mas_mn(mas); mt = mte_node_type(mas->node); @@ -6641,13 +6675,13 @@ static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn, if (likely(entry)) return entry; - pivots = ma_pivots(mn, mt); - mas->index = pivots[0] + 1; mas->offset = 1; entry = mas_slot(mas, slots, 1); + pivots = ma_pivots(mn, mt); if (unlikely(ma_dead_node(mn))) return NULL; + mas->index = pivots[0] + 1; if (mas->index > limit) goto none; -- cgit From a7b92d59c885018cb7bb88539892278e4fd64b29 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 27 Feb 2023 09:36:01 -0800 Subject: maple_tree: detect dead nodes in mas_start() When initially starting a search, the root node may already be in the process of being replaced in RCU mode. Detect and restart the walk if this is the case. This is necessary for RCU mode of the maple tree. Link: https://lkml.kernel.org/r/20230227173632.3292573-3-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam Howlett Signed-off-by: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 095b9cb1f4f1..3d53339656e1 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1360,12 +1360,16 @@ static inline struct maple_enode *mas_start(struct ma_state *mas) mas->max = ULONG_MAX; mas->depth = 0; +retry: root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; mas->node = mte_safe_root(root); mas->offset = 0; + if (mte_dead_node(mas->node)) + goto retry; + return NULL; } -- cgit From 2e5b4921f8efc9e845f4f04741797d16f36847eb Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 27 Feb 2023 09:36:02 -0800 Subject: maple_tree: fix freeing of nodes in rcu mode The walk to destroy the nodes was not always setting the node type and would result in a destroy method potentially using the values as nodes. Avoid this by setting the correct node types. This is necessary for the RCU mode of the maple tree. Link: https://lkml.kernel.org/r/20230227173632.3292573-4-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam Howlett Signed-off-by: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3d53339656e1..946acda29521 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -902,6 +902,44 @@ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, meta->end = end; } +/* + * mas_clear_meta() - clear the metadata information of a node, if it exists + * @mas: The maple state + * @mn: The maple node + * @mt: The maple node type + * @offset: The offset of the highest sub-gap in this node. + * @end: The end of the data in this node. 
+ */ +static inline void mas_clear_meta(struct ma_state *mas, struct maple_node *mn, + enum maple_type mt) +{ + struct maple_metadata *meta; + unsigned long *pivots; + void __rcu **slots; + void *next; + + switch (mt) { + case maple_range_64: + pivots = mn->mr64.pivot; + if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) { + slots = mn->mr64.slot; + next = mas_slot_locked(mas, slots, + MAPLE_RANGE64_SLOTS - 1); + if (unlikely((mte_to_node(next) && mte_node_type(next)))) + return; /* The last slot is a node, no metadata */ + } + fallthrough; + case maple_arange_64: + meta = ma_meta(mn, mt); + break; + default: + return; + } + + meta->gap = 0; + meta->end = 0; +} + /* * ma_meta_end() - Get the data end of a node from the metadata * @mn: The maple node @@ -5441,20 +5479,22 @@ no_gap: * mas_dead_leaves() - Mark all leaves of a node as dead. * @mas: The maple state * @slots: Pointer to the slot array + * @type: The maple node type * * Must hold the write lock. * * Return: The number of leaves marked as dead. */ static inline -unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots) +unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots, + enum maple_type mt) { struct maple_node *node; enum maple_type type; void *entry; int offset; - for (offset = 0; offset < mt_slot_count(mas->node); offset++) { + for (offset = 0; offset < mt_slots[mt]; offset++) { entry = mas_slot_locked(mas, slots, offset); type = mte_node_type(entry); node = mte_to_node(entry); @@ -5473,14 +5513,13 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots) static void __rcu **mas_dead_walk(struct ma_state *mas, unsigned char offset) { - struct maple_node *node, *next; + struct maple_node *next; void __rcu **slots = NULL; next = mas_mn(mas); do { - mas->node = ma_enode_ptr(next); - node = mas_mn(mas); - slots = ma_slots(node, node->type); + mas->node = mt_mk_node(next, next->type); + slots = ma_slots(next, next->type); next = mas_slot_locked(mas, slots, offset); offset = 0; } while (!ma_is_leaf(next->type)); @@ -5544,11 +5583,14 @@ static inline void __rcu **mas_destroy_descend(struct ma_state *mas, node = mas_mn(mas); slots = ma_slots(node, mte_node_type(mas->node)); next = mas_slot_locked(mas, slots, 0); - if ((mte_dead_node(next))) + if ((mte_dead_node(next))) { + mte_to_node(next)->type = mte_node_type(next); next = mas_slot_locked(mas, slots, 1); + } mte_set_node_dead(mas->node); node->type = mte_node_type(mas->node); + mas_clear_meta(mas, node, node->type); node->piv_parent = prev; node->parent_slot = offset; offset = 0; @@ -5568,13 +5610,18 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, MA_STATE(mas, &mt, 0, 0); - if (mte_is_leaf(enode)) + mas.node = enode; + if (mte_is_leaf(enode)) { + node->type = mte_node_type(enode); goto free_leaf; + } + ma_flags &= ~MT_FLAGS_LOCK_MASK; mt_init_flags(&mt, ma_flags); mas_lock(&mas); - mas.node = start = enode; + mte_to_node(enode)->ma_flags = ma_flags; + start = enode; slots = mas_destroy_descend(&mas, start, 0); node = mas_mn(&mas); do { @@ -5582,7 +5629,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, unsigned char offset; struct maple_enode *parent, *tmp; - node->slot_len = mas_dead_leaves(&mas, slots); + node->type = mte_node_type(mas.node); + node->slot_len = mas_dead_leaves(&mas, slots, node->type); if (free) mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; @@ -5606,7 +5654,8 @@ next: } while (start != mas.node); node = mas_mn(&mas); - 
node->slot_len = mas_dead_leaves(&mas, slots); + node->type = mte_node_type(mas.node); + node->slot_len = mas_dead_leaves(&mas, slots, node->type); if (free) mt_free_bulk(node->slot_len, slots); @@ -5616,6 +5665,8 @@ start_slots_free: free_leaf: if (free) mt_free_rcu(&node->rcu); + else + mas_clear_meta(&mas, node, node->type); } /* -- cgit From 8372f4d83f96f35915106093cde4565836587123 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 27 Feb 2023 09:36:03 -0800 Subject: maple_tree: remove extra smp_wmb() from mas_dead_leaves() The call to mte_set_dead_node() before the smp_wmb() already calls smp_wmb() so this is not needed. This is an optimization for the RCU mode of the maple tree. Link: https://lkml.kernel.org/r/20230227173632.3292573-5-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam Howlett Signed-off-by: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 1 - 1 file changed, 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 946acda29521..96d673e4ba5b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5503,7 +5503,6 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots, break; mte_set_node_dead(entry); - smp_wmb(); /* Needed for RCU */ node->type = type; rcu_assign_pointer(slots[offset], node); } -- cgit From c13af03de46ba27674dd9fb31a17c0d480081139 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 27 Feb 2023 09:36:04 -0800 Subject: maple_tree: fix write memory barrier of nodes once dead for RCU mode During the development of the maple tree, the strategy of freeing multiple nodes changed and, in the process, the pivots were reused to store pointers to dead nodes. To ensure the readers see accurate pivots, the writers need to mark the nodes as dead and call smp_wmb() to ensure any readers can identify the node as dead before using the pivot values. There were two places where the old method of marking the node as dead without smp_wmb() were being used, which resulted in RCU readers seeing the wrong pivot value before seeing the node was dead. Fix this race condition by using mte_set_node_dead() which has the smp_wmb() call to ensure the race is closed. Add a WARN_ON() to the ma_free_rcu() call to ensure all nodes being freed are marked as dead to ensure there are no other call paths besides the two updated paths. This is necessary for the RCU mode of the maple tree. Link: https://lkml.kernel.org/r/20230227173632.3292573-6-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. 
Howlett Signed-off-by: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 96d673e4ba5b..5202d89ba56e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -185,7 +185,7 @@ static void mt_free_rcu(struct rcu_head *head) */ static void ma_free_rcu(struct maple_node *node) { - node->parent = ma_parent_ptr(node); + WARN_ON(node->parent != ma_parent_ptr(node)); call_rcu(&node->rcu, mt_free_rcu); } @@ -1778,8 +1778,10 @@ static inline void mas_replace(struct ma_state *mas, bool advanced) rcu_assign_pointer(slots[offset], mas->node); } - if (!advanced) + if (!advanced) { + mte_set_node_dead(old_enode); mas_free(mas, old_enode); + } } /* @@ -4218,6 +4220,7 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas) done: mas_leaf_set_meta(mas, newnode, dst_pivots, maple_leaf_64, new_end); if (in_rcu) { + mte_set_node_dead(mas->node); mas->node = mt_mk_node(newnode, wr_mas->type); mas_replace(mas, false); } else { -- cgit From 0a2b18d948838e16912b3b627b504ab062b7d02a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 27 Feb 2023 09:36:05 -0800 Subject: maple_tree: add smp_rmb() to dead node detection Add an smp_rmb() before reading the parent pointer to ensure that anything read from the node prior to the parent pointer hasn't been reordered ahead of this check. The is necessary for RCU mode. Link: https://lkml.kernel.org/r/20230227173632.3292573-7-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Signed-off-by: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 5202d89ba56e..72c89eb03393 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -539,9 +539,11 @@ static inline struct maple_node *mte_parent(const struct maple_enode *enode) */ static inline bool ma_dead_node(const struct maple_node *node) { - struct maple_node *parent = (void *)((unsigned long) - node->parent & ~MAPLE_NODE_MASK); + struct maple_node *parent; + /* Do not reorder reads from the node prior to the parent check */ + smp_rmb(); + parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK); return (parent == node); } @@ -556,6 +558,8 @@ static inline bool mte_dead_node(const struct maple_enode *enode) struct maple_node *parent, *node; node = mte_to_node(enode); + /* Do not reorder reads from the node prior to the parent check */ + smp_rmb(); parent = mte_parent(enode); return (parent == node); } -- cgit From 790e1fa86b340c2bd4a327e01c161f7a1ad885f6 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 27 Feb 2023 09:36:06 -0800 Subject: maple_tree: add RCU lock checking to rcu callback functions Dereferencing RCU objects within the RCU callback without the RCU check has caused lockdep to complain. Fix the RCU dereferencing by using the RCU callback lock to ensure the operation is safe. Also stop creating a new lock to use for dereferencing during destruction of the tree or subtree. Instead, pass through a pointer to the tree that has the lock that is held for RCU dereferencing checking. It also does not make sense to use the maple state in the freeing scenario as the tree walk is a special case where the tree no longer has the normal encodings and parent pointers. 
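As a condensed sketch of the dereference pattern the hunks below converge on (simplified from the mt_free_walk()/mte_dead_walk() changes; the function name and the omitted walk are illustrative):

	static void example_free_walk(struct rcu_head *head)
	{
		struct maple_node *node = container_of(head, struct maple_node, rcu);
		void __rcu **slots = ma_slots(node, node->type);
		void *entry;

		/* In the RCU callback no readers remain, but lockdep still wants an
		 * explicit justification for dereferencing the protected slot.
		 */
		entry = rcu_dereference_protected(slots[0],
						  lock_is_held(&rcu_callback_map));
		/* ... continue walking/freeing via entry ... */
	}

Outside the callback context, the same dereference goes through mt_slot_locked(), which passes mt_locked(mt) so that the tree's own lock is what lockdep checks.
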
Link: https://lkml.kernel.org/r/20230227173632.3292573-8-surenb@google.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Reported-by: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 188 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 96 insertions(+), 92 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 72c89eb03393..b1db0bd71aed 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -824,6 +824,11 @@ static inline void *mt_slot(const struct maple_tree *mt, return rcu_dereference_check(slots[offset], mt_locked(mt)); } +static inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, + unsigned char offset) +{ + return rcu_dereference_protected(slots[offset], mt_locked(mt)); +} /* * mas_slot_locked() - Get the slot value when holding the maple tree lock. * @mas: The maple state @@ -835,7 +840,7 @@ static inline void *mt_slot(const struct maple_tree *mt, static inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots, unsigned char offset) { - return rcu_dereference_protected(slots[offset], mt_locked(mas->tree)); + return mt_slot_locked(mas->tree, slots, offset); } /* @@ -907,34 +912,35 @@ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, } /* - * mas_clear_meta() - clear the metadata information of a node, if it exists - * @mas: The maple state + * mt_clear_meta() - clear the metadata information of a node, if it exists + * @mt: The maple tree * @mn: The maple node - * @mt: The maple node type + * @type: The maple node type * @offset: The offset of the highest sub-gap in this node. * @end: The end of the data in this node. */ -static inline void mas_clear_meta(struct ma_state *mas, struct maple_node *mn, - enum maple_type mt) +static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn, + enum maple_type type) { struct maple_metadata *meta; unsigned long *pivots; void __rcu **slots; void *next; - switch (mt) { + switch (type) { case maple_range_64: pivots = mn->mr64.pivot; if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) { slots = mn->mr64.slot; - next = mas_slot_locked(mas, slots, - MAPLE_RANGE64_SLOTS - 1); - if (unlikely((mte_to_node(next) && mte_node_type(next)))) - return; /* The last slot is a node, no metadata */ + next = mt_slot_locked(mt, slots, + MAPLE_RANGE64_SLOTS - 1); + if (unlikely((mte_to_node(next) && + mte_node_type(next)))) + return; /* no metadata, could be node */ } fallthrough; case maple_arange_64: - meta = ma_meta(mn, mt); + meta = ma_meta(mn, type); break; default: return; @@ -5483,7 +5489,7 @@ no_gap: } /* - * mas_dead_leaves() - Mark all leaves of a node as dead. + * mte_dead_leaves() - Mark all leaves of a node as dead. * @mas: The maple state * @slots: Pointer to the slot array * @type: The maple node type @@ -5493,16 +5499,16 @@ no_gap: * Return: The number of leaves marked as dead. 
*/ static inline -unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots, - enum maple_type mt) +unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt, + void __rcu **slots) { struct maple_node *node; enum maple_type type; void *entry; int offset; - for (offset = 0; offset < mt_slots[mt]; offset++) { - entry = mas_slot_locked(mas, slots, offset); + for (offset = 0; offset < mt_slot_count(enode); offset++) { + entry = mt_slot(mt, slots, offset); type = mte_node_type(entry); node = mte_to_node(entry); /* Use both node and type to catch LE & BE metadata */ @@ -5517,162 +5523,160 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots, return offset; } -static void __rcu **mas_dead_walk(struct ma_state *mas, unsigned char offset) +/** + * mte_dead_walk() - Walk down a dead tree to just before the leaves + * @enode: The maple encoded node + * @offset: The starting offset + * + * Note: This can only be used from the RCU callback context. + */ +static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset) { - struct maple_node *next; + struct maple_node *node, *next; void __rcu **slots = NULL; - next = mas_mn(mas); + next = mte_to_node(*enode); do { - mas->node = mt_mk_node(next, next->type); - slots = ma_slots(next, next->type); - next = mas_slot_locked(mas, slots, offset); + *enode = ma_enode_ptr(next); + node = mte_to_node(*enode); + slots = ma_slots(node, node->type); + next = rcu_dereference_protected(slots[offset], + lock_is_held(&rcu_callback_map)); offset = 0; } while (!ma_is_leaf(next->type)); return slots; } +/** + * mt_free_walk() - Walk & free a tree in the RCU callback context + * @head: The RCU head that's within the node. + * + * Note: This can only be used from the RCU callback context. 
+ */ static void mt_free_walk(struct rcu_head *head) { void __rcu **slots; struct maple_node *node, *start; - struct maple_tree mt; + struct maple_enode *enode; unsigned char offset; enum maple_type type; - MA_STATE(mas, &mt, 0, 0); node = container_of(head, struct maple_node, rcu); if (ma_is_leaf(node->type)) goto free_leaf; - mt_init_flags(&mt, node->ma_flags); - mas_lock(&mas); start = node; - mas.node = mt_mk_node(node, node->type); - slots = mas_dead_walk(&mas, 0); - node = mas_mn(&mas); + enode = mt_mk_node(node, node->type); + slots = mte_dead_walk(&enode, 0); + node = mte_to_node(enode); do { mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; - mas.node = node->piv_parent; - if (mas_mn(&mas) == node) - goto start_slots_free; - - type = mte_node_type(mas.node); - slots = ma_slots(mte_to_node(mas.node), type); - if ((offset < mt_slots[type]) && (slots[offset])) - slots = mas_dead_walk(&mas, offset); - - node = mas_mn(&mas); + enode = node->piv_parent; + if (mte_to_node(enode) == node) + goto free_leaf; + + type = mte_node_type(enode); + slots = ma_slots(mte_to_node(enode), type); + if ((offset < mt_slots[type]) && + rcu_dereference_protected(slots[offset], + lock_is_held(&rcu_callback_map))) + slots = mte_dead_walk(&enode, offset); + node = mte_to_node(enode); } while ((node != start) || (node->slot_len < offset)); slots = ma_slots(node, node->type); mt_free_bulk(node->slot_len, slots); -start_slots_free: - mas_unlock(&mas); free_leaf: mt_free_rcu(&node->rcu); } -static inline void __rcu **mas_destroy_descend(struct ma_state *mas, - struct maple_enode *prev, unsigned char offset) +static inline void __rcu **mte_destroy_descend(struct maple_enode **enode, + struct maple_tree *mt, struct maple_enode *prev, unsigned char offset) { struct maple_node *node; - struct maple_enode *next = mas->node; + struct maple_enode *next = *enode; void __rcu **slots = NULL; + enum maple_type type; + unsigned char next_offset = 0; do { - mas->node = next; - node = mas_mn(mas); - slots = ma_slots(node, mte_node_type(mas->node)); - next = mas_slot_locked(mas, slots, 0); - if ((mte_dead_node(next))) { - mte_to_node(next)->type = mte_node_type(next); - next = mas_slot_locked(mas, slots, 1); - } + *enode = next; + node = mte_to_node(*enode); + type = mte_node_type(*enode); + slots = ma_slots(node, type); + next = mt_slot_locked(mt, slots, next_offset); + if ((mte_dead_node(next))) + next = mt_slot_locked(mt, slots, ++next_offset); - mte_set_node_dead(mas->node); - node->type = mte_node_type(mas->node); - mas_clear_meta(mas, node, node->type); + mte_set_node_dead(*enode); + node->type = type; node->piv_parent = prev; node->parent_slot = offset; - offset = 0; - prev = mas->node; + offset = next_offset; + next_offset = 0; + prev = *enode; } while (!mte_is_leaf(next)); return slots; } -static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, +static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free) { void __rcu **slots; struct maple_node *node = mte_to_node(enode); struct maple_enode *start; - struct maple_tree mt; - - MA_STATE(mas, &mt, 0, 0); - mas.node = enode; if (mte_is_leaf(enode)) { node->type = mte_node_type(enode); goto free_leaf; } - ma_flags &= ~MT_FLAGS_LOCK_MASK; - mt_init_flags(&mt, ma_flags); - mas_lock(&mas); - - mte_to_node(enode)->ma_flags = ma_flags; start = enode; - slots = mas_destroy_descend(&mas, start, 0); - node = mas_mn(&mas); + slots = mte_destroy_descend(&enode, mt, start, 0); + node = mte_to_node(enode); // 
Updated in the above call. do { enum maple_type type; unsigned char offset; struct maple_enode *parent, *tmp; - node->type = mte_node_type(mas.node); - node->slot_len = mas_dead_leaves(&mas, slots, node->type); + node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; - mas.node = node->piv_parent; - if (mas_mn(&mas) == node) - goto start_slots_free; + enode = node->piv_parent; + if (mte_to_node(enode) == node) + goto free_leaf; - type = mte_node_type(mas.node); - slots = ma_slots(mte_to_node(mas.node), type); + type = mte_node_type(enode); + slots = ma_slots(mte_to_node(enode), type); if (offset >= mt_slots[type]) goto next; - tmp = mas_slot_locked(&mas, slots, offset); + tmp = mt_slot_locked(mt, slots, offset); if (mte_node_type(tmp) && mte_to_node(tmp)) { - parent = mas.node; - mas.node = tmp; - slots = mas_destroy_descend(&mas, parent, offset); + parent = enode; + enode = tmp; + slots = mte_destroy_descend(&enode, mt, parent, offset); } next: - node = mas_mn(&mas); - } while (start != mas.node); + node = mte_to_node(enode); + } while (start != enode); - node = mas_mn(&mas); - node->type = mte_node_type(mas.node); - node->slot_len = mas_dead_leaves(&mas, slots, node->type); + node = mte_to_node(enode); + node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); -start_slots_free: - mas_unlock(&mas); - free_leaf: if (free) mt_free_rcu(&node->rcu); else - mas_clear_meta(&mas, node, node->type); + mt_clear_meta(mt, node, node->type); } /* @@ -5688,10 +5692,10 @@ static inline void mte_destroy_walk(struct maple_enode *enode, struct maple_node *node = mte_to_node(enode); if (mt_in_rcu(mt)) { - mt_destroy_walk(enode, mt->ma_flags, false); + mt_destroy_walk(enode, mt, false); call_rcu(&node->rcu, mt_free_walk); } else { - mt_destroy_walk(enode, mt->ma_flags, true); + mt_destroy_walk(enode, mt, true); } } -- cgit From ec07967d7523adb3670f9dfee0232e3bc868f3de Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 14 Mar 2023 20:42:01 +0800 Subject: maple_tree: fix get wrong data_end in mtree_lookup_walk() if (likely(offset > end)) max = pivots[offset]; The above code should be changed to if (likely(offset < end)), which is correct. This affects the correctness of ma_data_end(). Now it seems that the final result will not be wrong, but it is best to change it. This patch does not change the code as above, because it simplifies the code by the way. Link: https://lkml.kernel.org/r/20230314124203.91572-1-zhangpeng.00@bytedance.com Link: https://lkml.kernel.org/r/20230314124203.91572-2-zhangpeng.00@bytedance.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Peng Zhang Reviewed-by: Liam R. 
Howlett Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b1db0bd71aed..b8a230f5d94e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3941,18 +3941,13 @@ static inline void *mtree_lookup_walk(struct ma_state *mas) end = ma_data_end(node, type, pivots, max); if (unlikely(ma_dead_node(node))) goto dead_node; - - if (pivots[offset] >= mas->index) - goto next; - do { - offset++; - } while ((offset < end) && (pivots[offset] < mas->index)); - - if (likely(offset > end)) - max = pivots[offset]; + if (pivots[offset] >= mas->index) { + max = pivots[offset]; + break; + } + } while (++offset < end); -next: slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) -- cgit From c45ea315a602d45569b08b93e9ab30f6a63a38aa Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 14 Mar 2023 20:42:03 +0800 Subject: maple_tree: fix a potential concurrency bug in RCU mode There is a concurrency bug that may cause the wrong value to be loaded when a CPU is modifying the maple tree. CPU1: mtree_insert_range() mas_insert() mas_store_root() ... mas_root_expand() ... rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); ma_set_meta(node, maple_leaf_64, 0, slot); <---IP CPU2: mtree_load() mtree_lookup_walk() ma_data_end(); When CPU1 is about to execute the instruction pointed to by IP, the ma_data_end() executed by CPU2 may return the wrong end position, which will cause the value loaded by mtree_load() to be wrong. An example of triggering the bug: Add mdelay(100) between rcu_assign_pointer() and ma_set_meta() in mas_root_expand(). static DEFINE_MTREE(tree); int work(void *p) { unsigned long val; for (int i = 0 ; i< 30; ++i) { val = (unsigned long)mtree_load(&tree, 8); mdelay(5); pr_info("%lu",val); } return 0; } mt_init_flags(&tree, MT_FLAGS_USE_RCU); mtree_insert(&tree, 0, (void*)12345, GFP_KERNEL); run_thread(work) mtree_insert(&tree, 1, (void*)56789, GFP_KERNEL); In RCU mode, mtree_load() should always return the value before or after the data structure is modified, and in this example mtree_load(&tree, 8) may return 56789 which is not expected, it should always return NULL. Fix it by put ma_set_meta() before rcu_assign_pointer(). Link: https://lkml.kernel.org/r/20230314124203.91572-4-zhangpeng.00@bytedance.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b8a230f5d94e..db60edb55f2f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3725,10 +3725,9 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry) slot++; mas->depth = 1; mas_set_height(mas); - + ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); - ma_set_meta(node, maple_leaf_64, 0, slot); return slot; } -- cgit From 4f80818b4a58c9458dce0df7cce9abe107da445e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 22 Mar 2023 18:57:03 +0000 Subject: iov_iter: add copy_page_to_iter_nofault() Provide a means to copy a page to user space from an iterator, aborting if a page fault would occur. 
This supports compound pages, but may be passed a tail page with an offset extending further into the compound page, so we cannot pass a folio. This allows for this function to be called from atomic context and _try_ to user pages if they are faulted in, aborting if not. The function does not use _copy_to_iter() in order to not specify might_fault(), this is similar to copy_page_from_iter_atomic(). This is being added in order that an iteratable form of vread() can be implemented while holding spinlocks. Link: https://lkml.kernel.org/r/19734729defb0f498a76bdec1bef3ac48a3af3e8.1679511146.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Baoquan He Cc: Alexander Viro Cc: David Hildenbrand Cc: Jens Axboe Cc: Jiri Olsa Cc: Liu Shixin Cc: Matthew Wilcox (Oracle) Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- lib/iov_iter.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'lib') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 274014e4eafe..34dd6bdf2fba 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -172,6 +172,18 @@ static int copyout(void __user *to, const void *from, size_t n) return n; } +static int copyout_nofault(void __user *to, const void *from, size_t n) +{ + long res; + + if (should_fail_usercopy()) + return n; + + res = copy_to_user_nofault(to, from, n); + + return res < 0 ? n : res; +} + static int copyin(void *to, const void __user *from, size_t n) { size_t res = n; @@ -734,6 +746,42 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, } EXPORT_SYMBOL(copy_page_to_iter); +size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, + struct iov_iter *i) +{ + size_t res = 0; + + if (!page_copy_sane(page, offset, bytes)) + return 0; + if (WARN_ON_ONCE(i->data_source)) + return 0; + if (unlikely(iov_iter_is_pipe(i))) + return copy_page_to_iter_pipe(page, offset, bytes, i); + page += offset / PAGE_SIZE; // first subpage + offset %= PAGE_SIZE; + while (1) { + void *kaddr = kmap_local_page(page); + size_t n = min(bytes, (size_t)PAGE_SIZE - offset); + + iterate_and_advance(i, n, base, len, off, + copyout_nofault(base, kaddr + offset + off, len), + memcpy(base, kaddr + offset + off, len) + ) + kunmap_local(kaddr); + res += n; + bytes -= n; + if (!bytes || !n) + break; + offset += n; + if (offset == PAGE_SIZE) { + page++; + offset = 0; + } + } + return res; +} +EXPORT_SYMBOL(copy_page_to_iter_nofault); + size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { -- cgit From 70e79866ab36feaaed8ef26dacfbcbac6a0631c9 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 28 Feb 2023 15:14:17 +0300 Subject: ELF: fix all "Elf" typos ELF is acronym and therefore should be spelled in all caps. I left one exception at Documentation/arm/nwfpe/nwfpe.rst which looks like being written in the first person. 
Link: https://lkml.kernel.org/r/Y/3wGWQviIOkyLJW@p183 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- lib/buildid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/buildid.c b/lib/buildid.c index dfc62625cae4..e3a7acdeef0e 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -163,7 +163,7 @@ out: /** * build_id_parse_buf - Get build ID from a buffer - * @buf: Elf note section(s) to parse + * @buf: ELF note section(s) to parse * @buf_size: Size of @buf in bytes * @build_id: Build ID parsed from @buf, at least BUILD_ID_SIZE_MAX long * -- cgit From ef55ef3e6400ede7d4020f5fd0bc7aeac4de1ceb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 27 Mar 2023 17:26:04 +0300 Subject: lib/test-string_helpers: replace UNESCAPE_ANY by UNESCAPE_ALL_MASK When we get a random number to generate a flag in the valid range of UNESCAPE flags, use UNESCAPE_ALL_MASK, It's more correct and prevents from missed updates of the test coverage in the future if any. Link: https://lkml.kernel.org/r/20230327142604.48213-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- lib/test-string_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c index 41d3447bc3b4..9a68849a5d55 100644 --- a/lib/test-string_helpers.c +++ b/lib/test-string_helpers.c @@ -587,7 +587,7 @@ static int __init test_string_helpers_init(void) for (i = 0; i < UNESCAPE_ALL_MASK + 1; i++) test_string_unescape("unescape", i, false); test_string_unescape("unescape inplace", - get_random_u32_below(UNESCAPE_ANY + 1), true); + get_random_u32_below(UNESCAPE_ALL_MASK + 1), true); /* Without dictionary */ for (i = 0; i < ESCAPE_ALL_MASK + 1; i++) -- cgit From 50f9a76ef127367847cf62999c79304e48018cfa Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 12 Apr 2023 10:46:48 -0600 Subject: iov_iter: Mark copy_compat_iovec_from_user() noinline After commit 6376ce56feb6 ("iov_iter: import single vector iovecs as ITER_UBUF"), GCC does an inter-procedural compiler optimization which moves the user_access_begin() out of copy_compat_iovec_from_user() and into its callers: lib/iov_iter.o: warning: objtool: .altinstr_replacement+0x0: redundant UACCESS disable lib/iov_iter.o: warning: objtool: iovec_from_user.part.0+0xc7: call to copy_compat_iovec_from_user.part.0() with UACCESS enabled lib/iov_iter.o: warning: objtool: __import_iovec+0x21d: call to copy_compat_iovec_from_user.part.0() with UACCESS enabled Enforce the "no UACCESS enable across function boundaries" rule by disabling cloning for copy_compat_iovec_from_user(). 
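A minimal sketch of the rule being enforced (the function below is illustrative and not the real iovec import; __noclone, user_access_begin()/user_access_end() and unsafe_get_user() are the only pieces taken as given):

	/* Keep the UACCESS window inside a single function; __noclone keeps GCC
	 * from splitting it into a constant-propagated clone whose begin/end
	 * objtool can no longer pair up.
	 */
	static __noclone int copy_u32s_from_user(u32 *dst, const u32 __user *src,
						 unsigned long n)
	{
		unsigned long i;

		if (!user_access_begin(src, n * sizeof(*src)))
			return -EFAULT;
		for (i = 0; i < n; i++)
			unsafe_get_user(dst[i], &src[i], Efault);
		user_access_end();
		return 0;
	Efault:
		user_access_end();
		return -EFAULT;
	}
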
Fixes: 6376ce56feb6 ("iov_iter: import single vector iovecs as ITER_UBUF") Reported-by: Stephen Rothwell https://lkml.kernel.org/lkml/20230327120017.6bb826d7@canb.auug.org.au Signed-off-by: Josh Poimboeuf Tested-by: Jens Axboe Signed-off-by: Jens Axboe --- lib/iov_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 3e6c9bcfa612..86a066aa9bcc 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1702,7 +1702,7 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) } EXPORT_SYMBOL(dup_iter); -static int copy_compat_iovec_from_user(struct iovec *iov, +static __noclone int copy_compat_iovec_from_user(struct iovec *iov, const struct iovec __user *uvec, unsigned long nr_segs) { const struct compat_iovec __user *uiov = -- cgit From 4668c7a2940d134bea50058e138591b97485c5da Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 27 Mar 2023 23:37:32 +0900 Subject: fault-inject: allow configuration via configfs This provides a helper function to allow configuration of fault-injection for configfs-based drivers. The config items created by this function have the same interface as the one created under debugfs by fault_create_debugfs_attr(). Signed-off-by: Akinobu Mita Link: https://lore.kernel.org/r/20230327143733.14599-2-akinobu.mita@gmail.com Signed-off-by: Jens Axboe --- lib/Kconfig.debug | 13 +++- lib/fault-inject.c | 191 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 203 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c8b379e2e9ad..e700b29d7756 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1958,9 +1958,20 @@ config FAIL_SUNRPC Provide fault-injection capability for SunRPC and its consumers. +config FAULT_INJECTION_CONFIGFS + bool "Configfs interface for fault-injection capabilities" + depends on FAULT_INJECTION && CONFIGFS_FS + help + This option allows configfs-based drivers to dynamically configure + fault-injection via configfs. Each parameter for driver-specific + fault-injection can be made visible as a configfs attribute in a + configfs group. 
+ + config FAULT_INJECTION_STACKTRACE_FILTER bool "stacktrace filter for fault-injection capabilities" - depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT + depends on FAULT_INJECTION + depends on (FAULT_INJECTION_DEBUG_FS || FAULT_INJECTION_CONFIGFS) && STACKTRACE_SUPPORT select STACKTRACE depends on FRAME_POINTER || MIPS || PPC || S390 || MICROBLAZE || ARM || ARC || X86 help diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 6cff320c4eb4..d608f9b48c10 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -244,3 +244,194 @@ struct dentry *fault_create_debugfs_attr(const char *name, EXPORT_SYMBOL_GPL(fault_create_debugfs_attr); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#ifdef CONFIG_FAULT_INJECTION_CONFIGFS + +/* These configfs attribute utilities are copied from drivers/block/null_blk/main.c */ + +static ssize_t fault_uint_attr_show(unsigned int val, char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", val); +} + +static ssize_t fault_ulong_attr_show(unsigned long val, char *page) +{ + return snprintf(page, PAGE_SIZE, "%lu\n", val); +} + +static ssize_t fault_bool_attr_show(bool val, char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", val); +} + +static ssize_t fault_atomic_t_attr_show(atomic_t val, char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", atomic_read(&val)); +} + +static ssize_t fault_uint_attr_store(unsigned int *val, const char *page, size_t count) +{ + unsigned int tmp; + int result; + + result = kstrtouint(page, 0, &tmp); + if (result < 0) + return result; + + *val = tmp; + return count; +} + +static ssize_t fault_ulong_attr_store(unsigned long *val, const char *page, size_t count) +{ + int result; + unsigned long tmp; + + result = kstrtoul(page, 0, &tmp); + if (result < 0) + return result; + + *val = tmp; + return count; +} + +static ssize_t fault_bool_attr_store(bool *val, const char *page, size_t count) +{ + bool tmp; + int result; + + result = kstrtobool(page, &tmp); + if (result < 0) + return result; + + *val = tmp; + return count; +} + +static ssize_t fault_atomic_t_attr_store(atomic_t *val, const char *page, size_t count) +{ + int tmp; + int result; + + result = kstrtoint(page, 0, &tmp); + if (result < 0) + return result; + + atomic_set(val, tmp); + return count; +} + +#define CONFIGFS_ATTR_NAMED(_pfx, _name, _attr_name) \ +static struct configfs_attribute _pfx##attr_##_name = { \ + .ca_name = _attr_name, \ + .ca_mode = 0644, \ + .ca_owner = THIS_MODULE, \ + .show = _pfx##_name##_show, \ + .store = _pfx##_name##_store, \ +} + +static struct fault_config *to_fault_config(struct config_item *item) +{ + return container_of(to_config_group(item), struct fault_config, group); +} + +#define FAULT_CONFIGFS_ATTR_NAMED(NAME, ATTR_NAME, MEMBER, TYPE) \ +static ssize_t fault_##NAME##_show(struct config_item *item, char *page) \ +{ \ + return fault_##TYPE##_attr_show(to_fault_config(item)->attr.MEMBER, page); \ +} \ +static ssize_t fault_##NAME##_store(struct config_item *item, const char *page, size_t count) \ +{ \ + struct fault_config *config = to_fault_config(item); \ + return fault_##TYPE##_attr_store(&config->attr.MEMBER, page, count); \ +} \ +CONFIGFS_ATTR_NAMED(fault_, NAME, ATTR_NAME) + +#define FAULT_CONFIGFS_ATTR(NAME, TYPE) \ + FAULT_CONFIGFS_ATTR_NAMED(NAME, __stringify(NAME), NAME, TYPE) + +FAULT_CONFIGFS_ATTR(probability, ulong); +FAULT_CONFIGFS_ATTR(interval, ulong); +FAULT_CONFIGFS_ATTR(times, atomic_t); +FAULT_CONFIGFS_ATTR(space, atomic_t); +FAULT_CONFIGFS_ATTR(verbose, ulong); 
+FAULT_CONFIGFS_ATTR_NAMED(ratelimit_interval, "verbose_ratelimit_interval_ms", + ratelimit_state.interval, uint); +FAULT_CONFIGFS_ATTR_NAMED(ratelimit_burst, "verbose_ratelimit_burst", + ratelimit_state.burst, uint); +FAULT_CONFIGFS_ATTR_NAMED(task_filter, "task-filter", task_filter, bool); + +#ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER + +static ssize_t fault_stacktrace_depth_show(struct config_item *item, char *page) +{ + return fault_ulong_attr_show(to_fault_config(item)->attr.stacktrace_depth, page); +} + +static ssize_t fault_stacktrace_depth_store(struct config_item *item, const char *page, + size_t count) +{ + int result; + unsigned long tmp; + + result = kstrtoul(page, 0, &tmp); + if (result < 0) + return result; + + to_fault_config(item)->attr.stacktrace_depth = + min_t(unsigned long, tmp, MAX_STACK_TRACE_DEPTH); + + return count; +} + +CONFIGFS_ATTR_NAMED(fault_, stacktrace_depth, "stacktrace-depth"); + +static ssize_t fault_xul_attr_show(unsigned long val, char *page) +{ + return snprintf(page, PAGE_SIZE, + sizeof(val) == sizeof(u32) ? "0x%08lx\n" : "0x%016lx\n", val); +} + +static ssize_t fault_xul_attr_store(unsigned long *val, const char *page, size_t count) +{ + return fault_ulong_attr_store(val, page, count); +} + +FAULT_CONFIGFS_ATTR_NAMED(require_start, "require-start", require_start, xul); +FAULT_CONFIGFS_ATTR_NAMED(require_end, "require-end", require_end, xul); +FAULT_CONFIGFS_ATTR_NAMED(reject_start, "reject-start", reject_start, xul); +FAULT_CONFIGFS_ATTR_NAMED(reject_end, "reject-end", reject_end, xul); + +#endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */ + +static struct configfs_attribute *fault_config_attrs[] = { + &fault_attr_probability, + &fault_attr_interval, + &fault_attr_times, + &fault_attr_space, + &fault_attr_verbose, + &fault_attr_ratelimit_interval, + &fault_attr_ratelimit_burst, + &fault_attr_task_filter, +#ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER + &fault_attr_stacktrace_depth, + &fault_attr_require_start, + &fault_attr_require_end, + &fault_attr_reject_start, + &fault_attr_reject_end, +#endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */ + NULL, +}; + +static const struct config_item_type fault_config_type = { + .ct_attrs = fault_config_attrs, + .ct_owner = THIS_MODULE, +}; + +void fault_config_init(struct fault_config *config, const char *name) +{ + config_group_init_type_name(&config->group, name, &fault_config_type); +} +EXPORT_SYMBOL_GPL(fault_config_init); + +#endif /* CONFIG_FAULT_INJECTION_CONFIGFS */ -- cgit From 3714878005d3b7d78c096ad1f1f463887eb9b928 Mon Sep 17 00:00:00 2001 From: Nick Alcock Date: Tue, 7 Mar 2023 18:01:30 +0000 Subject: crypto: remove MODULE_LICENSE in non-modules Since commit 8b41fc4454e ("kbuild: create modules.builtin without Makefile.modbuiltin or tristate.conf"), MODULE_LICENSE declarations are used to identify modules. As a consequence, uses of the macro in non-modules will cause modprobe to misidentify their containing object file as a module when it is not (false positives), and modprobe might succeed rather than failing with a suitable error message. So remove it in the files in this commit, none of which can be built as modules. Signed-off-by: Nick Alcock Suggested-by: Luis Chamberlain Cc: Luis Chamberlain Cc: linux-modules@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Hitomi Hasegawa Cc: Herbert Xu Cc: "David S. 
Miller" Cc: linux-crypto@vger.kernel.org Signed-off-by: Luis Chamberlain --- lib/crypto/blake2s-generic.c | 1 - lib/crypto/blake2s.c | 1 - 2 files changed, 2 deletions(-) (limited to 'lib') diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c index 75ccb3e633e6..4ffe3d927920 100644 --- a/lib/crypto/blake2s-generic.c +++ b/lib/crypto/blake2s-generic.c @@ -110,6 +110,5 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block, EXPORT_SYMBOL(blake2s_compress_generic); -MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("BLAKE2s hash function"); MODULE_AUTHOR("Jason A. Donenfeld "); diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index 98e688c6d891..71a316552cc5 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -67,6 +67,5 @@ static int __init blake2s_mod_init(void) } module_init(blake2s_mod_init); -MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("BLAKE2s hash function"); MODULE_AUTHOR("Jason A. Donenfeld "); -- cgit From ef5bbd1172f4bd7b9162654d9b167e89afe82867 Mon Sep 17 00:00:00 2001 From: Nick Alcock Date: Mon, 20 Mar 2023 10:20:10 +0000 Subject: crypto: blake2s: remove module-related code Now blake2s-generic.c can no longer be a module, drop all remaining module-related code as well. Signed-off-by: Nick Alcock Requested-by: Herbert Xu Cc: Luis Chamberlain Cc: linux-modules@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Hitomi Hasegawa Cc: Herbert Xu Cc: "David S. Miller" Cc: linux-crypto@vger.kernel.org Signed-off-by: Luis Chamberlain --- lib/crypto/blake2s-generic.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'lib') diff --git a/lib/crypto/blake2s-generic.c b/lib/crypto/blake2s-generic.c index 4ffe3d927920..3b6dcfdd9628 100644 --- a/lib/crypto/blake2s-generic.c +++ b/lib/crypto/blake2s-generic.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -109,6 +108,3 @@ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block, } EXPORT_SYMBOL(blake2s_compress_generic); - -MODULE_DESCRIPTION("BLAKE2s hash function"); -MODULE_AUTHOR("Jason A. Donenfeld "); -- cgit From 5e0266f0e5f57617472d5aac4013f58a3ef264ac Mon Sep 17 00:00:00 2001 From: Nick Alcock Date: Tue, 7 Mar 2023 18:01:52 +0000 Subject: lib: remove MODULE_LICENSE in non-modules Since commit 8b41fc4454e ("kbuild: create modules.builtin without Makefile.modbuiltin or tristate.conf"), MODULE_LICENSE declarations are used to identify modules. As a consequence, uses of the macro in non-modules will cause modprobe to misidentify their containing object file as a module when it is not (false positives), and modprobe might succeed rather than failing with a suitable error message. So remove it in the files in this commit, none of which can be built as modules. 
Signed-off-by: Nick Alcock Suggested-by: Luis Chamberlain Acked-by: Jacob Keller Cc: Luis Chamberlain Cc: linux-modules@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Hitomi Hasegawa Cc: Jacob Keller Signed-off-by: Luis Chamberlain --- lib/pldmfw/pldmfw.c | 1 - 1 file changed, 1 deletion(-) (limited to 'lib') diff --git a/lib/pldmfw/pldmfw.c b/lib/pldmfw/pldmfw.c index 6e77eb6d8e72..54e1809a38fd 100644 --- a/lib/pldmfw/pldmfw.c +++ b/lib/pldmfw/pldmfw.c @@ -875,5 +875,4 @@ out_release_data: EXPORT_SYMBOL(pldmfw_flash_image); MODULE_AUTHOR("Jacob Keller "); -MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("PLDM firmware flash update library"); -- cgit From 0c9bf64c5b38ce4feb06a6d360ef0c9280340049 Mon Sep 17 00:00:00 2001 From: Nick Alcock Date: Tue, 7 Mar 2023 18:02:04 +0000 Subject: btree: remove MODULE_LICENSE in non-modules Since commit 8b41fc4454e ("kbuild: create modules.builtin without Makefile.modbuiltin or tristate.conf"), MODULE_LICENSE declarations are used to identify modules. As a consequence, uses of the macro in non-modules will cause modprobe to misidentify their containing object file as a module when it is not (false positives), and modprobe might succeed rather than failing with a suitable error message. So remove it in the files in this commit, none of which can be built as modules. Signed-off-by: Nick Alcock Suggested-by: Luis Chamberlain Cc: Luis Chamberlain Cc: linux-modules@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Hitomi Hasegawa Signed-off-by: Luis Chamberlain --- lib/btree.c | 1 - 1 file changed, 1 deletion(-) (limited to 'lib') diff --git a/lib/btree.c b/lib/btree.c index a82100c73b55..49420cae3a83 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -794,4 +794,3 @@ module_exit(btree_module_exit); MODULE_AUTHOR("Joern Engel "); MODULE_AUTHOR("Johannes Berg "); -MODULE_LICENSE("GPL"); -- cgit From 7f82b39dc3e41bc12a207101d961353875b05b7d Mon Sep 17 00:00:00 2001 From: Nick Alcock Date: Tue, 7 Mar 2023 18:02:05 +0000 Subject: treewide: remove MODULE_LICENSE in non-modules Since commit 8b41fc4454e ("kbuild: create modules.builtin without Makefile.modbuiltin or tristate.conf"), MODULE_LICENSE declarations are used to identify modules. As a consequence, uses of the macro in non-modules will cause modprobe to misidentify their containing object file as a module when it is not (false positives), and modprobe might succeed rather than failing with a suitable error message. So remove it in the files in this commit, none of which can be built as modules. Signed-off-by: Nick Alcock Suggested-by: Luis Chamberlain Cc: Luis Chamberlain Cc: linux-modules@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Hitomi Hasegawa Signed-off-by: Luis Chamberlain --- lib/test_fprobe.c | 1 - 1 file changed, 1 deletion(-) (limited to 'lib') diff --git a/lib/test_fprobe.c b/lib/test_fprobe.c index 1fb56cf5e5ce..fd6153800e56 100644 --- a/lib/test_fprobe.c +++ b/lib/test_fprobe.c @@ -168,4 +168,3 @@ static struct kunit_suite fprobe_test_suite = { kunit_test_suites(&fprobe_test_suite); -MODULE_LICENSE("GPL"); -- cgit From 63a759694eed61025713b3e14dd827c8548daadc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Apr 2023 09:54:39 +0200 Subject: debugobject: Prevent init race with static objects Statically initialized objects are usually not initialized via the init() function of the subsystem. They are special cased and the subsystem provides a function to validate whether an object which is not yet tracked by debugobjects is statically initialized. 
This means the object is started to be tracked on first use, e.g. activation. This works perfectly fine, unless there are two concurrent operations on that object. Schspa decoded the problem: T0 T1 debug_object_assert_init(addr) lock_hash_bucket() obj = lookup_object(addr); if (!obj) { unlock_hash_bucket(); - > preemption lock_subsytem_object(addr); activate_object(addr) lock_hash_bucket(); obj = lookup_object(addr); if (!obj) { unlock_hash_bucket(); if (is_static_object(addr)) init_and_track(addr); lock_hash_bucket(); obj = lookup_object(addr); obj->state = ACTIVATED; unlock_hash_bucket(); subsys function modifies content of addr, so static object detection does not longer work. unlock_subsytem_object(addr); if (is_static_object(addr)) <- Fails debugobject emits a warning and invokes the fixup function which reinitializes the already active object in the worst case. This race exists forever, but was never observed until mod_timer() got a debug_object_assert_init() added which is outside of the timer base lock held section right at the beginning of the function to cover the lockless early exit points too. Rework the code so that the lookup, the static object check and the tracking object association happens atomically under the hash bucket lock. This prevents the issue completely as all callers are serialized on the hash bucket lock and therefore cannot observe inconsistent state. Fixes: 3ac7fe5a4aab ("infrastructure to debug (dynamic) objects") Reported-by: syzbot+5093ba19745994288b53@syzkaller.appspotmail.com Debugged-by: Schspa Shi Signed-off-by: Thomas Gleixner Reviewed-by: Stephen Boyd Link: https://syzkaller.appspot.com/bug?id=22c8a5938eab640d1c6bcc0e3dc7be519d878462 Link: https://lore.kernel.org/lkml/20230303161906.831686-1-schspa@gmail.com Link: https://lore.kernel.org/r/87zg7dzgao.ffs@tglx --- lib/debugobjects.c | 125 ++++++++++++++++++++++++++++------------------------- 1 file changed, 66 insertions(+), 59 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index df86e649d8be..b796799fadb2 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -216,10 +216,6 @@ static struct debug_obj *__alloc_object(struct hlist_head *list) return obj; } -/* - * Allocate a new object. If the pool is empty, switch off the debugger. - * Must be called with interrupts disabled. - */ static struct debug_obj * alloc_object(void *addr, struct debug_bucket *b, const struct debug_obj_descr *descr) { @@ -552,11 +548,49 @@ static void debug_object_is_on_stack(void *addr, int onstack) WARN_ON(1); } +static struct debug_obj *lookup_object_or_alloc(void *addr, struct debug_bucket *b, + const struct debug_obj_descr *descr, + bool onstack, bool alloc_ifstatic) +{ + struct debug_obj *obj = lookup_object(addr, b); + enum debug_obj_state state = ODEBUG_STATE_NONE; + + if (likely(obj)) + return obj; + + /* + * debug_object_init() unconditionally allocates untracked + * objects. It does not matter whether it is a static object or + * not. + * + * debug_object_assert_init() and debug_object_activate() allow + * allocation only if the descriptor callback confirms that the + * object is static and considered initialized. For non-static + * objects the allocation needs to be done from the fixup callback. 
+ */ + if (unlikely(alloc_ifstatic)) { + if (!descr->is_static_object || !descr->is_static_object(addr)) + return ERR_PTR(-ENOENT); + /* Statically allocated objects are considered initialized */ + state = ODEBUG_STATE_INIT; + } + + obj = alloc_object(addr, b, descr); + if (likely(obj)) { + obj->state = state; + debug_object_is_on_stack(addr, onstack); + return obj; + } + + /* Out of memory. Do the cleanup outside of the locked region */ + debug_objects_enabled = 0; + return NULL; +} + static void __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack) { enum debug_obj_state state; - bool check_stack = false; struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; @@ -572,16 +606,11 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack raw_spin_lock_irqsave(&db->lock, flags); - obj = lookup_object(addr, db); - if (!obj) { - obj = alloc_object(addr, db, descr); - if (!obj) { - debug_objects_enabled = 0; - raw_spin_unlock_irqrestore(&db->lock, flags); - debug_objects_oom(); - return; - } - check_stack = true; + obj = lookup_object_or_alloc(addr, db, descr, onstack, false); + if (unlikely(!obj)) { + raw_spin_unlock_irqrestore(&db->lock, flags); + debug_objects_oom(); + return; } switch (obj->state) { @@ -607,8 +636,6 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack } raw_spin_unlock_irqrestore(&db->lock, flags); - if (check_stack) - debug_object_is_on_stack(addr, onstack); } /** @@ -648,14 +675,12 @@ EXPORT_SYMBOL_GPL(debug_object_init_on_stack); */ int debug_object_activate(void *addr, const struct debug_obj_descr *descr) { + struct debug_obj o = { .object = addr, .state = ODEBUG_STATE_NOTAVAILABLE, .descr = descr }; enum debug_obj_state state; struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; int ret; - struct debug_obj o = { .object = addr, - .state = ODEBUG_STATE_NOTAVAILABLE, - .descr = descr }; if (!debug_objects_enabled) return 0; @@ -664,8 +689,8 @@ int debug_object_activate(void *addr, const struct debug_obj_descr *descr) raw_spin_lock_irqsave(&db->lock, flags); - obj = lookup_object(addr, db); - if (obj) { + obj = lookup_object_or_alloc(addr, db, descr, false, true); + if (likely(!IS_ERR_OR_NULL(obj))) { bool print_object = false; switch (obj->state) { @@ -698,24 +723,16 @@ int debug_object_activate(void *addr, const struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); - /* - * We are here when a static object is activated. We - * let the type specific code confirm whether this is - * true or not. if true, we just make sure that the - * static object is tracked in the object tracker. If - * not, this must be a bug, so we try to fix it up. - */ - if (descr->is_static_object && descr->is_static_object(addr)) { - /* track this static object */ - debug_object_init(addr, descr); - debug_object_activate(addr, descr); - } else { - debug_print_object(&o, "activate"); - ret = debug_object_fixup(descr->fixup_activate, addr, - ODEBUG_STATE_NOTAVAILABLE); - return ret ? 0 : -EINVAL; + /* If NULL the allocation has hit OOM */ + if (!obj) { + debug_objects_oom(); + return 0; } - return 0; + + /* Object is neither static nor tracked. It's not initialized */ + debug_print_object(&o, "activate"); + ret = debug_object_fixup(descr->fixup_activate, addr, ODEBUG_STATE_NOTAVAILABLE); + return ret ? 
0 : -EINVAL; } EXPORT_SYMBOL_GPL(debug_object_activate); @@ -869,6 +886,7 @@ EXPORT_SYMBOL_GPL(debug_object_free); */ void debug_object_assert_init(void *addr, const struct debug_obj_descr *descr) { + struct debug_obj o = { .object = addr, .state = ODEBUG_STATE_NOTAVAILABLE, .descr = descr }; struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; @@ -879,31 +897,20 @@ void debug_object_assert_init(void *addr, const struct debug_obj_descr *descr) db = get_bucket((unsigned long) addr); raw_spin_lock_irqsave(&db->lock, flags); + obj = lookup_object_or_alloc(addr, db, descr, false, true); + raw_spin_unlock_irqrestore(&db->lock, flags); + if (likely(!IS_ERR_OR_NULL(obj))) + return; - obj = lookup_object(addr, db); + /* If NULL the allocation has hit OOM */ if (!obj) { - struct debug_obj o = { .object = addr, - .state = ODEBUG_STATE_NOTAVAILABLE, - .descr = descr }; - - raw_spin_unlock_irqrestore(&db->lock, flags); - /* - * Maybe the object is static, and we let the type specific - * code confirm. Track this static object if true, else invoke - * fixup. - */ - if (descr->is_static_object && descr->is_static_object(addr)) { - /* Track this static object */ - debug_object_init(addr, descr); - } else { - debug_print_object(&o, "assert_init"); - debug_object_fixup(descr->fixup_assert_init, addr, - ODEBUG_STATE_NOTAVAILABLE); - } + debug_objects_oom(); return; } - raw_spin_unlock_irqrestore(&db->lock, flags); + /* Object is neither tracked nor static. It's not initialized. */ + debug_print_object(&o, "assert_init"); + debug_object_fixup(descr->fixup_assert_init, addr, ODEBUG_STATE_NOTAVAILABLE); } EXPORT_SYMBOL_GPL(debug_object_assert_init); -- cgit From 1f5f12ece722aacea1769fb644f27790ede339dc Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Apr 2023 12:10:04 +0800 Subject: maple_tree: fix a potential memory leak, OOB access, or other unpredictable bug In mas_alloc_nodes(), "node->node_count = 0" means to initialize the node_count field of the new node, but the node may not be a new node. It may be a node that existed before and node_count has a value, setting it to 0 will cause a memory leak. At this time, mas->alloc->total will be greater than the actual number of nodes in the linked list, which may cause many other errors. For example, out-of-bounds access in mas_pop_node(), and mas_pop_node() may return addresses that should not be used. Fix it by initializing node_count only for new nodes. Also, by the way, an if-else statement was removed to simplify the code. Link: https://lkml.kernel.org/r/20230411041005.26205-1-zhangpeng.00@bytedance.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Peng Zhang Reviewed-by: Liam R. 
Howlett Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index db60edb55f2f..7ff2a821a2a1 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1303,26 +1303,21 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) node = mas->alloc; node->request_count = 0; while (requested) { - max_req = MAPLE_ALLOC_SLOTS; - if (node->node_count) { - unsigned int offset = node->node_count; - - slots = (void **)&node->slot[offset]; - max_req -= offset; - } else { - slots = (void **)&node->slot; - } - + max_req = MAPLE_ALLOC_SLOTS - node->node_count; + slots = (void **)&node->slot[node->node_count]; max_req = min(requested, max_req); count = mt_alloc_bulk(gfp, max_req, slots); if (!count) goto nomem_bulk; + if (node->node_count == 0) { + node->slot[0]->node_count = 0; + node->slot[0]->request_count = 0; + } + node->node_count += count; allocated += count; node = node->slot[0]; - node->node_count = 0; - node->request_count = 0; requested -= count; } mas->alloc->total = allocated; -- cgit From d325c162631eb7c21f244f48184f3c8ace868039 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sat, 15 Apr 2023 21:57:05 +0900 Subject: fault-inject: fix build error when FAULT_INJECTION_CONFIGFS=y and CONFIGFS_FS=m This fixes a build error when CONFIG_FAULT_INJECTION_CONFIGFS=y and CONFIG_CONFIGFS_FS=m. Since the fault-injection library cannot built as a module, avoid building configfs as a module. Fixes: 4668c7a2940d ("fault-inject: allow configuration via configfs") Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202304150025.K0hczLR4-lkp@intel.com/ Signed-off-by: Akinobu Mita Signed-off-by: Jens Axboe --- lib/Kconfig.debug | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e700b29d7756..48375e027b09 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1960,7 +1960,8 @@ config FAIL_SUNRPC config FAULT_INJECTION_CONFIGFS bool "Configfs interface for fault-injection capabilities" - depends on FAULT_INJECTION && CONFIGFS_FS + depends on FAULT_INJECTION + select CONFIGFS_FS help This option allows configfs-based drivers to dynamically configure fault-injection via configfs. Each parameter for driver-specific -- cgit From 7533583e125d65bac1435410ecb56433e95eade0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2023 07:49:05 +0200 Subject: libcrc32c: remove crc32c_impl This was only ever used by btrfs, and the usage just went away. This effectively reverts df91f56adce1 ("libcrc32c: Add crc32c_impl function"). Acked-by: Herbert Xu Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- lib/libcrc32c.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'lib') diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c index 5ca0d815a95d..649e687413a0 100644 --- a/lib/libcrc32c.c +++ b/lib/libcrc32c.c @@ -65,12 +65,6 @@ static void __exit libcrc32c_mod_fini(void) crypto_free_shash(tfm); } -const char *crc32c_impl(void) -{ - return crypto_shash_driver_name(tfm); -} -EXPORT_SYMBOL(crc32c_impl); - module_init(libcrc32c_mod_init); module_exit(libcrc32c_mod_fini); -- cgit From fad8e4291da5e3243e086622df63cb952db444d8 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Fri, 14 Apr 2023 10:57:26 -0400 Subject: maple_tree: make maple state reusable after mas_empty_area_rev() Stop using maple state min/max for the range by passing through pointers for those values. This will allow the maple state to be reused without resetting. Also add some logic to fail out early on searching with invalid arguments. Link: https://lkml.kernel.org/r/20230414145728.4067069-1-Liam.Howlett@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Reported-by: Rick Edgecombe Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7ff2a821a2a1..d197b49eee67 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4965,7 +4965,8 @@ not_found: * Return: True if found in a leaf, false otherwise. * */ -static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) +static bool mas_rev_awalk(struct ma_state *mas, unsigned long size, + unsigned long *gap_min, unsigned long *gap_max) { enum maple_type type = mte_node_type(mas->node); struct maple_node *node = mas_mn(mas); @@ -5030,8 +5031,8 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) if (unlikely(ma_is_leaf(type))) { mas->offset = offset; - mas->min = min; - mas->max = min + gap - 1; + *gap_min = min; + *gap_max = min + gap - 1; return true; } @@ -5307,6 +5308,9 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long *pivots; enum maple_type mt; + if (min >= max) + return -EINVAL; + if (mas_is_start(mas)) mas_start(mas); else if (mas->offset >= 2) @@ -5361,6 +5365,9 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, { struct maple_enode *last = mas->node; + if (min >= max) + return -EINVAL; + if (mas_is_start(mas)) { mas_start(mas); mas->offset = mas_data_end(mas); @@ -5380,7 +5387,7 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, mas->index = min; mas->last = max; - while (!mas_rev_awalk(mas, size)) { + while (!mas_rev_awalk(mas, size, &min, &max)) { if (last == mas->node) { if (!mas_rewind_node(mas)) return -EBUSY; @@ -5395,17 +5402,9 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, if (unlikely(mas->offset == MAPLE_NODE_SLOTS)) return -EBUSY; - /* - * mas_rev_awalk() has set mas->min and mas->max to the gap values. If - * the maximum is outside the window we are searching, then use the last - * location in the search. - * mas->max and mas->min is the range of the gap. - * mas->index and mas->last are currently set to the search range. - */ - /* Trim the upper limit to the max. */ - if (mas->max <= mas->last) - mas->last = mas->max; + if (max <= mas->last) + mas->last = max; mas->index = mas->last - size + 1; return 0; -- cgit From 06e8fd999334bcd76b4d72d7b9206d4aea89764e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 14 Apr 2023 10:57:27 -0400 Subject: maple_tree: fix mas_empty_area() search The internal function of mas_awalk() was incorrectly skipping the last entry in a node, which could potentially be NULL. This is only a problem for the left-most node in the tree - otherwise that NULL would not exist. Fix mas_awalk() by using the metadata to obtain the end of the node for the loop and the logical pivot as apposed to the raw pivot value. Link: https://lkml.kernel.org/r/20230414145728.4067069-2-Liam.Howlett@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. 
Howlett Reported-by: Rick Edgecombe Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d197b49eee67..1281a40d5735 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5056,10 +5056,10 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) { enum maple_type type = mte_node_type(mas->node); unsigned long pivot, min, gap = 0; - unsigned char offset; - unsigned long *gaps; - unsigned long *pivots = ma_pivots(mas_mn(mas), type); - void __rcu **slots = ma_slots(mas_mn(mas), type); + unsigned char offset, data_end; + unsigned long *gaps, *pivots; + void __rcu **slots; + struct maple_node *node; bool found = false; if (ma_is_dense(type)) { @@ -5067,13 +5067,15 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) return true; } - gaps = ma_gaps(mte_to_node(mas->node), type); + node = mas_mn(mas); + pivots = ma_pivots(node, type); + slots = ma_slots(node, type); + gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); - for (; offset < mt_slots[type]; offset++) { - pivot = mas_safe_pivot(mas, pivots, offset, type); - if (offset && !pivot) - break; + data_end = ma_data_end(node, type, pivots, mas->max); + for (; offset <= data_end; offset++) { + pivot = mas_logical_pivot(mas, pivots, offset, type); /* Not within lower bounds */ if (mas->index > pivot) -- cgit From 869cb29a61a14bbc52e7bc8b18e8810874caf320 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 30 Mar 2023 21:06:39 +0200 Subject: lib/test_vmalloc.c: add vm_map_ram()/vm_unmap_ram() test case Add vm_map_ram()/vm_unmap_ram() test case to our stress test-suite. [akpm@linux-foundation.org: fix whitespace, per Lorenzo] Link: https://lkml.kernel.org/r/20230330190639.431589-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Lorenzo Stoakes Reviewed-by: Baoquan He Cc: Christoph Hellwig Cc: Dave Chinner Cc: Matthew Wilcox (Oracle) Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- lib/test_vmalloc.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'lib') diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index de4ee0d50906..84c124f097b7 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -53,6 +53,7 @@ __param(int, run_test_mask, INT_MAX, "\t\tid: 128, name: pcpu_alloc_test\n" "\t\tid: 256, name: kvfree_rcu_1_arg_vmalloc_test\n" "\t\tid: 512, name: kvfree_rcu_2_arg_vmalloc_test\n" + "\t\tid: 1024, name: vm_map_ram_test\n" /* Add a new test case description here. */ ); @@ -358,6 +359,41 @@ kvfree_rcu_2_arg_vmalloc_test(void) return 0; } +static int +vm_map_ram_test(void) +{ + unsigned long nr_allocated; + unsigned int map_nr_pages; + unsigned char *v_ptr; + struct page **pages; + int i; + + map_nr_pages = nr_pages > 0 ? nr_pages:1; + pages = kmalloc(map_nr_pages * sizeof(struct page), GFP_KERNEL); + if (!pages) + return -1; + + nr_allocated = alloc_pages_bulk_array(GFP_KERNEL, map_nr_pages, pages); + if (nr_allocated != map_nr_pages) + goto cleanup; + + /* Run the test loop. */ + for (i = 0; i < test_loop_count; i++) { + v_ptr = vm_map_ram(pages, map_nr_pages, NUMA_NO_NODE); + *v_ptr = 'a'; + vm_unmap_ram(v_ptr, map_nr_pages); + } + +cleanup: + for (i = 0; i < nr_allocated; i++) + __free_page(pages[i]); + + kfree(pages); + + /* 0 indicates success. 
*/ + return nr_allocated != map_nr_pages; +} + struct test_case_desc { const char *test_name; int (*test_func)(void); @@ -374,6 +410,7 @@ static struct test_case_desc test_case_array[] = { { "pcpu_alloc_test", pcpu_alloc_test }, { "kvfree_rcu_1_arg_vmalloc_test", kvfree_rcu_1_arg_vmalloc_test }, { "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test }, + { "vm_map_ram_test", vm_map_ram_test }, /* Add a new test case here. */ }; -- cgit From 97f7e09481f312b143db53cadbdfe81abac97e73 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 14 Mar 2023 20:42:02 +0800 Subject: maple_tree: simplify mas_wr_node_walk() Simplify code of mas_wr_node_walk() without changing functionality, and improve readability. Remove some special judgments. Instead of dynamically recording the min and max in the loop, get the final min and max directly at the end. Link: https://lkml.kernel.org/r/20230314124203.91572-3-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 34 +++++----------------------------- 1 file changed, 5 insertions(+), 29 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4a6ecdb12a92..f475bac9d914 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2312,9 +2312,7 @@ static inline struct maple_enode *mte_node_or_none(struct maple_enode *enode) static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; - unsigned char count; - unsigned char offset; - unsigned long index, min, max; + unsigned char count, offset; if (unlikely(ma_is_dense(wr_mas->type))) { wr_mas->r_max = wr_mas->r_min = mas->index; @@ -2327,34 +2325,12 @@ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) count = wr_mas->node_end = ma_data_end(wr_mas->node, wr_mas->type, wr_mas->pivots, mas->max); offset = mas->offset; - min = mas_safe_min(mas, wr_mas->pivots, offset); - if (unlikely(offset == count)) - goto max; - - max = wr_mas->pivots[offset]; - index = mas->index; - if (unlikely(index <= max)) - goto done; - - if (unlikely(!max && offset)) - goto max; - min = max + 1; - while (++offset < count) { - max = wr_mas->pivots[offset]; - if (index <= max) - goto done; - else if (unlikely(!max)) - break; - - min = max + 1; - } + while (offset < count && mas->index > wr_mas->pivots[offset]) + offset++; -max: - max = mas->max; -done: - wr_mas->r_max = max; - wr_mas->r_min = min; + wr_mas->r_max = offset < count ? wr_mas->pivots[offset] : mas->max; + wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, offset); wr_mas->offset_end = mas->offset = offset; } -- cgit From fb20e99a74f8f08c53061e0186d0c26d546dc843 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Apr 2023 10:35:13 +0800 Subject: maple_tree: use correct variable type in sizeof The type of variable pointed to by pivs is unsigned long, but the type used in sizeof is a pointer type. Change it to unsigned long. This change has no runtime effect, as sizeof(ul) == sizeof(ul *). Link: https://lkml.kernel.org/r/20230411023513.15227-1-zhangpeng.00@bytedance.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Peng Zhang Reported-by: David Binderman Reviewed-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index f475bac9d914..9172bcee94b4 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3258,7 +3258,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end if (tmp < max_p) memset(pivs + tmp, 0, - sizeof(unsigned long *) * (max_p - tmp)); + sizeof(unsigned long) * (max_p - tmp)); if (tmp < mt_slots[mt]) memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp)); -- cgit From b0687c1119b4e8c88a651b6e876b7eae28d076e3 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Tue, 4 Apr 2023 17:13:51 -0500 Subject: lib/rbtree: use '+' instead of '|' for setting color. This has a slight benefit for x86 and has no effect on other targets. The benefit to x86 is it change the codegen for setting a node to block from `mov %r0, %r1; or $RB_BLACK, %r1` to `lea RB_BLACK(%r0), %r1` which saves an instructions. In all other cases it just replace ALU with ALU (or -> and) which perform the same on all machines I am aware of. Total instructions in rbtree.o: Before - 802 After - 782 so it saves about 20 `mov` instructions. Link: https://lkml.kernel.org/r/20230404221350.3806566-1-goldstein.w.n@gmail.com Signed-off-by: Noah Goldstein Cc: Michel Lespinasse Signed-off-by: Andrew Morton --- lib/rbtree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/rbtree.c b/lib/rbtree.c index c4ac5c2421f2..5114eda6309c 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -58,7 +58,7 @@ static inline void rb_set_black(struct rb_node *rb) { - rb->__rb_parent_color |= RB_BLACK; + rb->__rb_parent_color += RB_BLACK; } static inline struct rb_node *rb_red_parent(struct rb_node *red) -- cgit From aaf0594829c3a6f16bdf5d30904a7db4548dae15 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Thu, 23 Mar 2023 13:30:33 +0800 Subject: lib/group_cpus: Export group_cpus_evenly() Export group_cpus_evenly() so that some modules can make use of it to group CPUs evenly according to NUMA and CPU locality. Signed-off-by: Xie Yongji Acked-by: Jason Wang Message-Id: <20230323053043.35-2-xieyongji@bytedance.com> Signed-off-by: Michael S. Tsirkin --- lib/group_cpus.c | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/group_cpus.c b/lib/group_cpus.c index 9c837a35fef7..aa3f6815bb12 100644 --- a/lib/group_cpus.c +++ b/lib/group_cpus.c @@ -426,3 +426,4 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) return masks; } #endif /* CONFIG_SMP */ +EXPORT_SYMBOL_GPL(group_cpus_evenly); -- cgit From 13215e8a4bb336dac2af561d4f5c34a071810ee4 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Mon, 17 Apr 2023 11:52:26 +0800 Subject: lib/show_mem.c: use for_each_populated_zone() simplify code __show_mem() needs to iterate over all zones that have memory, we can simplify the code by using for_each_populated_zone(). 
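For readers skimming the log, a minimal standalone sketch of the iterator pattern the patch adopts (it simply mirrors the hunk below; the helper name demo_count_pages is made up):

    #include <linux/mm.h>
    #include <linux/mmzone.h>
    #include <linux/printk.h>

    static void demo_count_pages(void)
    {
            struct zone *zone;
            unsigned long total = 0, reserved = 0;

            /* walks only zones with present pages, across all online nodes */
            for_each_populated_zone(zone) {
                    total += zone->present_pages;
                    reserved += zone->present_pages - zone_managed_pages(zone);
            }
            pr_info("%lu pages RAM, %lu reserved\n", total, reserved);
    }

Compared with the removed for_each_online_pgdat() loop, the zone iterator already skips unpopulated zones, so the populated_zone() check and the per-node zone index bookkeeping disappear.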
Link: https://lkml.kernel.org/r/20230417035226.4013584-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton --- lib/show_mem.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'lib') diff --git a/lib/show_mem.c b/lib/show_mem.c index 0d7585cde2a6..1485c87be935 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -10,26 +10,19 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { - pg_data_t *pgdat; unsigned long total = 0, reserved = 0, highmem = 0; + struct zone *zone; printk("Mem-Info:\n"); __show_free_areas(filter, nodemask, max_zone_idx); - for_each_online_pgdat(pgdat) { - int zoneid; + for_each_populated_zone(zone) { - for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { - struct zone *zone = &pgdat->node_zones[zoneid]; - if (!populated_zone(zone)) - continue; + total += zone->present_pages; + reserved += zone->present_pages - zone_managed_pages(zone); - total += zone->present_pages; - reserved += zone->present_pages - zone_managed_pages(zone); - - if (is_highmem_idx(zoneid)) - highmem += zone->present_pages; - } + if (is_highmem(zone)) + highmem += zone->present_pages; } printk("%lu pages RAM\n", total); -- cgit From 29ad6bb313487370f9dfe5441fc8982593b6384e Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Wed, 19 Apr 2023 17:36:25 +0800 Subject: maple_tree: fix allocation in mas_sparse_area() In the case of reverse allocation, mas->index and mas->last do not point to the correct allocation range, which will cause users to get incorrect allocation results, so fix it. If the user does not use it in a specific way, this bug will not be triggered. This is a bug, but only VMA uses it now, the way VMA is used now will not trigger it. There is a possibility that a user will trigger it in the future. Also re-check whether the size is still satisfied after the lower bound was increased, which is a corner case and is incorrect in previous versions. Link: https://lkml.kernel.org/r/20230419093625.99201-1-zhangpeng.00@bytedance.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Peng Zhang Cc: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 9172bcee94b4..110a36479dce 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5250,25 +5250,28 @@ static inline void mas_fill_gap(struct ma_state *mas, void *entry, * @size: The size of the gap * @fwd: Searching forward or back */ -static inline void mas_sparse_area(struct ma_state *mas, unsigned long min, +static inline int mas_sparse_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size, bool fwd) { - unsigned long start = 0; - - if (!unlikely(mas_is_none(mas))) - start++; + if (!unlikely(mas_is_none(mas)) && min == 0) { + min++; + /* + * At this time, min is increased, we need to recheck whether + * the size is satisfied. 
+ */ + if (min > max || max - min + 1 < size) + return -EBUSY; + } /* mas_is_ptr */ - if (start < min) - start = min; - if (fwd) { - mas->index = start; - mas->last = start + size - 1; - return; + mas->index = min; + mas->last = min + size - 1; + } else { + mas->last = max; + mas->index = max - size + 1; } - - mas->index = max; + return 0; } /* @@ -5297,10 +5300,8 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, return -EBUSY; /* Empty set */ - if (mas_is_none(mas) || mas_is_ptr(mas)) { - mas_sparse_area(mas, min, max, size, true); - return 0; - } + if (mas_is_none(mas) || mas_is_ptr(mas)) + return mas_sparse_area(mas, min, max, size, true); /* The start of the window can only be within these values */ mas->index = min; @@ -5356,10 +5357,8 @@ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, } /* Empty set. */ - if (mas_is_none(mas) || mas_is_ptr(mas)) { - mas_sparse_area(mas, min, max, size, false); - return 0; - } + if (mas_is_none(mas) || mas_is_ptr(mas)) + return mas_sparse_area(mas, min, max, size, false); /* The start of the window can only be within these values. */ mas->index = min; -- cgit From 487c20b016dc48230367a7be017f40313e53e3bd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 30 Mar 2023 14:53:51 -0700 Subject: iov: improve copy_iovec_from_user() code generation Use the same pattern as the compat version of this code does: instead of copying the whole array to a kernel buffer and then having a separate phase of verifying it, just do it one entry at a time, verifying as you go. On Jens' /dev/zero readv() test this improves performance by ~6%. [ This was obviously triggered by Jens' ITER_UBUF updates series ] Reported-and-tested-by: Jens Axboe Link: https://lore.kernel.org/all/de35d11d-bce7-e976-7372-1f2caf417103@kernel.dk/ Signed-off-by: Linus Torvalds --- lib/iov_iter.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 86a066aa9bcc..967fba189c5f 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1735,18 +1735,35 @@ uaccess_end: } static int copy_iovec_from_user(struct iovec *iov, - const struct iovec __user *uvec, unsigned long nr_segs) + const struct iovec __user *uiov, unsigned long nr_segs) { - unsigned long seg; + int ret = -EFAULT; - if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec))) + if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; - for (seg = 0; seg < nr_segs; seg++) { - if ((ssize_t)iov[seg].iov_len < 0) - return -EINVAL; - } - return 0; + do { + void __user *buf; + ssize_t len; + + unsafe_get_user(len, &uiov->iov_len, uaccess_end); + unsafe_get_user(buf, &uiov->iov_base, uaccess_end); + + /* check for size_t not fitting in ssize_t .. 
*/ + if (unlikely(len < 0)) { + ret = -EINVAL; + goto uaccess_end; + } + iov->iov_base = buf; + iov->iov_len = len; + + uiov++; iov++; + } while (--nr_segs); + + ret = 0; +uaccess_end: + user_access_end(); + return ret; } struct iovec *iovec_from_user(const struct iovec __user *uvec, @@ -1771,7 +1788,7 @@ struct iovec *iovec_from_user(const struct iovec __user *uvec, return ERR_PTR(-ENOMEM); } - if (compat) + if (unlikely(compat)) ret = copy_compat_iovec_from_user(iov, uvec, nr_segs); else ret = copy_iovec_from_user(iov, uvec, nr_segs); -- cgit From 96928d9032a7c34f12a88df879665562bcebf59a Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 15 Apr 2023 19:01:10 +0900 Subject: seq_buf: Add seq_buf_do_printk() helper Sometimes we use seq_buf to format a string buffer, which we then pass to printk(). However, in certain situations the seq_buf string buffer can get too big, exceeding the PRINTKRB_RECORD_MAX bytes limit, and causing printk() to truncate the string. Add a new seq_buf helper. This helper prints the seq_buf string buffer line by line, using \n as a delimiter, rather than passing the whole string buffer to printk() at once. Link: https://lkml.kernel.org/r/20230415100110.1419872-1-senozhatsky@chromium.org Cc: Andrew Morton Signed-off-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Tested-by: Yosry Ahmed Signed-off-by: Steven Rostedt (Google) --- lib/seq_buf.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'lib') diff --git a/lib/seq_buf.c b/lib/seq_buf.c index 0a68f7aa85d6..45c450f423fa 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -93,6 +93,38 @@ int seq_buf_printf(struct seq_buf *s, const char *fmt, ...) } EXPORT_SYMBOL_GPL(seq_buf_printf); +/** + * seq_buf_do_printk - printk seq_buf line by line + * @s: seq_buf descriptor + * @lvl: printk level + * + * printk()-s a multi-line sequential buffer line by line. The function + * makes sure that the buffer in @s is nul terminated and safe to read + * as a string. + */ +void seq_buf_do_printk(struct seq_buf *s, const char *lvl) +{ + const char *start, *lf; + + if (s->size == 0 || s->len == 0) + return; + + seq_buf_terminate(s); + + start = s->buffer; + while ((lf = strchr(start, '\n'))) { + int len = lf - start + 1; + + printk("%s%.*s", lvl, len, start); + start = ++lf; + } + + /* No trailing LF */ + if (start < s->buffer + s->len) + printk("%s%s\n", lvl, start); +} +EXPORT_SYMBOL_GPL(seq_buf_do_printk); + #ifdef CONFIG_BINARY_PRINTF /** * seq_buf_bprintf - Write the printf string from binary arguments -- cgit From 0af462f19e635ad522f28981238334620881badc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 1 May 2023 17:42:06 +0200 Subject: debugobject: Ensure pool refill (again) The recent fix to ensure atomicity of lookup and allocation inadvertently broke the pool refill mechanism. Prior to that change debug_objects_activate() and debug_objecs_assert_init() invoked debug_objecs_init() to set up the tracking object for statically initialized objects. That's not longer the case and debug_objecs_init() is now the only place which does pool refills. Depending on the number of statically initialized objects this can be enough to actually deplete the pool, which was observed by Ido via a debugobjects OOM warning. Restore the old behaviour by adding explicit refill opportunities to debug_objects_activate() and debug_objecs_assert_init(). 
Fixes: 63a759694eed ("debugobject: Prevent init race with static objects") Reported-by: Ido Schimmel Signed-off-by: Thomas Gleixner Tested-by: Ido Schimmel Link: https://lore.kernel.org/r/871qk05a9d.ffs@tglx --- lib/debugobjects.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index b796799fadb2..003edc5ebd67 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -587,6 +587,16 @@ static struct debug_obj *lookup_object_or_alloc(void *addr, struct debug_bucket return NULL; } +static void debug_objects_fill_pool(void) +{ + /* + * On RT enabled kernels the pool refill must happen in preemptible + * context: + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) + fill_pool(); +} + static void __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack) { @@ -595,12 +605,7 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack struct debug_obj *obj; unsigned long flags; - /* - * On RT enabled kernels the pool refill must happen in preemptible - * context: - */ - if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) - fill_pool(); + debug_objects_fill_pool(); db = get_bucket((unsigned long) addr); @@ -685,6 +690,8 @@ int debug_object_activate(void *addr, const struct debug_obj_descr *descr) if (!debug_objects_enabled) return 0; + debug_objects_fill_pool(); + db = get_bucket((unsigned long) addr); raw_spin_lock_irqsave(&db->lock, flags); @@ -894,6 +901,8 @@ void debug_object_assert_init(void *addr, const struct debug_obj_descr *descr) if (!debug_objects_enabled) return; + debug_objects_fill_pool(); + db = get_bucket((unsigned long) addr); raw_spin_lock_irqsave(&db->lock, flags); -- cgit From 0cce06ba859a515bd06224085d3addb870608b6d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Apr 2023 17:03:13 +0200 Subject: debugobjects,locking: Annotate debug_object_fill_pool() wait type violation There is an explicit wait-type violation in debug_object_fill_pool() for PREEMPT_RT=n kernels which allows them to more easily fill the object pool and reduce the chance of allocation failures. Lockdep's wait-type checks are designed to check the PREEMPT_RT locking rules even for PREEMPT_RT=n kernels and object to this, so create a lockdep annotation to allow this to stand. Specifically, create a 'lock' type that overrides the inner wait-type while it is held -- allowing one to temporarily raise it, such that the violation is hidden. Reported-by: Vlastimil Babka Reported-by: Qi Zheng Signed-off-by: Peter Zijlstra (Intel) Tested-by: Qi Zheng Link: https://lkml.kernel.org/r/20230429100614.GA1489784@hirez.programming.kicks-ass.net --- lib/debugobjects.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 003edc5ebd67..826c617b10a7 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -591,10 +591,21 @@ static void debug_objects_fill_pool(void) { /* * On RT enabled kernels the pool refill must happen in preemptible - * context: + * context -- for !RT kernels we rely on the fact that spinlock_t and + * raw_spinlock_t are basically the same type and this lock-type + * inversion works just fine. 
*/ - if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) + if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { + /* + * Annotate away the spinlock_t inside raw_spinlock_t warning + * by temporarily raising the wait-type to WAIT_SLEEP, matching + * the preemptible() condition above. + */ + static DEFINE_WAIT_OVERRIDE_MAP(fill_pool_map, LD_WAIT_SLEEP); + lock_map_acquire_try(&fill_pool_map); fill_pool(); + lock_map_release(&fill_pool_map); + } } static void -- cgit From 245f0922689364b21163af4937a05ea0ba576fae Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 17 Apr 2023 12:53:23 +0800 Subject: mm: hwpoison: coredump: support recovery from dump_user_range() dump_user_range() is used to copy the user page to a coredump file, but if a hardware memory error occurred during copy, which called from __kernel_write_iter() in dump_user_range(), it crashes, CPU: 112 PID: 7014 Comm: mca-recover Not tainted 6.3.0-rc2 #425 pc : __memcpy+0x110/0x260 lr : _copy_from_iter+0x3bc/0x4c8 ... Call trace: __memcpy+0x110/0x260 copy_page_from_iter+0xcc/0x130 pipe_write+0x164/0x6d8 __kernel_write_iter+0x9c/0x210 dump_user_range+0xc8/0x1d8 elf_core_dump+0x308/0x368 do_coredump+0x2e8/0xa40 get_signal+0x59c/0x788 do_signal+0x118/0x1f8 do_notify_resume+0xf0/0x280 el0_da+0x130/0x138 el0t_64_sync_handler+0x68/0xc0 el0t_64_sync+0x188/0x190 Generally, the '->write_iter' of file ops will use copy_page_from_iter() and copy_page_from_iter_atomic(), change memcpy() to copy_mc_to_kernel() in both of them to handle #MC during source read, which stop coredump processing and kill the task instead of kernel panic, but the source address may not always a user address, so introduce a new copy_mc flag in struct iov_iter{} to indicate that the iter could do a safe memory copy, also introduce the helpers to set/cleck the flag, for now, it's only used in coredump's dump_user_range(), but it could expand to any other scenarios to fix the similar issue. 
Link: https://lkml.kernel.org/r/20230417045323.11054-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Alexander Viro Cc: Christian Brauner Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Tong Tiangen Cc: Jens Axboe Signed-off-by: Andrew Morton --- lib/iov_iter.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/iov_iter.c b/lib/iov_iter.c index c3dbe994112c..960223ed9199 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -434,6 +434,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_IOVEC, + .copy_mc = false, .nofault = false, .user_backed = true, .data_source = direction, @@ -630,6 +631,14 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) EXPORT_SYMBOL_GPL(_copy_mc_to_iter); #endif /* CONFIG_ARCH_HAS_COPY_MC */ +static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from, + size_t size) +{ + if (iov_iter_is_copy_mc(i)) + return (void *)copy_mc_to_kernel(to, from, size); + return memcpy(to, from, size); +} + size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) @@ -639,7 +648,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) might_fault(); iterate_and_advance(i, bytes, base, len, off, copyin(addr + off, base, len), - memcpy(addr + off, base, len) + memcpy_from_iter(i, addr + off, base, len) ) return bytes; @@ -862,7 +871,7 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t byt } iterate_and_advance(i, bytes, base, len, off, copyin(p + off, base, len), - memcpy(p + off, base, len) + memcpy_from_iter(i, p + off, base, len) ) kunmap_atomic(kaddr); return bytes; @@ -1043,6 +1052,7 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_KVEC, + .copy_mc = false, .data_source = direction, .kvec = kvec, .nr_segs = nr_segs, @@ -1059,6 +1069,7 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction, WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_BVEC, + .copy_mc = false, .data_source = direction, .bvec = bvec, .nr_segs = nr_segs, @@ -1105,6 +1116,7 @@ void iov_iter_xarray(struct iov_iter *i, unsigned int direction, BUG_ON(direction & ~1); *i = (struct iov_iter) { .iter_type = ITER_XARRAY, + .copy_mc = false, .data_source = direction, .xarray = xarray, .xarray_start = start, @@ -1128,6 +1140,7 @@ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) BUG_ON(direction != READ); *i = (struct iov_iter){ .iter_type = ITER_DISCARD, + .copy_mc = false, .data_source = false, .count = count, .iov_offset = 0 -- cgit From 162bd18eb55adf464a0fa2b4144b8d61c75ff7c2 Mon Sep 17 00:00:00 2001 From: Roy Novich Date: Sun, 7 May 2023 16:57:43 +0300 Subject: linux/dim: Do nothing if no time delta between samples Add return value for dim_calc_stats. This is an indication for the caller if curr_stats was assigned by the function. Avoid using curr_stats uninitialized over {rdma/net}_dim, when no time delta between samples. Coverity reported this potential use of an uninitialized variable. 
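The caller side lives outside this lib/ diff, but it roughly follows the sketch below: build a single-page bvec iterator, mark it, and let the write path copy through copy_mc_to_kernel(). The setter name iov_iter_set_copy_mc() is inferred from the iov_iter_is_copy_mc() checks visible in the hunk that follows, and dump_page_mc() is a made-up helper, so treat this as an approximation rather than the exact fs/coredump.c change:

    #include <linux/bvec.h>
    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/uio.h>

    static ssize_t dump_page_mc(struct file *file, struct page *page, loff_t *pos)
    {
            struct bio_vec bvec;
            struct iov_iter iter;

            bvec_set_page(&bvec, page, PAGE_SIZE, 0);
            iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
            /* a #MC while reading the page now fails the write instead of panicking */
            iov_iter_set_copy_mc(&iter);
            return __kernel_write_iter(file, &iter, pos);
    }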
Fixes: 4c4dbb4a7363 ("net/mlx5e: Move dynamic interrupt coalescing code to include/linux") Fixes: cb3c7fd4f839 ("net/mlx5e: Support adaptive RX coalescing") Signed-off-by: Roy Novich Reviewed-by: Aya Levin Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Reviewed-by: Leon Romanovsky Reviewed-by: Michal Kubiak Link: https://lore.kernel.org/r/20230507135743.138993-1-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- lib/dim/dim.c | 5 +++-- lib/dim/net_dim.c | 3 ++- lib/dim/rdma_dim.c | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/dim/dim.c b/lib/dim/dim.c index 38045d6d0538..e89aaf07bde5 100644 --- a/lib/dim/dim.c +++ b/lib/dim/dim.c @@ -54,7 +54,7 @@ void dim_park_tired(struct dim *dim) } EXPORT_SYMBOL(dim_park_tired); -void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, +bool dim_calc_stats(struct dim_sample *start, struct dim_sample *end, struct dim_stats *curr_stats) { /* u32 holds up to 71 minutes, should be enough */ @@ -66,7 +66,7 @@ void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, start->comp_ctr); if (!delta_us) - return; + return false; curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us); curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us); @@ -79,5 +79,6 @@ void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, else curr_stats->cpe_ratio = 0; + return true; } EXPORT_SYMBOL(dim_calc_stats); diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c index 53f6b9c6e936..4e32f7aaac86 100644 --- a/lib/dim/net_dim.c +++ b/lib/dim/net_dim.c @@ -227,7 +227,8 @@ void net_dim(struct dim *dim, struct dim_sample end_sample) dim->start_sample.event_ctr); if (nevents < DIM_NEVENTS) break; - dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats); + if (!dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats)) + break; if (net_dim_decision(&curr_stats, dim)) { dim->state = DIM_APPLY_NEW_PROFILE; schedule_work(&dim->work); diff --git a/lib/dim/rdma_dim.c b/lib/dim/rdma_dim.c index 15462d54758d..88f779486707 100644 --- a/lib/dim/rdma_dim.c +++ b/lib/dim/rdma_dim.c @@ -88,7 +88,8 @@ void rdma_dim(struct dim *dim, u64 completions) nevents = curr_sample->event_ctr - dim->start_sample.event_ctr; if (nevents < DIM_NEVENTS) break; - dim_calc_stats(&dim->start_sample, curr_sample, &curr_stats); + if (!dim_calc_stats(&dim->start_sample, curr_sample, &curr_stats)) + break; if (rdma_dim_decision(&curr_stats, dim)) { dim->state = DIM_APPLY_NEW_PROFILE; schedule_work(&dim->work); -- cgit From 0257d9908d38c0b1669af4bb1bc4dbca1f273fe6 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 5 May 2023 22:58:29 +0800 Subject: maple_tree: make maple state reusable after mas_empty_area() Make mas->min and mas->max point to a node range instead of a leaf entry range. This allows mas to still be usable after mas_empty_area() returns. Users would get unexpected results from other operations on the maple state after calling the affected function. For example, x86 MAP_32BIT mmap() acts as if there is no suitable gap when there should be one. Link: https://lkml.kernel.org/r/20230505145829.74574-1-zhangpeng.00@bytedance.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Peng Zhang Reported-by: "Edgecombe, Rick P" Reported-by: Tad Reported-by: Michael Keyes Link: https://lore.kernel.org/linux-mm/32f156ba80010fd97dbaf0a0cdfc84366608624d.camel@intel.com/ Link: https://lore.kernel.org/linux-mm/e6108286ac025c268964a7ead3aab9899f9bc6e9.camel@spotco.us/ Reviewed-by: Liam R. 
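For context, the samples being diffed come from the driver's interrupt/NAPI path, along the lines of the sketch below (the demo_rxq structure and its counters are hypothetical). When two consecutive samples land in the same microsecond, delta_us is zero; with this patch dim_calc_stats() reports that no statistics were produced and net_dim()/rdma_dim() simply wait for the next event instead of acting on an uninitialized dim_stats:

    #include <linux/dim.h>

    struct demo_rxq {
            struct dim dim;
            u16 events;
            u64 packets;
            u64 bytes;
    };

    static void demo_rx_napi_done(struct demo_rxq *rxq)
    {
            struct dim_sample sample;

            /* snapshot the counters; net_dim() diffs it against the previous sample */
            dim_update_sample(rxq->events++, rxq->packets, rxq->bytes, &sample);
            net_dim(&rxq->dim, sample);
    }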
Howlett Tested-by: Rick Edgecombe Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 110a36479dce..8ebc43d4cc8c 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5317,15 +5317,9 @@ int mas_empty_area(struct ma_state *mas, unsigned long min, mt = mte_node_type(mas->node); pivots = ma_pivots(mas_mn(mas), mt); - if (offset) - mas->min = pivots[offset - 1] + 1; - - if (offset < mt_pivots[mt]) - mas->max = pivots[offset]; - - if (mas->index < mas->min) - mas->index = mas->min; - + min = mas_safe_min(mas, pivots, offset); + if (mas->index < min) + mas->index = min; mas->last = mas->index + size - 1; return 0; } -- cgit From eb799279fb1f9c63c520fe8c1c41cb9154252db6 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 11 May 2023 22:47:32 +0900 Subject: debugobjects: Don't wake up kswapd from fill_pool() syzbot is reporting a lockdep warning in fill_pool() because the allocation from debugobjects is using GFP_ATOMIC, which is (__GFP_HIGH | __GFP_KSWAPD_RECLAIM) and therefore tries to wake up kswapd, which acquires kswapd_wait::lock. Since fill_pool() might be called with arbitrary locks held, fill_pool() should not assume that acquiring kswapd_wait::lock is safe. Use __GFP_HIGH instead and remove __GFP_NORETRY as it is pointless for !__GFP_DIRECT_RECLAIM allocation. Fixes: 3ac7fe5a4aab ("infrastructure to debug (dynamic) objects") Reported-by: syzbot Signed-off-by: Tetsuo Handa Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/6577e1fa-b6ee-f2be-2414-a2b51b1c5e30@I-love.SAKURA.ne.jp Closes: https://syzkaller.appspot.com/bug?extid=fe0c72f0ccbb93786380 --- lib/debugobjects.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 826c617b10a7..984985c39c9b 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -126,7 +126,7 @@ static const char *obj_states[ODEBUG_STATE_MAX] = { static void fill_pool(void) { - gfp_t gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; + gfp_t gfp = __GFP_HIGH | __GFP_NOWARN; struct debug_obj *obj; unsigned long flags; -- cgit From 4acfe3dfde685a5a9eaec5555351918e2d7266a1 Mon Sep 17 00:00:00 2001 From: Mirsad Goran Todorovac Date: Tue, 9 May 2023 10:47:45 +0200 Subject: test_firmware: prevent race conditions by a correct implementation of locking Dan Carpenter spotted a race condition in a couple of situations like these in the test_firmware driver: static int test_dev_config_update_u8(const char *buf, size_t size, u8 *cfg) { u8 val; int ret; ret = kstrtou8(buf, 10, &val); if (ret) return ret; mutex_lock(&test_fw_mutex); *(u8 *)cfg = val; mutex_unlock(&test_fw_mutex); /* Always return full write size even if we didn't consume all */ return size; } static ssize_t config_num_requests_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int rc; mutex_lock(&test_fw_mutex); if (test_fw_config->reqs) { pr_err("Must call release_all_firmware prior to changing config\n"); rc = -EINVAL; mutex_unlock(&test_fw_mutex); goto out; } mutex_unlock(&test_fw_mutex); rc = test_dev_config_update_u8(buf, count, &test_fw_config->num_requests); out: return rc; } static ssize_t config_read_fw_idx_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { return test_dev_config_update_u8(buf, count, &test_fw_config->read_fw_idx); } The function test_dev_config_update_u8() is called from 
both the locked and the unlocked context: config_num_requests_store() and config_read_fw_idx_store() are driver methods that can be called asynchronously, while test_dev_config_update_u8() and its siblings modify the value their u8 *cfg (or similar) argument points to. To avoid a deadlock on test_fw_mutex, the lock was dropped before calling test_dev_config_update_u8() and re-acquired within test_dev_config_update_u8() itself, but this creates a race condition: two separate critical sections do not provide mutual exclusion over the whole check-and-update sequence. The situation is best avoided by introducing a new, unlocked function __test_dev_config_update_u8() which can be called with the lock already held, and reducing test_dev_config_update_u8() to: static int test_dev_config_update_u8(const char *buf, size_t size, u8 *cfg) { int ret; mutex_lock(&test_fw_mutex); ret = __test_dev_config_update_u8(buf, size, cfg); mutex_unlock(&test_fw_mutex); return ret; } which takes the lock and calls the unlocked primitive, giving both locked and unlocked versions without duplicating code. The same approach was applied to all functions called from both locked and unlocked contexts, which safely mitigates both the deadlocks and the race conditions in the driver. The unlocked variants __test_dev_config_update_bool(), __test_dev_config_update_u8() and __test_dev_config_update_size_t() were introduced so they can be called from locked contexts without releasing the main driver lock and thereby opening a race window. The locked variants test_dev_config_update_bool(), test_dev_config_update_u8() and test_dev_config_update_size_t() are still called from driver methods, avoiding duplicated lock/unlock code in every method and the complication of carrying the return value across the lock.
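For illustration, a minimal self-contained sketch of the locked-wrapper/unlocked-core split follows. The names (example_mutex, example_update_u8(), example_store()) are hypothetical stand-ins for the driver's test_fw_mutex helpers; this is a sketch of the pattern, not the driver code itself.

/*
 * Hypothetical sketch, not the driver code: the "__" helper assumes the
 * caller already holds the mutex, the plain wrapper takes it itself.
 */
#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/types.h>

static DEFINE_MUTEX(example_mutex);
static u8 example_cfg;
static bool example_busy;

/* Caller must hold example_mutex. */
static int __example_update_u8(const char *buf, size_t size, u8 *cfg)
{
	u8 val;
	int ret;

	ret = kstrtou8(buf, 10, &val);
	if (ret)
		return ret;

	*cfg = val;
	return size;	/* always report the full write size, as the driver does */
}

/* Safe to call without the lock held. */
static int example_update_u8(const char *buf, size_t size, u8 *cfg)
{
	int ret;

	mutex_lock(&example_mutex);
	ret = __example_update_u8(buf, size, cfg);
	mutex_unlock(&example_mutex);

	return ret;
}

/* A store method that must test state and update in one critical section. */
static ssize_t example_store(const char *buf, size_t count)
{
	ssize_t rc;

	mutex_lock(&example_mutex);
	if (example_busy)
		rc = -EBUSY;
	else
		rc = __example_update_u8(buf, count, &example_cfg);
	mutex_unlock(&example_mutex);

	return rc;
}

Prefixing the lock-free primitive with a double underscore and documenting that the caller must hold the mutex is common kernel practice; it keeps each store method to a single critical section instead of two back-to-back ones.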
Fixes: 7feebfa487b92 ("test_firmware: add support for request_firmware_into_buf") Cc: Luis Chamberlain Cc: Greg Kroah-Hartman Cc: Russ Weight Cc: Takashi Iwai Cc: Tianfei Zhang Cc: Shuah Khan Cc: Colin Ian King Cc: Randy Dunlap Cc: linux-kselftest@vger.kernel.org Cc: stable@vger.kernel.org # v5.4 Suggested-by: Dan Carpenter Signed-off-by: Mirsad Goran Todorovac Link: https://lore.kernel.org/r/20230509084746.48259-1-mirsad.todorovac@alu.unizg.hr Signed-off-by: Greg Kroah-Hartman --- lib/test_firmware.c | 52 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 17 deletions(-) (limited to 'lib') diff --git a/lib/test_firmware.c b/lib/test_firmware.c index 05ed84c2fc4c..35417e0af3f4 100644 --- a/lib/test_firmware.c +++ b/lib/test_firmware.c @@ -353,16 +353,26 @@ static ssize_t config_test_show_str(char *dst, return len; } -static int test_dev_config_update_bool(const char *buf, size_t size, +static inline int __test_dev_config_update_bool(const char *buf, size_t size, bool *cfg) { int ret; - mutex_lock(&test_fw_mutex); if (kstrtobool(buf, cfg) < 0) ret = -EINVAL; else ret = size; + + return ret; +} + +static int test_dev_config_update_bool(const char *buf, size_t size, + bool *cfg) +{ + int ret; + + mutex_lock(&test_fw_mutex); + ret = __test_dev_config_update_bool(buf, size, cfg); mutex_unlock(&test_fw_mutex); return ret; @@ -373,7 +383,8 @@ static ssize_t test_dev_config_show_bool(char *buf, bool val) return snprintf(buf, PAGE_SIZE, "%d\n", val); } -static int test_dev_config_update_size_t(const char *buf, +static int __test_dev_config_update_size_t( + const char *buf, size_t size, size_t *cfg) { @@ -384,9 +395,7 @@ static int test_dev_config_update_size_t(const char *buf, if (ret) return ret; - mutex_lock(&test_fw_mutex); *(size_t *)cfg = new; - mutex_unlock(&test_fw_mutex); /* Always return full write size even if we didn't consume all */ return size; @@ -402,7 +411,7 @@ static ssize_t test_dev_config_show_int(char *buf, int val) return snprintf(buf, PAGE_SIZE, "%d\n", val); } -static int test_dev_config_update_u8(const char *buf, size_t size, u8 *cfg) +static int __test_dev_config_update_u8(const char *buf, size_t size, u8 *cfg) { u8 val; int ret; @@ -411,14 +420,23 @@ static int test_dev_config_update_u8(const char *buf, size_t size, u8 *cfg) if (ret) return ret; - mutex_lock(&test_fw_mutex); *(u8 *)cfg = val; - mutex_unlock(&test_fw_mutex); /* Always return full write size even if we didn't consume all */ return size; } +static int test_dev_config_update_u8(const char *buf, size_t size, u8 *cfg) +{ + int ret; + + mutex_lock(&test_fw_mutex); + ret = __test_dev_config_update_u8(buf, size, cfg); + mutex_unlock(&test_fw_mutex); + + return ret; +} + static ssize_t test_dev_config_show_u8(char *buf, u8 val) { return snprintf(buf, PAGE_SIZE, "%u\n", val); @@ -471,10 +489,10 @@ static ssize_t config_num_requests_store(struct device *dev, mutex_unlock(&test_fw_mutex); goto out; } - mutex_unlock(&test_fw_mutex); - rc = test_dev_config_update_u8(buf, count, - &test_fw_config->num_requests); + rc = __test_dev_config_update_u8(buf, count, + &test_fw_config->num_requests); + mutex_unlock(&test_fw_mutex); out: return rc; @@ -518,10 +536,10 @@ static ssize_t config_buf_size_store(struct device *dev, mutex_unlock(&test_fw_mutex); goto out; } - mutex_unlock(&test_fw_mutex); - rc = test_dev_config_update_size_t(buf, count, - &test_fw_config->buf_size); + rc = __test_dev_config_update_size_t(buf, count, + &test_fw_config->buf_size); + mutex_unlock(&test_fw_mutex); out: 
return rc; @@ -548,10 +566,10 @@ static ssize_t config_file_offset_store(struct device *dev, mutex_unlock(&test_fw_mutex); goto out; } - mutex_unlock(&test_fw_mutex); - rc = test_dev_config_update_size_t(buf, count, - &test_fw_config->file_offset); + rc = __test_dev_config_update_size_t(buf, count, + &test_fw_config->file_offset); + mutex_unlock(&test_fw_mutex); out: return rc; -- cgit From be37bed754ed90b2655382f93f9724b3c1aae847 Mon Sep 17 00:00:00 2001 From: Mirsad Goran Todorovac Date: Tue, 9 May 2023 10:47:47 +0200 Subject: test_firmware: fix a memory leak with reqs buffer Dan Carpenter spotted that test_fw_config->reqs will be leaked if trigger_batched_requests_store() is called two or more times. The same appears with trigger_batched_requests_async_store(). This bug wasn't trigger by the tests, but observed by Dan's visual inspection of the code. The recommended workaround was to return -EBUSY if test_fw_config->reqs is already allocated. Fixes: 7feebfa487b92 ("test_firmware: add support for request_firmware_into_buf") Cc: Luis Chamberlain Cc: Greg Kroah-Hartman Cc: Russ Weight Cc: Tianfei Zhang Cc: Shuah Khan Cc: Colin Ian King Cc: Randy Dunlap Cc: linux-kselftest@vger.kernel.org Cc: stable@vger.kernel.org # v5.4 Suggested-by: Dan Carpenter Suggested-by: Takashi Iwai Signed-off-by: Mirsad Goran Todorovac Reviewed-by: Dan Carpenter Acked-by: Luis Chamberlain Link: https://lore.kernel.org/r/20230509084746.48259-2-mirsad.todorovac@alu.unizg.hr Signed-off-by: Greg Kroah-Hartman --- lib/test_firmware.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'lib') diff --git a/lib/test_firmware.c b/lib/test_firmware.c index 35417e0af3f4..91b232ed3161 100644 --- a/lib/test_firmware.c +++ b/lib/test_firmware.c @@ -913,6 +913,11 @@ static ssize_t trigger_batched_requests_store(struct device *dev, mutex_lock(&test_fw_mutex); + if (test_fw_config->reqs) { + rc = -EBUSY; + goto out_bail; + } + test_fw_config->reqs = vzalloc(array3_size(sizeof(struct test_batched_req), test_fw_config->num_requests, 2)); @@ -1011,6 +1016,11 @@ ssize_t trigger_batched_requests_async_store(struct device *dev, mutex_lock(&test_fw_mutex); + if (test_fw_config->reqs) { + rc = -EBUSY; + goto out_bail; + } + test_fw_config->reqs = vzalloc(array3_size(sizeof(struct test_batched_req), test_fw_config->num_requests, 2)); -- cgit From 48e156023059e57a8fc68b498439832f7600ffff Mon Sep 17 00:00:00 2001 From: Mirsad Goran Todorovac Date: Tue, 9 May 2023 10:47:49 +0200 Subject: test_firmware: fix the memory leak of the allocated firmware buffer The following kernel memory leak was noticed after running tools/testing/selftests/firmware/fw_run_tests.sh: [root@pc-mtodorov firmware]# cat /sys/kernel/debug/kmemleak . . . unreferenced object 0xffff955389bc3400 (size 1024): comm "test_firmware-0", pid 5451, jiffies 4294944822 (age 65.652s) hex dump (first 32 bytes): 47 48 34 35 36 37 0a 00 00 00 00 00 00 00 00 00 GH4567.......... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] slab_post_alloc_hook+0x8c/0x3c0 [] __kmem_cache_alloc_node+0x184/0x240 [] kmalloc_trace+0x2e/0xc0 [] test_fw_run_batch_request+0x9d/0x180 [] kthread+0x10b/0x140 [] ret_from_fork+0x29/0x50 unreferenced object 0xffff9553c334b400 (size 1024): comm "test_firmware-1", pid 5452, jiffies 4294944822 (age 65.652s) hex dump (first 32 bytes): 47 48 34 35 36 37 0a 00 00 00 00 00 00 00 00 00 GH4567.......... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
backtrace: [] slab_post_alloc_hook+0x8c/0x3c0 [] __kmem_cache_alloc_node+0x184/0x240 [] kmalloc_trace+0x2e/0xc0 [] test_fw_run_batch_request+0x9d/0x180 [] kthread+0x10b/0x140 [] ret_from_fork+0x29/0x50 unreferenced object 0xffff9553c334f000 (size 1024): comm "test_firmware-2", pid 5453, jiffies 4294944822 (age 65.652s) hex dump (first 32 bytes): 47 48 34 35 36 37 0a 00 00 00 00 00 00 00 00 00 GH4567.......... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] slab_post_alloc_hook+0x8c/0x3c0 [] __kmem_cache_alloc_node+0x184/0x240 [] kmalloc_trace+0x2e/0xc0 [] test_fw_run_batch_request+0x9d/0x180 [] kthread+0x10b/0x140 [] ret_from_fork+0x29/0x50 unreferenced object 0xffff9553c3348400 (size 1024): comm "test_firmware-3", pid 5454, jiffies 4294944822 (age 65.652s) hex dump (first 32 bytes): 47 48 34 35 36 37 0a 00 00 00 00 00 00 00 00 00 GH4567.......... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] slab_post_alloc_hook+0x8c/0x3c0 [] __kmem_cache_alloc_node+0x184/0x240 [] kmalloc_trace+0x2e/0xc0 [] test_fw_run_batch_request+0x9d/0x180 [] kthread+0x10b/0x140 [] ret_from_fork+0x29/0x50 [root@pc-mtodorov firmware]# Note that the size 1024 corresponds to the size of the test firmware buffer. The actual number of the buffers leaked is around 70-110, depending on the test run. The cause of the leak is the following: request_partial_firmware_into_buf() and request_firmware_into_buf() provided firmware buffer isn't released on release_firmware(), we have allocated it and we are responsible for deallocating it manually. This is introduced in a number of context where previously only release_firmware() was called, which was insufficient. Reported-by: Mirsad Goran Todorovac Fixes: 7feebfa487b92 ("test_firmware: add support for request_firmware_into_buf") Cc: Greg Kroah-Hartman Cc: Dan Carpenter Cc: Takashi Iwai Cc: Luis Chamberlain Cc: Russ Weight Cc: Tianfei zhang Cc: Christophe JAILLET Cc: Zhengchao Shao Cc: Colin Ian King Cc: linux-kernel@vger.kernel.org Cc: Kees Cook Cc: Scott Branden Cc: Luis R. 
Rodriguez Cc: linux-kselftest@vger.kernel.org Cc: stable@vger.kernel.org # v5.4 Signed-off-by: Mirsad Goran Todorovac Link: https://lore.kernel.org/r/20230509084746.48259-3-mirsad.todorovac@alu.unizg.hr Signed-off-by: Greg Kroah-Hartman --- lib/test_firmware.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test_firmware.c b/lib/test_firmware.c index 91b232ed3161..1d7d480b8eeb 100644 --- a/lib/test_firmware.c +++ b/lib/test_firmware.c @@ -45,6 +45,7 @@ struct test_batched_req { bool sent; const struct firmware *fw; const char *name; + const char *fw_buf; struct completion completion; struct task_struct *task; struct device *dev; @@ -175,8 +176,14 @@ static void __test_release_all_firmware(void) for (i = 0; i < test_fw_config->num_requests; i++) { req = &test_fw_config->reqs[i]; - if (req->fw) + if (req->fw) { + if (req->fw_buf) { + kfree_const(req->fw_buf); + req->fw_buf = NULL; + } release_firmware(req->fw); + req->fw = NULL; + } } vfree(test_fw_config->reqs); @@ -670,6 +677,8 @@ static ssize_t trigger_request_store(struct device *dev, mutex_lock(&test_fw_mutex); release_firmware(test_firmware); + if (test_fw_config->reqs) + __test_release_all_firmware(); test_firmware = NULL; rc = request_firmware(&test_firmware, name, dev); if (rc) { @@ -770,6 +779,8 @@ static ssize_t trigger_async_request_store(struct device *dev, mutex_lock(&test_fw_mutex); release_firmware(test_firmware); test_firmware = NULL; + if (test_fw_config->reqs) + __test_release_all_firmware(); rc = request_firmware_nowait(THIS_MODULE, 1, name, dev, GFP_KERNEL, NULL, trigger_async_request_cb); if (rc) { @@ -812,6 +823,8 @@ static ssize_t trigger_custom_fallback_store(struct device *dev, mutex_lock(&test_fw_mutex); release_firmware(test_firmware); + if (test_fw_config->reqs) + __test_release_all_firmware(); test_firmware = NULL; rc = request_firmware_nowait(THIS_MODULE, FW_ACTION_NOUEVENT, name, dev, GFP_KERNEL, NULL, @@ -874,6 +887,8 @@ static int test_fw_run_batch_request(void *data) test_fw_config->buf_size); if (!req->fw) kfree(test_buf); + else + req->fw_buf = test_buf; } else { req->rc = test_fw_config->req_firmware(&req->fw, req->name, @@ -934,6 +949,7 @@ static ssize_t trigger_batched_requests_store(struct device *dev, req->fw = NULL; req->idx = i; req->name = test_fw_config->name; + req->fw_buf = NULL; req->dev = dev; init_completion(&req->completion); req->task = kthread_run(test_fw_run_batch_request, req, @@ -1038,6 +1054,7 @@ ssize_t trigger_batched_requests_async_store(struct device *dev, for (i = 0; i < test_fw_config->num_requests; i++) { req = &test_fw_config->reqs[i]; req->name = test_fw_config->name; + req->fw_buf = NULL; req->fw = NULL; req->idx = i; init_completion(&req->completion); -- cgit From 7c5d4801ecf0564c860033d89726b99723c55146 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 2 Jun 2023 20:28:15 +0200 Subject: lib: cpu_rmap: Fix potential use-after-free in irq_cpu_rmap_release() irq_cpu_rmap_release() calls cpu_rmap_put(), which may free the rmap. So we need to clear the pointer to our glue structure in rmap before doing that, not after. 
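Reduced to a hypothetical, self-contained sketch (the struct and function names below are illustrative, not the cpu_rmap types), the ordering rule is: undo every store into the refcounted object while it is still guaranteed to be alive, then drop the reference that may free it.

/*
 * Hypothetical sketch, not the cpu_rmap code: clear back-pointers into a
 * refcounted object before dropping the reference that may free it.
 */
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct owner {
	struct kref refcount;
	void *slots[16];
};

struct glue {
	struct owner *owner;
	unsigned int index;
};

static void owner_release(struct kref *ref)
{
	kfree(container_of(ref, struct owner, refcount));
}

static void glue_release(struct glue *glue)
{
	/* The owner is still alive here: safe to write into it. */
	glue->owner->slots[glue->index] = NULL;

	/* This may drop the last reference and free the owner... */
	kref_put(&glue->owner->refcount, owner_release);

	/* ...so nothing after this point may dereference glue->owner. */
	kfree(glue);
}

Reversing the two statements in glue_release() reintroduces the use-after-free, since kref_put() may invoke the release callback and free the owner before the back-pointer is cleared.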
Fixes: 4e0473f1060a ("lib: cpu_rmap: Avoid use after free on rmap->obj array entries") Signed-off-by: Ben Hutchings Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/ZHo0vwquhOy3FaXc@decadent.org.uk Signed-off-by: Jakub Kicinski --- lib/cpu_rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c index 73c1636b927b..4c348670da31 100644 --- a/lib/cpu_rmap.c +++ b/lib/cpu_rmap.c @@ -280,8 +280,8 @@ static void irq_cpu_rmap_release(struct kref *ref) struct irq_glue *glue = container_of(ref, struct irq_glue, notify.kref); - cpu_rmap_put(glue->rmap); glue->rmap->obj[glue->index] = NULL; + cpu_rmap_put(glue->rmap); kfree(glue); } -- cgit From bde1597d0f045149a254b0c2ec6f029c82e459d5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 May 2023 21:41:54 +0200 Subject: radix-tree: move declarations to header The xarray.c file contains the only call to radix_tree_node_rcu_free(), and it comes with its own extern declaration for it. This means the function definition causes a missing-prototype warning: lib/radix-tree.c:288:6: error: no previous prototype for 'radix_tree_node_rcu_free' [-Werror=missing-prototypes] Instead, move the declaration for this function to a new header that can be included by both, and do the same for the radix_tree_node_cachep variable that has the same underlying problem but does not cause a warning with gcc. [zhangpeng.00@bytedance.com: fix building radix tree test suite] Link: https://lkml.kernel.org/r/20230521095450.21332-1-zhangpeng.00@bytedance.com Link: https://lkml.kernel.org/r/20230516194212.548910-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: Peng Zhang Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- lib/radix-tree.c | 2 ++ lib/radix-tree.h | 8 ++++++++ lib/xarray.c | 6 ++---- 3 files changed, 12 insertions(+), 4 deletions(-) create mode 100644 lib/radix-tree.h (limited to 'lib') diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 049ba132f7ef..1a31065b2036 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -27,6 +27,8 @@ #include #include +#include "radix-tree.h" + /* * Radix tree node cache. */ diff --git a/lib/radix-tree.h b/lib/radix-tree.h new file mode 100644 index 000000000000..40d5c03e2b09 --- /dev/null +++ b/lib/radix-tree.h @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* radix-tree helpers that are only shared with xarray */ + +struct kmem_cache; +struct rcu_head; + +extern struct kmem_cache *radix_tree_node_cachep; +extern void radix_tree_node_rcu_free(struct rcu_head *head); diff --git a/lib/xarray.c b/lib/xarray.c index ea9ce1f0b386..2071a3718f4e 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -12,6 +12,8 @@ #include #include +#include "radix-tree.h" + /* * Coding conventions in this file: * @@ -247,10 +249,6 @@ void *xas_load(struct xa_state *xas) } EXPORT_SYMBOL_GPL(xas_load); -/* Move the radix tree node cache here */ -extern struct kmem_cache *radix_tree_node_cachep; -extern void radix_tree_node_rcu_free(struct rcu_head *head); - #define XA_RCU_FREE ((struct xarray *)1) static void xa_node_free(struct xa_node *node) -- cgit From 9f6c6ad161f1af37548a6b80fb15710998ccfd1e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 24 May 2023 09:24:24 +0100 Subject: lib/test_vmalloc.c: avoid garbage in page array It turns out that alloc_pages_bulk_array() does not treat the page_array parameter as an output parameter, but rather reads the array and skips any entries that have already been allocated. 
This is somewhat unexpected and breaks this test, as we allocate the pages array uninitialised on the assumption it will be overwritten. As a result, the test was referencing uninitialised data and causing the PFN to not be valid and thus a WARN_ON() followed by a null pointer deref and panic. In addition, this is an array of pointers not of struct page objects, so we need only allocate an array with elements of pointer size. We solve both problems by simply using kcalloc() and referencing sizeof(struct page *) rather than sizeof(struct page). Link: https://lkml.kernel.org/r/20230524082424.10022-1-lstoakes@gmail.com Fixes: 869cb29a61a1 ("lib/test_vmalloc.c: add vm_map_ram()/vm_unmap_ram() test case") Signed-off-by: Lorenzo Stoakes Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- lib/test_vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 9dd9745d365f..3718d9886407 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -369,7 +369,7 @@ vm_map_ram_test(void) int i; map_nr_pages = nr_pages > 0 ? nr_pages:1; - pages = kmalloc(map_nr_pages * sizeof(struct page), GFP_KERNEL); + pages = kcalloc(map_nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) return -1; -- cgit
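As a closing illustration of the test_vmalloc fix, here is a hypothetical, minimal sketch of the corrected allocation pattern; map_some_pages() is an invented helper, not part of the test module. The two points it shows are allocating an array of struct page pointers (not struct page objects) and zero-initializing it with kcalloc() so alloc_pages_bulk_array() does not mistake garbage for already-populated entries.

/*
 * Hypothetical sketch, not the test module: map_some_pages() exists only to
 * show the corrected allocation pattern.
 */
#include <linux/gfp.h>
#include <linux/numa.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *map_some_pages(unsigned int nr)
{
	struct page **pages;
	void *vaddr = NULL;

	/* Array of pointers, zeroed: sizeof(struct page *), not sizeof(struct page). */
	pages = kcalloc(nr, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return NULL;

	/* alloc_pages_bulk_array() skips non-NULL slots, so the array must start zeroed. */
	if (alloc_pages_bulk_array(GFP_KERNEL, nr, pages) == nr)
		vaddr = vm_map_ram(pages, nr, NUMA_NO_NODE);

	/* Teardown on the error paths (freeing pages, vm_unmap_ram()) is omitted here. */
	kfree(pages);
	return vaddr;
}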