Diffstat (limited to 'kernel')
31 files changed, 823 insertions, 409 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 1224dd937df0..422270d64820 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -148,6 +148,17 @@ config CRASH_DM_CRYPT_CONFIGS CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that is required to be built-in. +config CRASH_DUMP_KUNIT_TEST + tristate "Unit Tests for kernel crash dumps" if !KUNIT_ALL_TESTS + depends on CRASH_DUMP && KUNIT + default KUNIT_ALL_TESTS + help + This option builds KUnit unit tests for kernel crash dumps. The unit + tests will be used to verify the correctness of covered functions and + also prevent any regression. + + If unsure, say N. + config CRASH_HOTPLUG bool "Update the crash elfcorehdr on system configuration changes" default y diff --git a/kernel/Makefile b/kernel/Makefile index 41751834e764..df3dd8291bb6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -78,6 +78,7 @@ obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_CRASH_DUMP) += crash_core.o obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o +obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o diff --git a/kernel/acct.c b/kernel/acct.c index 6520baa13669..61630110e29d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -44,19 +44,14 @@ * a struct file opened for write. Fixed. 2/6/2000, AV. */ -#include <linux/mm.h> #include <linux/slab.h> #include <linux/acct.h> #include <linux/capability.h> -#include <linux/file.h> #include <linux/tty.h> -#include <linux/security.h> -#include <linux/vfs.h> +#include <linux/statfs.h> #include <linux/jiffies.h> -#include <linux/times.h> #include <linux/syscalls.h> -#include <linux/mount.h> -#include <linux/uaccess.h> +#include <linux/namei.h> #include <linux/sched/cputime.h> #include <asm/div64.h> @@ -217,84 +212,70 @@ static void close_work(struct work_struct *work) complete(&acct->done); } -static int acct_on(struct filename *pathname) +DEFINE_FREE(fput_sync, struct file *, if (!IS_ERR_OR_NULL(_T)) __fput_sync(_T)) +static int acct_on(const char __user *name) { - struct file *file; - struct vfsmount *mnt, *internal; + /* Difference from BSD - they don't do O_APPEND */ + const int open_flags = O_WRONLY|O_APPEND|O_LARGEFILE; struct pid_namespace *ns = task_active_pid_ns(current); + struct filename *pathname __free(putname) = getname(name); + struct file *original_file __free(fput) = NULL; // in that order + struct path internal __free(path_put) = {}; // in that order + struct file *file __free(fput_sync) = NULL; // in that order struct bsd_acct_struct *acct; + struct vfsmount *mnt; struct fs_pin *old; - int err; - acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); - if (!acct) - return -ENOMEM; + if (IS_ERR(pathname)) + return PTR_ERR(pathname); + original_file = file_open_name(pathname, open_flags, 0); + if (IS_ERR(original_file)) + return PTR_ERR(original_file); - /* Difference from BSD - they don't do O_APPEND */ - file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); - if (IS_ERR(file)) { - kfree(acct); + mnt = mnt_clone_internal(&original_file->f_path); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + internal.mnt = mnt; + internal.dentry = dget(mnt->mnt_root); + + file = dentry_open(&internal, open_flags, current_cred()); + if (IS_ERR(file)) return PTR_ERR(file); - } - if (!S_ISREG(file_inode(file)->i_mode)) { - kfree(acct); - filp_close(file, NULL); + if (!S_ISREG(file_inode(file)->i_mode)) 
return -EACCES; - } /* Exclude kernel kernel internal filesystems. */ - if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) { - kfree(acct); - filp_close(file, NULL); + if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) return -EINVAL; - } /* Exclude procfs and sysfs. */ - if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) { - kfree(acct); - filp_close(file, NULL); + if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) return -EINVAL; - } - if (!(file->f_mode & FMODE_CAN_WRITE)) { - kfree(acct); - filp_close(file, NULL); + if (!(file->f_mode & FMODE_CAN_WRITE)) return -EIO; - } - internal = mnt_clone_internal(&file->f_path); - if (IS_ERR(internal)) { - kfree(acct); - filp_close(file, NULL); - return PTR_ERR(internal); - } - err = mnt_get_write_access(internal); - if (err) { - mntput(internal); - kfree(acct); - filp_close(file, NULL); - return err; - } - mnt = file->f_path.mnt; - file->f_path.mnt = internal; + + acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); + if (!acct) + return -ENOMEM; atomic_long_set(&acct->count, 1); init_fs_pin(&acct->pin, acct_pin_kill); - acct->file = file; + acct->file = no_free_ptr(file); acct->needcheck = jiffies; acct->ns = ns; mutex_init(&acct->lock); INIT_WORK(&acct->work, close_work); init_completion(&acct->done); mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ - pin_insert(&acct->pin, mnt); + pin_insert(&acct->pin, original_file->f_path.mnt); rcu_read_lock(); old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); - mnt_put_write_access(mnt); - mntput(mnt); return 0; } @@ -319,14 +300,9 @@ SYSCALL_DEFINE1(acct, const char __user *, name) return -EPERM; if (name) { - struct filename *tmp = getname(name); - - if (IS_ERR(tmp)) - return PTR_ERR(tmp); mutex_lock(&acct_on_mutex); - error = acct_on(tmp); + error = acct_on(name); mutex_unlock(&acct_on_mutex); - putname(tmp); } else { rcu_read_lock(); pin_kill(task_active_pid_ns(current)->bacct); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 1605df0a171e..fda6beb041e0 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -680,7 +680,7 @@ void audit_trim_trees(void) struct audit_tree *tree; struct path path; struct audit_node *node; - struct path *paths; + const struct path *paths; struct path array[16]; int err; @@ -703,7 +703,7 @@ void audit_trim_trees(void) struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... 
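The rewritten acct_on() above leans on the scope-based cleanup helpers from <linux/cleanup.h>: __free() attaches a cleanup action that runs on every early return, DEFINE_FREE() declares a new cleanup class (as done for fput_sync in the hunk), and no_free_ptr() hands ownership out so the cleanup is skipped. A minimal sketch of the pattern, assuming hypothetical struct foo, foo_init() and publish_foo():

#include <linux/cleanup.h>
#include <linux/slab.h>

/* Illustrative only, not part of the patch. */
static int example_create(void)
{
	struct foo *p __free(kfree) = kzalloc(sizeof(*p), GFP_KERNEL);

	if (!p)
		return -ENOMEM;
	if (foo_init(p))
		return -EINVAL;		/* p is kfree()d automatically here */

	publish_foo(no_free_ptr(p));	/* keep it: cleanup is disarmed */
	return 0;
}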
*/ node->index |= 1U<<31; - for (struct path *p = paths; p->dentry; p++) { + for (const struct path *p = paths; p->dentry; p++) { struct inode *inode = p->dentry->d_inode; if (inode_to_key(inode) == chunk->key) { node->index &= ~(1U<<31); @@ -742,9 +742,9 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } -static int tag_mounts(struct path *paths, struct audit_tree *tree) +static int tag_mounts(const struct path *paths, struct audit_tree *tree) { - for (struct path *p = paths; p->dentry; p++) { + for (const struct path *p = paths; p->dentry; p++) { int err = tag_chunk(p->dentry->d_inode, tree); if (err) return err; @@ -807,7 +807,7 @@ int audit_add_tree_rule(struct audit_krule *rule) struct audit_tree *seed = rule->tree, *tree; struct path path; struct path array[16]; - struct path *paths; + const struct path *paths; int err; rule->tree = NULL; @@ -879,7 +879,7 @@ int audit_tag_tree(char *old, char *new) int failed = 0; struct path path1, path2; struct path array[16]; - struct path *paths; + const struct path *paths; int err; err = kern_path(new, 0, &path2); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 73bba397672a..ff40e5e65c43 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15645,7 +15645,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - if (opcode == BPF_NEG) { + if (opcode == BPF_NEG && + regs[insn->dst_reg].type == SCALAR_VALUE) { err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); err = err ?: adjust_scalar_min_max_vals(env, insn, ®s[insn->dst_reg], @@ -15803,7 +15804,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else { /* all other ALU ops: and, sub, xor, add, ... */ if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0 || insn->off > 1 || + if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) || (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; @@ -15813,7 +15814,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; } else { - if (insn->src_reg != BPF_REG_0 || insn->off > 1 || + if (insn->src_reg != BPF_REG_0 || (insn->off != 0 && insn->off != 1) || (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; diff --git a/kernel/crash_core.c b/kernel/crash_core.c index a4ef79591eb2..3b1c43382eec 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -22,6 +22,7 @@ #include <linux/btf.h> #include <linux/objtool.h> #include <linux/delay.h> +#include <linux/panic.h> #include <asm/page.h> #include <asm/sections.h> @@ -143,17 +144,7 @@ STACK_FRAME_NON_STANDARD(__crash_kexec); __bpf_kfunc void crash_kexec(struct pt_regs *regs) { - int old_cpu, this_cpu; - - /* - * Only one CPU is allowed to execute the crash_kexec() code as with - * panic(). Otherwise parallel calls of panic() and crash_kexec() - * may stop each other. To exclude them, we use panic_cpu here too. - */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + if (panic_try_start()) { /* This is the 1st CPU which comes here, so go ahead. */ __crash_kexec(regs); @@ -161,7 +152,7 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs) * Reset panic_cpu to allow another panic()/crash_kexec() * call. 
*/ - atomic_set(&panic_cpu, PANIC_CPU_INVALID); + panic_reset(); } } @@ -274,6 +265,20 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, return 0; } +/** + * crash_exclude_mem_range - exclude a mem range for existing ranges + * @mem: mem->range contains an array of ranges sorted in ascending order + * @mstart: the start of to-be-excluded range + * @mend: the start of to-be-excluded range + * + * If you are unsure if a range split will happen, to avoid function call + * failure because of -ENOMEM, always make sure + * mem->max_nr_ranges == mem->nr_ranges + 1 + * before calling the function each time. + * + * returns 0 if a memory range is excluded successfully + * return -ENOMEM if mem->ranges doesn't have space to hold split ranges + */ int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend) { @@ -333,6 +338,7 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } +EXPORT_SYMBOL_GPL(crash_exclude_mem_range); ssize_t crash_get_memory_size(void) { diff --git a/kernel/crash_core_test.c b/kernel/crash_core_test.c new file mode 100644 index 000000000000..8aadf6801530 --- /dev/null +++ b/kernel/crash_core_test.c @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <kunit/test.h> +#include <linux/crash_core.h> // For struct crash_mem and struct range if defined there + +// Helper to create and initialize crash_mem +static struct crash_mem *create_crash_mem(struct kunit *test, unsigned int max_ranges, + unsigned int nr_initial_ranges, + const struct range *initial_ranges) +{ + struct crash_mem *mem; + size_t alloc_size; + + // Check if max_ranges can even hold initial_ranges + if (max_ranges < nr_initial_ranges) { + kunit_err(test, "max_ranges (%u) < nr_initial_ranges (%u)\n", + max_ranges, nr_initial_ranges); + return NULL; + } + + alloc_size = sizeof(struct crash_mem) + (size_t)max_ranges * sizeof(struct range); + mem = kunit_kzalloc(test, alloc_size, GFP_KERNEL); + if (!mem) { + kunit_err(test, "Failed to allocate crash_mem\n"); + return NULL; + } + + mem->max_nr_ranges = max_ranges; + mem->nr_ranges = nr_initial_ranges; + if (initial_ranges && nr_initial_ranges > 0) { + memcpy(mem->ranges, initial_ranges, + nr_initial_ranges * sizeof(struct range)); + } + + return mem; +} + +// Helper to compare ranges for assertions +static void assert_ranges_equal(struct kunit *test, + const struct range *actual_ranges, + unsigned int actual_nr_ranges, + const struct range *expected_ranges, + unsigned int expected_nr_ranges, + const char *case_name) +{ + unsigned int i; + + KUNIT_ASSERT_EQ_MSG(test, expected_nr_ranges, actual_nr_ranges, + "%s: Number of ranges mismatch.", case_name); + + for (i = 0; i < expected_nr_ranges; i++) { + KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].start, actual_ranges[i].start, + "%s: Range %u start mismatch.", case_name, i); + KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].end, actual_ranges[i].end, + "%s: Range %u end mismatch.", case_name, i); + } +} + +// Structure for test parameters +struct exclude_test_param { + const char *description; + unsigned long long exclude_start; + unsigned long long exclude_end; + unsigned int initial_max_ranges; + const struct range *initial_ranges; + unsigned int initial_nr_ranges; + const struct range *expected_ranges; + unsigned int expected_nr_ranges; + int expected_ret; +}; + +static void run_exclude_test_case(struct kunit *test, const struct exclude_test_param *params) +{ + struct crash_mem *mem; + int ret; + + kunit_info(test, "%s", 
params->description); + + mem = create_crash_mem(test, params->initial_max_ranges, + params->initial_nr_ranges, params->initial_ranges); + if (!mem) + return; // Error already logged by create_crash_mem or kunit_kzalloc + + ret = crash_exclude_mem_range(mem, params->exclude_start, params->exclude_end); + + KUNIT_ASSERT_EQ_MSG(test, params->expected_ret, ret, + "%s: Return value mismatch.", params->description); + + if (params->expected_ret == 0) { + assert_ranges_equal(test, mem->ranges, mem->nr_ranges, + params->expected_ranges, params->expected_nr_ranges, + params->description); + } else { + // If an error is expected, nr_ranges might still be relevant to check + // depending on the exact point of failure. For ENOMEM on split, + // nr_ranges shouldn't have changed. + KUNIT_ASSERT_EQ_MSG(test, params->initial_nr_ranges, + mem->nr_ranges, + "%s: Number of ranges mismatch on error.", + params->description); + } +} + +/* + * Test Strategy 1: One to-be-excluded range A and one existing range B. + * + * Exhaust all possibilities of the position of A regarding B. + */ + +static const struct range single_range_b = { .start = 100, .end = 199 }; + +static const struct exclude_test_param exclude_single_range_test_data[] = { + { + .description = "1.1: A is left of B, no overlap", + .exclude_start = 10, .exclude_end = 50, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.2: A's right boundary touches B's left boundary", + .exclude_start = 10, .exclude_end = 99, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.3: A overlaps B's left part", + .exclude_start = 50, .exclude_end = 149, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 150, .end = 199 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.4: A is completely inside B", + .exclude_start = 120, .exclude_end = 179, + .initial_max_ranges = 2, // Needs space for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){ + { .start = 100, .end = 119 }, + { .start = 180, .end = 199 } + }, + .expected_nr_ranges = 2, + .expected_ret = 0, + }, + { + .description = "1.5: A overlaps B's right part", + .exclude_start = 150, .exclude_end = 249, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 100, .end = 149 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.6: A's left boundary touches B's right boundary", + .exclude_start = 200, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.7: A is right of B, no overlap", + .exclude_start = 250, .exclude_end = 300, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.8: A completely covers B and extends beyond", + .exclude_start = 50, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = 
&single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.9: A covers B and extends to the left", + .exclude_start = 50, .exclude_end = 199, // A ends exactly where B ends + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.10: A covers B and extends to the right", + .exclude_start = 100, .exclude_end = 250, // A starts exactly where B starts + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.11: A is identical to B", + .exclude_start = 100, .exclude_end = 199, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.12: A is a point, left of B, no overlap", + .exclude_start = 10, .exclude_end = 10, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.13: A is a point, at start of B", + .exclude_start = 100, .exclude_end = 100, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 101, .end = 199 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.14: A is a point, in middle of B (causes split)", + .exclude_start = 150, .exclude_end = 150, + .initial_max_ranges = 2, // Needs space for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){ + { .start = 100, .end = 149 }, + { .start = 151, .end = 199 } + }, + .expected_nr_ranges = 2, + .expected_ret = 0, + }, + { + .description = "1.15: A is a point, at end of B", + .exclude_start = 199, .exclude_end = 199, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 100, .end = 198 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.16: A is a point, right of B, no overlap", + .exclude_start = 250, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + // ENOMEM case for single range split + { + .description = "1.17: A completely inside B (split), no space (ENOMEM)", + .exclude_start = 120, .exclude_end = 179, + .initial_max_ranges = 1, // Not enough for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content + .expected_nr_ranges = 1, // Should remain unchanged + .expected_ret = -ENOMEM, + }, +}; + + +static void exclude_single_range_test(struct kunit *test) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(exclude_single_range_test_data); i++) { + kunit_log(KERN_INFO, test, "Running: %s", exclude_single_range_test_data[i].description); + run_exclude_test_case(test, &exclude_single_range_test_data[i]); + // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case + } +} + +/* + * Test Strategy 2: Regression test. 
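The kernel-doc added to crash_exclude_mem_range() above spells out the caller contract exercised by these tests: keep one spare slot (mem->max_nr_ranges == mem->nr_ranges + 1) so a possible range split cannot fail with -ENOMEM. A minimal caller sketch under that assumption, mirroring regression case 2.1 below (illustrative, not part of the test suite):

#include <linux/crash_core.h>
#include <linux/sizes.h>

/* Illustrative only. */
static int exclude_low_1m(struct crash_mem *cmem)
{
	if (cmem->max_nr_ranges < cmem->nr_ranges + 1)
		return -ENOMEM;		/* caller must leave headroom */

	return crash_exclude_mem_range(cmem, 0, SZ_1M - 1);
}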
+ */ + +static const struct exclude_test_param exclude_range_regression_test_data[] = { + // Test data from commit a2e9a95d2190 + { + .description = "2.1: exclude low 1M", + .exclude_start = 0, .exclude_end = (1 << 20) - 1, + .initial_max_ranges = 3, + .initial_ranges = (const struct range[]){ + { .start = 0, .end = 0x3efff }, + { .start = 0x3f000, .end = 0x3ffff }, + { .start = 0x40000, .end = 0x9ffff } + }, + .initial_nr_ranges = 3, + .expected_nr_ranges = 0, + .expected_ret = 0, + }, + // Test data from https://lore.kernel.org/all/ZXrY7QbXAlxydsSC@MiWiFi-R3L-srv/T/#u + { + .description = "2.2: when range out of bound", + .exclude_start = 100, .exclude_end = 200, + .initial_max_ranges = 3, + .initial_ranges = (const struct range[]){ + { .start = 1, .end = 299 }, + { .start = 401, .end = 1000 }, + { .start = 1001, .end = 2000 } + }, + .initial_nr_ranges = 3, + .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content + .expected_nr_ranges = 3, // Should remain unchanged + .expected_ret = -ENOMEM + }, + +}; + + +static void exclude_range_regression_test(struct kunit *test) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(exclude_range_regression_test_data); i++) { + kunit_log(KERN_INFO, test, "Running: %s", exclude_range_regression_test_data[i].description); + run_exclude_test_case(test, &exclude_range_regression_test_data[i]); + // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case + } +} + +/* + * KUnit Test Suite + */ +static struct kunit_case crash_exclude_mem_range_test_cases[] = { + KUNIT_CASE(exclude_single_range_test), + KUNIT_CASE(exclude_range_regression_test), + {} +}; + +static struct kunit_suite crash_exclude_mem_range_suite = { + .name = "crash_exclude_mem_range_tests", + .test_cases = crash_exclude_mem_range_test_cases, + // .init and .exit can be NULL if not needed globally for the suite +}; + +kunit_test_suite(crash_exclude_mem_range_suite); + +MODULE_DESCRIPTION("crash dump KUnit test suite"); +MODULE_LICENSE("GPL"); diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index b82399437db0..1e5c64cb6a42 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -38,8 +38,8 @@ enum { dma_debug_single, dma_debug_sg, dma_debug_coherent, - dma_debug_resource, dma_debug_noncoherent, + dma_debug_phy, }; enum map_err_types { @@ -141,8 +141,8 @@ static const char *type2name[] = { [dma_debug_single] = "single", [dma_debug_sg] = "scatter-gather", [dma_debug_coherent] = "coherent", - [dma_debug_resource] = "resource", [dma_debug_noncoherent] = "noncoherent", + [dma_debug_phy] = "phy", }; static const char *dir2name[] = { @@ -1054,17 +1054,16 @@ static void check_unmap(struct dma_debug_entry *ref) dma_entry_free(entry); } -static void check_for_stack(struct device *dev, - struct page *page, size_t offset) +static void check_for_stack(struct device *dev, phys_addr_t phys) { void *addr; struct vm_struct *stack_vm_area = task_stack_vm_area(current); if (!stack_vm_area) { /* Stack is direct-mapped. 
*/ - if (PageHighMem(page)) + if (PhysHighMem(phys)) return; - addr = page_address(page) + offset; + addr = phys_to_virt(phys); if (object_is_on_stack(addr)) err_printk(dev, NULL, "device driver maps memory from stack [addr=%p]\n", addr); } else { @@ -1072,10 +1071,12 @@ static void check_for_stack(struct device *dev, int i; for (i = 0; i < stack_vm_area->nr_pages; i++) { - if (page != stack_vm_area->pages[i]) + if (__phys_to_pfn(phys) != + page_to_pfn(stack_vm_area->pages[i])) continue; - addr = (u8 *)current->stack + i * PAGE_SIZE + offset; + addr = (u8 *)current->stack + i * PAGE_SIZE + + (phys % PAGE_SIZE); err_printk(dev, NULL, "device driver maps memory from stack [probable addr=%p]\n", addr); break; } @@ -1204,9 +1205,8 @@ void debug_dma_map_single(struct device *dev, const void *addr, } EXPORT_SYMBOL(debug_dma_map_single); -void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, - size_t size, int direction, dma_addr_t dma_addr, - unsigned long attrs) +void debug_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + int direction, dma_addr_t dma_addr, unsigned long attrs) { struct dma_debug_entry *entry; @@ -1221,19 +1221,18 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, return; entry->dev = dev; - entry->type = dma_debug_single; - entry->paddr = page_to_phys(page) + offset; + entry->type = dma_debug_phy; + entry->paddr = phys; entry->dev_addr = dma_addr; entry->size = size; entry->direction = direction; entry->map_err_type = MAP_ERR_NOT_CHECKED; - check_for_stack(dev, page, offset); + if (!(attrs & DMA_ATTR_MMIO)) { + check_for_stack(dev, phys); - if (!PageHighMem(page)) { - void *addr = page_address(page) + offset; - - check_for_illegal_area(dev, addr, size); + if (!PhysHighMem(phys)) + check_for_illegal_area(dev, phys_to_virt(phys), size); } add_dma_entry(entry, attrs); @@ -1277,11 +1276,11 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) } EXPORT_SYMBOL(debug_dma_mapping_error); -void debug_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, +void debug_dma_unmap_phys(struct device *dev, dma_addr_t dma_addr, size_t size, int direction) { struct dma_debug_entry ref = { - .type = dma_debug_single, + .type = dma_debug_phy, .dev = dev, .dev_addr = dma_addr, .size = size, @@ -1305,7 +1304,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, return; for_each_sg(sg, s, nents, i) { - check_for_stack(dev, sg_page(s), s->offset); + check_for_stack(dev, sg_phys(s)); if (!PageHighMem(sg_page(s))) check_for_illegal_area(dev, sg_virt(s), s->length); } @@ -1445,47 +1444,6 @@ void debug_dma_free_coherent(struct device *dev, size_t size, check_unmap(&ref); } -void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, - int direction, dma_addr_t dma_addr, - unsigned long attrs) -{ - struct dma_debug_entry *entry; - - if (unlikely(dma_debug_disabled())) - return; - - entry = dma_entry_alloc(); - if (!entry) - return; - - entry->type = dma_debug_resource; - entry->dev = dev; - entry->paddr = addr; - entry->size = size; - entry->dev_addr = dma_addr; - entry->direction = direction; - entry->map_err_type = MAP_ERR_NOT_CHECKED; - - add_dma_entry(entry, attrs); -} - -void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) -{ - struct dma_debug_entry ref = { - .type = dma_debug_resource, - .dev = dev, - .dev_addr = dma_addr, - .size = size, - .direction = direction, - }; - - if (unlikely(dma_debug_disabled())) - return; - - 
check_unmap(&ref); -} - void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) { diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index 48757ca13f31..da7be0bddcf6 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -9,12 +9,11 @@ #define _KERNEL_DMA_DEBUG_H #ifdef CONFIG_DMA_API_DEBUG -extern void debug_dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - int direction, dma_addr_t dma_addr, +extern void debug_dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, int direction, dma_addr_t dma_addr, unsigned long attrs); -extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, +extern void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, int direction); extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, @@ -31,14 +30,6 @@ extern void debug_dma_alloc_coherent(struct device *dev, size_t size, extern void debug_dma_free_coherent(struct device *dev, size_t size, void *virt, dma_addr_t addr); -extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr, - size_t size, int direction, - dma_addr_t dma_addr, - unsigned long attrs); - -extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction); - extern void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction); @@ -62,14 +53,13 @@ extern void debug_dma_free_pages(struct device *dev, struct page *page, size_t size, int direction, dma_addr_t dma_addr); #else /* CONFIG_DMA_API_DEBUG */ -static inline void debug_dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - int direction, dma_addr_t dma_addr, - unsigned long attrs) +static inline void debug_dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, int direction, + dma_addr_t dma_addr, unsigned long attrs) { } -static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, +static inline void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, int direction) { } @@ -97,19 +87,6 @@ static inline void debug_dma_free_coherent(struct device *dev, size_t size, { } -static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr, - size_t size, int direction, - dma_addr_t dma_addr, - unsigned long attrs) -{ -} - -static inline void debug_dma_unmap_resource(struct device *dev, - dma_addr_t dma_addr, size_t size, - int direction) -{ -} - static inline void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 24c359d9c879..1f9ee9759426 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -120,7 +120,7 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, bool allow_highmem) { int node = dev_to_node(dev); - struct page *page = NULL; + struct page *page; u64 phys_limit; WARN_ON_ONCE(!PAGE_ALIGNED(size)); @@ -131,30 +131,25 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit); page = dma_alloc_contiguous(dev, size, gfp); if (page) { - if (!dma_coherent_ok(dev, page_to_phys(page), size) || - (!allow_highmem && PageHighMem(page))) { - dma_free_contiguous(dev, page, size); - page = NULL; - } + if (dma_coherent_ok(dev, page_to_phys(page), size) && + (allow_highmem || !PageHighMem(page))) + return page; + + 
dma_free_contiguous(dev, page, size); } -again: - if (!page) - page = alloc_pages_node(node, gfp, get_order(size)); - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + + while ((page = alloc_pages_node(node, gfp, get_order(size))) + && !dma_coherent_ok(dev, page_to_phys(page), size)) { __free_pages(page, get_order(size)); - page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && phys_limit < DMA_BIT_MASK(64) && - !(gfp & (GFP_DMA32 | GFP_DMA))) { + !(gfp & (GFP_DMA32 | GFP_DMA))) gfp |= GFP_DMA32; - goto again; - } - - if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { + else if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) gfp = (gfp & ~GFP_DMA32) | GFP_DMA; - goto again; - } + else + return NULL; } return page; @@ -453,7 +448,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, if (sg_dma_is_bus_address(sg)) sg_dma_unmark_bus_address(sg); else - dma_direct_unmap_page(dev, sg->dma_address, + dma_direct_unmap_phys(dev, sg->dma_address, sg_dma_len(sg), dir, attrs); } } @@ -476,8 +471,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, */ break; case PCI_P2PDMA_MAP_NONE: - sg->dma_address = dma_direct_map_page(dev, sg_page(sg), - sg->offset, sg->length, dir, attrs); + sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg), + sg->length, dir, attrs); if (sg->dma_address == DMA_MAPPING_ERROR) { ret = -EIO; goto out_unmap; @@ -502,22 +497,6 @@ out_unmap: return ret; } -dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - dma_addr_t dma_addr = paddr; - - if (unlikely(!dma_capable(dev, dma_addr, size, false))) { - dev_err_once(dev, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - WARN_ON_ONCE(1); - return DMA_MAPPING_ERROR; - } - - return dma_addr; -} - int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index d2c0b7e632fc..da2fadf45bcd 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -80,42 +80,57 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, arch_dma_mark_clean(paddr, size); } -static inline dma_addr_t dma_direct_map_page(struct device *dev, - struct page *page, unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) +static inline dma_addr_t dma_direct_map_phys(struct device *dev, + phys_addr_t phys, size_t size, enum dma_data_direction dir, + unsigned long attrs) { - phys_addr_t phys = page_to_phys(page) + offset; - dma_addr_t dma_addr = phys_to_dma(dev, phys); + dma_addr_t dma_addr; if (is_swiotlb_force_bounce(dev)) { - if (is_pci_p2pdma_page(page)) - return DMA_MAPPING_ERROR; + if (attrs & DMA_ATTR_MMIO) + goto err_overflow; + return swiotlb_map(dev, phys, size, dir, attrs); } - if (unlikely(!dma_capable(dev, dma_addr, size, true)) || - dma_kmalloc_needs_bounce(dev, size, dir)) { - if (is_pci_p2pdma_page(page)) - return DMA_MAPPING_ERROR; - if (is_swiotlb_active(dev)) - return swiotlb_map(dev, phys, size, dir, attrs); - - dev_WARN_ONCE(dev, 1, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - return DMA_MAPPING_ERROR; + if (attrs & DMA_ATTR_MMIO) { + dma_addr = phys; + if (unlikely(!dma_capable(dev, dma_addr, size, false))) + goto err_overflow; + } else { + dma_addr = 
phys_to_dma(dev, phys); + if (unlikely(!dma_capable(dev, dma_addr, size, true)) || + dma_kmalloc_needs_bounce(dev, size, dir)) { + if (is_swiotlb_active(dev)) + return swiotlb_map(dev, phys, size, dir, attrs); + + goto err_overflow; + } } - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + if (!dev_is_dma_coherent(dev) && + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) arch_sync_dma_for_device(phys, size, dir); return dma_addr; + +err_overflow: + dev_WARN_ONCE( + dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; } -static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, +static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t phys = dma_to_phys(dev, addr); + phys_addr_t phys; + + if (attrs & DMA_ATTR_MMIO) + /* nothing to do: uncached and no swiotlb */ + return; + phys = dma_to_phys(dev, addr); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 56de28a3b179..fe7472f13b10 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -152,11 +152,11 @@ static inline bool dma_map_direct(struct device *dev, return dma_go_direct(dev, *dev->dma_mask, ops); } -dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, - size_t offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); + bool is_mmio = attrs & DMA_ATTR_MMIO; dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); @@ -165,36 +165,81 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, return DMA_MAPPING_ERROR; if (dma_map_direct(dev, ops) || - arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size)) - addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + (!is_mmio && arch_dma_map_phys_direct(dev, phys + size))) + addr = dma_direct_map_phys(dev, phys, size, dir, attrs); else if (use_dma_iommu(dev)) - addr = iommu_dma_map_page(dev, page, offset, size, dir, attrs); - else + addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); + else if (is_mmio) { + if (!ops->map_resource) + return DMA_MAPPING_ERROR; + + addr = ops->map_resource(dev, phys, size, dir, attrs); + } else { + struct page *page = phys_to_page(phys); + size_t offset = offset_in_page(phys); + + /* + * The dma_ops API contract for ops->map_page() requires + * kmappable memory, while ops->map_resource() does not. 
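On the caller side, the same distinction is expressed through DMA_ATTR_MMIO: kmappable RAM goes through dma_map_phys() without the attribute, while MMIO ranges (for example a PCI BAR) set it, and dma_map_resource() below becomes a thin wrapper that adds it. A hedged driver-side sketch of the new interface, with the physical addresses taken as given:

#include <linux/dma-mapping.h>

/* Illustrative only, not part of the patch. */
static int map_two_regions(struct device *dev, phys_addr_t ram_phys,
			   phys_addr_t bar_phys, size_t len)
{
	dma_addr_t ram_dma, bar_dma;

	/* Ordinary kmappable RAM: cache maintenance and swiotlb may apply. */
	ram_dma = dma_map_phys(dev, ram_phys, len, DMA_TO_DEVICE, 0);
	if (dma_mapping_error(dev, ram_dma))
		return -ENOMEM;

	/* MMIO: no kmap, no bounce buffering, no CPU cache sync. */
	bar_dma = dma_map_phys(dev, bar_phys, len, DMA_FROM_DEVICE,
			       DMA_ATTR_MMIO);
	if (dma_mapping_error(dev, bar_dma)) {
		dma_unmap_phys(dev, ram_dma, len, DMA_TO_DEVICE, 0);
		return -ENOMEM;
	}

	/* ... program the device ... */

	dma_unmap_phys(dev, bar_dma, len, DMA_FROM_DEVICE, DMA_ATTR_MMIO);
	dma_unmap_phys(dev, ram_dma, len, DMA_TO_DEVICE, 0);
	return 0;
}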
+ */ addr = ops->map_page(dev, page, offset, size, dir, attrs); - kmsan_handle_dma(page, offset, size, dir); - trace_dma_map_page(dev, page_to_phys(page) + offset, addr, size, dir, - attrs); - debug_dma_map_page(dev, page, offset, size, dir, addr, attrs); + } + + if (!is_mmio) + kmsan_handle_dma(phys, size, dir); + trace_dma_map_phys(dev, phys, addr, size, dir, attrs); + debug_dma_map_phys(dev, phys, size, dir, addr, attrs); return addr; } +EXPORT_SYMBOL_GPL(dma_map_phys); + +dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + phys_addr_t phys = page_to_phys(page) + offset; + + if (unlikely(attrs & DMA_ATTR_MMIO)) + return DMA_MAPPING_ERROR; + + if (IS_ENABLED(CONFIG_DMA_API_DEBUG) && + WARN_ON_ONCE(is_zone_device_page(page))) + return DMA_MAPPING_ERROR; + + return dma_map_phys(dev, phys, size, dir, attrs); +} EXPORT_SYMBOL(dma_map_page_attrs); -void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, +void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); + bool is_mmio = attrs & DMA_ATTR_MMIO; BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops) || - arch_dma_unmap_page_direct(dev, addr + size)) - dma_direct_unmap_page(dev, addr, size, dir, attrs); + (!is_mmio && arch_dma_unmap_phys_direct(dev, addr + size))) + dma_direct_unmap_phys(dev, addr, size, dir, attrs); else if (use_dma_iommu(dev)) - iommu_dma_unmap_page(dev, addr, size, dir, attrs); - else + iommu_dma_unmap_phys(dev, addr, size, dir, attrs); + else if (is_mmio) { + if (ops->unmap_resource) + ops->unmap_resource(dev, addr, size, dir, attrs); + } else ops->unmap_page(dev, addr, size, dir, attrs); - trace_dma_unmap_page(dev, addr, size, dir, attrs); - debug_dma_unmap_page(dev, addr, size, dir); + trace_dma_unmap_phys(dev, addr, size, dir, attrs); + debug_dma_unmap_phys(dev, addr, size, dir); +} +EXPORT_SYMBOL_GPL(dma_unmap_phys); + +void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + if (unlikely(attrs & DMA_ATTR_MMIO)) + return; + + dma_unmap_phys(dev, addr, size, dir, attrs); } EXPORT_SYMBOL(dma_unmap_page_attrs); @@ -321,41 +366,18 @@ EXPORT_SYMBOL(dma_unmap_sg_attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - const struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr = DMA_MAPPING_ERROR; - - BUG_ON(!valid_dma_direction(dir)); - - if (WARN_ON_ONCE(!dev->dma_mask)) + if (IS_ENABLED(CONFIG_DMA_API_DEBUG) && + WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) return DMA_MAPPING_ERROR; - if (dma_map_direct(dev, ops)) - addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); - else if (use_dma_iommu(dev)) - addr = iommu_dma_map_resource(dev, phys_addr, size, dir, attrs); - else if (ops->map_resource) - addr = ops->map_resource(dev, phys_addr, size, dir, attrs); - - trace_dma_map_resource(dev, phys_addr, addr, size, dir, attrs); - debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs); - return addr; + return dma_map_phys(dev, phys_addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_map_resource); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - const struct dma_map_ops *ops = get_dma_ops(dev); - - 
BUG_ON(!valid_dma_direction(dir)); - if (dma_map_direct(dev, ops)) - ; /* nothing to do: uncached and no swiotlb */ - else if (use_dma_iommu(dev)) - iommu_dma_unmap_resource(dev, addr, size, dir, attrs); - else if (ops->unmap_resource) - ops->unmap_resource(dev, addr, size, dir, attrs); - trace_dma_unmap_resource(dev, addr, size, dir, attrs); - debug_dma_unmap_resource(dev, addr, size, dir); + dma_unmap_phys(dev, addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_unmap_resource); diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index 9afd569eadb9..6f9d604d9d40 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -72,8 +72,8 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, return NULL; if (use_dma_iommu(dev)) - *dma_handle = iommu_dma_map_page(dev, page, 0, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); + *dma_handle = iommu_dma_map_phys(dev, page_to_phys(page), size, + dir, DMA_ATTR_SKIP_CPU_SYNC); else *dma_handle = ops->map_page(dev, page, 0, size, dir, DMA_ATTR_SKIP_CPU_SYNC); @@ -92,7 +92,7 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page, const struct dma_map_ops *ops = get_dma_ops(dev); if (use_dma_iommu(dev)) - iommu_dma_unmap_page(dev, dma_handle, size, dir, + iommu_dma_unmap_phys(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); else if (ops->unmap_page) ops->unmap_page(dev, dma_handle, size, dir, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index abcf3fa63a56..0d37da3d95b6 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1209,7 +1209,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, nslabs = nr_slots(alloc_size); phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit); pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit, - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); if (!pool) return -1; diff --git a/kernel/fork.c b/kernel/fork.c index f1688b3e79a6..3da0f08615a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2132,9 +2132,7 @@ __latent_entropy struct task_struct *copy_process( p->pagefault_disabled = 0; -#ifdef CONFIG_LOCKDEP lockdep_init_task(p); -#endif p->blocked_on = NULL; /* not blocked yet */ @@ -2547,11 +2545,9 @@ struct task_struct * __init fork_idle(int cpu) struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) { unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| - CLONE_IO; + CLONE_IO|CLONE_VM|CLONE_UNTRACED; struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = flags, .fn = fn, .fn_arg = arg, .io_thread = 1, @@ -2663,9 +2659,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, unsigned long flags) { struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, .name = name, @@ -2681,9 +2676,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (flags & CSIGNAL), .fn = fn, 
.fn_arg = arg, }; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 8708a1205f82..b2c1f14b8129 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -95,9 +95,41 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; +static bool task_is_hung(struct task_struct *t, unsigned long timeout) +{ + unsigned long switch_count = t->nvcsw + t->nivcsw; + unsigned int state = READ_ONCE(t->__state); + + /* + * skip the TASK_KILLABLE tasks -- these can be killed + * skip the TASK_IDLE tasks -- those are genuinely idle + * skip the TASK_FROZEN task -- it reasonably stops scheduling by freezer + */ + if (!(state & TASK_UNINTERRUPTIBLE) || + (state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN))) + return false; + + /* + * When a freshly created task is scheduled once, changes its state to + * TASK_UNINTERRUPTIBLE without having ever been switched out once, it + * musn't be checked. + */ + if (unlikely(!switch_count)) + return false; + + if (switch_count != t->last_switch_count) { + t->last_switch_count = switch_count; + t->last_switch_time = jiffies; + return false; + } + if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + return false; + + return true; +} #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER -static void debug_show_blocker(struct task_struct *task) +static void debug_show_blocker(struct task_struct *task, unsigned long timeout) { struct task_struct *g, *t; unsigned long owner, blocker, blocker_type; @@ -174,41 +206,21 @@ static void debug_show_blocker(struct task_struct *task) t->pid, rwsem_blocked_by); break; } - sched_show_task(t); + /* Avoid duplicated task dump, skip if the task is also hung. */ + if (!task_is_hung(t, timeout)) + sched_show_task(t); return; } } #else -static inline void debug_show_blocker(struct task_struct *task) +static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout) { } #endif static void check_hung_task(struct task_struct *t, unsigned long timeout) { - unsigned long switch_count = t->nvcsw + t->nivcsw; - - /* - * Ensure the task is not frozen. - * Also, skip vfork and any other user process that freezer should skip. - */ - if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN)) - return; - - /* - * When a freshly created task is scheduled once, changes its state to - * TASK_UNINTERRUPTIBLE without having ever been switched out once, it - * musn't be checked. 
- */ - if (unlikely(!switch_count)) - return; - - if (switch_count != t->last_switch_count) { - t->last_switch_count = switch_count; - t->last_switch_time = jiffies; - return; - } - if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + if (!task_is_hung(t, timeout)) return; /* @@ -243,7 +255,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - debug_show_blocker(t); + debug_show_blocker(t, timeout); hung_task_show_lock = true; if (sysctl_hung_task_all_cpu_backtrace) @@ -299,7 +311,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { - unsigned int state; if (!max_count--) goto unlock; @@ -308,15 +319,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } - /* - * skip the TASK_KILLABLE tasks -- these can be killed - * skip the TASK_IDLE tasks -- those are genuinely idle - */ - state = READ_ONCE(t->__state); - if ((state & TASK_UNINTERRUPTIBLE) && - !(state & TASK_WAKEKILL) && - !(state & TASK_NOLOAD)) - check_hung_task(t, timeout); + + check_hung_task(t, timeout); } unlock: rcu_read_unlock(); diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index cf4af5728307..2b082a7e24a2 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -264,7 +264,7 @@ static int test_kallsyms_basic_function(void) char namebuf[KSYM_NAME_LEN]; struct test_stat *stat, *stat2; - stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL); + stat = kmalloc_array(2, sizeof(*stat), GFP_KERNEL); if (!stat) return -ENOMEM; stat2 = stat + 1; diff --git a/kernel/kcov.c b/kernel/kcov.c index 1d85597057e1..6563141f5de9 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -978,6 +978,15 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area, memcpy(dst_entries, src_entries, bytes_to_move); entries_moved = bytes_to_move >> entry_size_log; + /* + * A write memory barrier is required here, to ensure + * that the writes from the memcpy() are visible before + * the count is updated. Without this, it is possible for + * a user to observe a new count value but stale + * coverage data. 
+ */ + smp_wmb(); + switch (mode) { case KCOV_MODE_TRACE_PC: WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 31203f0bacaf..fa00b239c5d9 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -233,7 +233,6 @@ struct kimage *do_kimage_alloc_init(void) if (!image) return NULL; - image->head = 0; image->entry = &image->head; image->last_entry = &image->head; image->control_page = ~0; /* By default this does not apply */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 91d46502a817..eb62a9794242 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -255,6 +255,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, } image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + image->force_dtb = flags & KEXEC_FILE_FORCE_DTB; if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 555488eb1a18..5083c68c3a4e 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -988,6 +988,26 @@ static const void *kho_get_fdt(void) } /** + * is_kho_boot - check if current kernel was booted via KHO-enabled + * kexec + * + * This function checks if the current kernel was loaded through a kexec + * operation with KHO enabled, by verifying that a valid KHO FDT + * was passed. + * + * Note: This function returns reliable results only after + * kho_populate() has been called during early boot. Before that, + * it may return false even if KHO data is present. + * + * Return: true if booted via KHO-enabled kexec, false otherwise + */ +bool is_kho_boot(void) +{ + return !!kho_get_fdt(); +} +EXPORT_SYMBOL_GPL(is_kho_boot); + +/** * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. * @name: the name of the sub FDT passed to kho_add_subtree(). * @phys: if found, the physical address of the sub FDT is stored in @phys. @@ -1269,7 +1289,7 @@ int kho_fill_kimage(struct kimage *image) int err = 0; struct kexec_buf scratch; - if (!kho_enable) + if (!kho_out.finalized) return 0; image->kho.fdt = page_to_phys(kho_out.ser.fdt); diff --git a/kernel/panic.c b/kernel/panic.c index 72fcbb5a071b..24cc3eec1805 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -53,7 +53,7 @@ static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; #define sysctl_oops_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ -int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; +int panic_on_oops = IS_ENABLED(CONFIG_PANIC_ON_OOPS); static unsigned long tainted_mask = IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; @@ -67,6 +67,7 @@ static unsigned int warn_limit __read_mostly; static bool panic_console_replay; bool panic_triggering_all_cpu_backtrace; +static bool panic_this_cpu_backtrace_printed; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -77,6 +78,11 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); +static void panic_print_deprecated(void) +{ + pr_info_once("Kernel: The 'panic_print' parameter is now deprecated. 
Please use 'panic_sys_info' and 'panic_console_replay' instead.\n"); +} + #ifdef CONFIG_SYSCTL /* @@ -125,7 +131,7 @@ static int proc_taint(const struct ctl_table *table, int write, static int sysctl_panic_print_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + panic_print_deprecated(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } @@ -294,6 +300,59 @@ void __weak crash_smp_send_stop(void) atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); +bool panic_try_start(void) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + old_cpu = PANIC_CPU_INVALID; + this_cpu = raw_smp_processor_id(); + + return atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu); +} +EXPORT_SYMBOL(panic_try_start); + +void panic_reset(void) +{ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(panic_reset); + +bool panic_in_progress(void) +{ + return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(panic_in_progress); + +/* Return true if a panic is in progress on the current CPU. */ +bool panic_on_this_cpu(void) +{ + /* + * We can use raw_smp_processor_id() here because it is impossible for + * the task to be migrated to the panic_cpu, or away from it. If + * panic_cpu has already been set, and we're not currently executing on + * that CPU, then we never will be. + */ + return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); +} +EXPORT_SYMBOL(panic_on_this_cpu); + +/* + * Return true if a panic is in progress on a remote CPU. + * + * On true, the local CPU should immediately release any printing resources + * that may be needed by the panic CPU. + */ +bool panic_on_other_cpu(void) +{ + return (panic_in_progress() && !panic_on_this_cpu()); +} +EXPORT_SYMBOL(panic_on_other_cpu); + /* * A variant of panic() called from NMI context. We return if we've already * panicked on this CPU. If another CPU already panicked, loop in @@ -302,15 +361,9 @@ atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); */ void nmi_panic(struct pt_regs *regs, const char *msg) { - int old_cpu, this_cpu; - - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - - /* atomic_try_cmpxchg updates old_cpu on failure */ - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) + if (panic_try_start()) panic("%s", msg); - else if (old_cpu != this_cpu) + else if (panic_on_other_cpu()) nmi_panic_self_stop(regs); } EXPORT_SYMBOL(nmi_panic); @@ -328,6 +381,19 @@ void check_panic_on_warn(const char *origin) origin, limit); } +static void panic_trigger_all_cpu_backtrace(void) +{ + /* Temporary allow non-panic CPUs to write their backtraces. 
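The panic_try_start()/panic_reset()/panic_on_other_cpu() helpers introduced above are intended to be used the way crash_kexec() and nmi_panic() now use them. A hedged sketch of the pattern for code that must run on exactly one CPU during panic, with my_one_shot_dump() as a hypothetical stand-in:

#include <linux/panic.h>

/* Illustrative only, mirrors the crash_kexec()/nmi_panic() usage above. */
static void my_panic_path(struct pt_regs *regs)
{
	if (panic_try_start()) {
		my_one_shot_dump(regs);	/* first CPU to get here wins */
		panic_reset();		/* allow a later panic/crash_kexec */
	} else if (panic_on_other_cpu()) {
		/* another CPU owns the panic; stop touching shared state */
		return;
	}
}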
*/ + panic_triggering_all_cpu_backtrace = true; + + if (panic_this_cpu_backtrace_printed) + trigger_allbutcpu_cpu_backtrace(raw_smp_processor_id()); + else + trigger_all_cpu_backtrace(); + + panic_triggering_all_cpu_backtrace = false; +} + /* * Helper that triggers the NMI backtrace (if set in panic_print) * and then performs the secondary CPUs shutdown - we cannot have @@ -335,12 +401,8 @@ void check_panic_on_warn(const char *origin) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & SYS_INFO_ALL_CPU_BT) { - /* Temporary allow non-panic CPUs to write their backtraces. */ - panic_triggering_all_cpu_backtrace = true; - trigger_all_cpu_backtrace(); - panic_triggering_all_cpu_backtrace = false; - } + if (panic_print & SYS_INFO_ALL_CPU_BT) + panic_trigger_all_cpu_backtrace(); /* * Note that smp_send_stop() is the usual SMP shutdown function, @@ -368,7 +430,6 @@ void vpanic(const char *fmt, va_list args) static char buf[1024]; long i, i_next = 0, len; int state = 0; - int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; if (panic_on_warn) { @@ -405,13 +466,10 @@ void vpanic(const char *fmt, va_list args) * `old_cpu == this_cpu' means we came from nmi_panic() which sets * panic_cpu to this CPU. In this case, this is also the 1st CPU. */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - /* atomic_try_cmpxchg updates old_cpu on failure */ - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + if (panic_try_start()) { /* go ahead */ - } else if (old_cpu != this_cpu) + } else if (panic_on_other_cpu()) panic_smp_self_stop(); console_verbose(); @@ -422,13 +480,15 @@ void vpanic(const char *fmt, va_list args) buf[len - 1] = '\0'; pr_emerg("Kernel panic - not syncing: %s\n", buf); -#ifdef CONFIG_DEBUG_BUGVERBOSE /* * Avoid nested stack-dumping if a panic occurs during oops processing */ - if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) + if (test_taint(TAINT_DIE) || oops_in_progress > 1) { + panic_this_cpu_backtrace_printed = true; + } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) { dump_stack(); -#endif + panic_this_cpu_backtrace_printed = true; + } /* * If kgdb is enabled, give it a chance to run before we stop all @@ -937,12 +997,29 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif core_param(panic, panic_timeout, int, 0644); -core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); core_param(panic_console_replay, panic_console_replay, bool, 0644); +static int panic_print_set(const char *val, const struct kernel_param *kp) +{ + panic_print_deprecated(); + return param_set_ulong(val, kp); +} + +static int panic_print_get(char *val, const struct kernel_param *kp) +{ + panic_print_deprecated(); + return param_get_ulong(val, kp); +} + +static const struct kernel_param_ops panic_print_ops = { + .set = panic_print_set, + .get = panic_print_get, +}; +__core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644); + static int __init oops_setup(char *s) { if (!s) diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index ef282001f200..f72bbfa266d6 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -332,7 +332,6 @@ struct printk_message { unsigned long dropped; }; -bool other_cpu_in_panic(void); bool printk_get_next_message(struct printk_message *pmsg, u64 seq, bool is_extended, bool may_supress); diff --git 
a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 646801813415..558ef3177976 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -12,6 +12,7 @@ #include <linux/irqflags.h> #include <linux/kthread.h> #include <linux/minmax.h> +#include <linux/panic.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/slab.h> @@ -254,7 +255,7 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, * opportunity to perform any necessary cleanup if they were * interrupted by the panic CPU while printing. */ - if (other_cpu_in_panic() && + if (panic_on_other_cpu() && (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; } @@ -309,7 +310,7 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) * Event #2 implies the new context is PANIC. * Event #3 occurs when panic() has flushed the console. * Event #4 occurs when a non-panic CPU reacquires. - * Event #5 is not possible due to the other_cpu_in_panic() check + * Event #5 is not possible due to the panic_on_other_cpu() check * in nbcon_context_try_acquire_handover(). */ @@ -348,7 +349,7 @@ static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt, struct nbcon_state new; /* Note that the caller must still remove the request! */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return -EPERM; /* @@ -446,7 +447,7 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, * nbcon_waiter_matches(). In particular, the assumption that * lower priorities are ignored during panic. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return -EPERM; /* Handover is not possible on the same CPU. */ @@ -589,7 +590,6 @@ static struct printk_buffers panic_nbcon_pbufs; */ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire) { - unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state cur; int err; @@ -614,7 +614,7 @@ out: /* Acquire succeeded. */ /* Assign the appropriate buffer for this context. */ - if (atomic_read(&panic_cpu) == cpu) + if (panic_on_this_cpu()) ctxt->pbufs = &panic_nbcon_pbufs; else ctxt->pbufs = con->pbufs; @@ -1394,7 +1394,7 @@ enum nbcon_prio nbcon_get_default_prio(void) { unsigned int *cpu_emergency_nesting; - if (this_cpu_in_panic()) + if (panic_on_this_cpu()) return NBCON_PRIO_PANIC; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 0efbcdda9aab..5aee9ffb16b9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -48,6 +48,7 @@ #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> +#include <linux/panic.h> #include <linux/uaccess.h> #include <asm/sections.h> @@ -345,34 +346,6 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) -static bool panic_in_progress(void) -{ - return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); -} - -/* Return true if a panic is in progress on the current CPU. */ -bool this_cpu_in_panic(void) -{ - /* - * We can use raw_smp_processor_id() here because it is impossible for - * the task to be migrated to the panic_cpu, or away from it. If - * panic_cpu has already been set, and we're not currently executing on - * that CPU, then we never will be. - */ - return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); -} - -/* - * Return true if a panic is in progress on a remote CPU. 
- * - * On true, the local CPU should immediately release any printing resources - * that may be needed by the panic CPU. - */ -bool other_cpu_in_panic(void) -{ - return (panic_in_progress() && !this_cpu_in_panic()); -} - /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -2407,7 +2380,7 @@ asmlinkage int vprintk_emit(int facility, int level, * non-panic CPUs are generating any messages, they will be * silently dropped. */ - if (other_cpu_in_panic() && + if (panic_on_other_cpu() && !debug_non_panic_cpus && !panic_triggering_all_cpu_backtrace) return 0; @@ -2843,7 +2816,7 @@ void console_lock(void) might_sleep(); /* On panic, the console_lock must be left to the panic cpu. */ - while (other_cpu_in_panic()) + while (panic_on_other_cpu()) msleep(1000); down_console_sem(); @@ -2863,7 +2836,7 @@ EXPORT_SYMBOL(console_lock); int console_trylock(void) { /* On panic, the console_lock must be left to the panic cpu. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return 0; if (down_trylock_console_sem()) return 0; @@ -3243,7 +3216,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove any_progress = true; /* Allow panic_cpu to take over the consoles safely. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) goto abandon; if (do_cond_resched) diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index d9fb053cff67..e2a1b2d34d2b 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -2143,7 +2143,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ - if (this_cpu_in_panic() && + if (panic_on_this_cpu() && (!debug_non_panic_cpus || legacy_allow_panic_sync) && ((*seq + 1) < prb_next_reserve_seq(rb))) { (*seq)++; diff --git a/kernel/sys.c b/kernel/sys.c index a46d9b75880b..8b58eece4e58 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1734,6 +1734,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, struct rlimit old, new; struct task_struct *tsk; unsigned int checkflags = 0; + bool need_tasklist; int ret; if (old_rlim) @@ -1760,8 +1761,25 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, get_task_struct(tsk); rcu_read_unlock(); - ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, - old_rlim ? &old : NULL); + need_tasklist = !same_thread_group(tsk, current); + if (need_tasklist) { + /* + * Ensure we can't race with group exit or de_thread(), + * so tsk->group_leader can't be freed or changed until + * read_unlock(tasklist_lock) below. + */ + read_lock(&tasklist_lock); + if (!pid_alive(tsk)) + ret = -ESRCH; + } + + if (!ret) { + ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, + old_rlim ? &old : NULL); + } + + if (need_tasklist) + read_unlock(&tasklist_lock); if (!ret && old_rlim) { rlim_to_rlim64(&old, &old64); @@ -2515,7 +2533,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = -EINVAL; break; } + /* + * Ensure that either: + * + * 1. Subsequent getppid() calls reflect the parent process having died. + * 2. forget_original_parent() will send the new me->pdeath_signal. + * + * Also prevent the read of me->pdeath_signal from being a data race. 
+ */ + read_lock(&tasklist_lock); me->pdeath_signal = arg2; + read_unlock(&tasklist_lock); break; case PR_GET_PDEATHSIG: error = put_user(me->pdeath_signal, (int __user *)arg2); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 8f23f5273bab..4f87c16d915a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -899,7 +899,7 @@ const struct bpf_func_proto bpf_send_signal_thread_proto = { .arg1_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) +BPF_CALL_3(bpf_d_path, const struct path *, path, char *, buf, u32, sz) { struct path copy; long len; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b3c94fbaf002..156e7e0bf559 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10211,8 +10211,7 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n"); - ret = vfs_parse_fs_string(fc, "source", - "tracefs", strlen("tracefs")); + ret = vfs_parse_fs_string(fc, "source", "tracefs"); if (!ret) mnt = fc_mount(fc); else diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 80b56c002c7f..5b62d1002783 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -425,7 +425,11 @@ static DEFINE_PER_CPU(u8, cpustat_tail); */ static u16 get_16bit_precision(u64 data_ns) { - return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */ + /* + * 2^24ns ~= 16.8ms + * Round to the nearest multiple of 16.8 milliseconds. + */ + return (data_ns + (1 << 23)) >> 24LL; } static void update_cpustat(void) @@ -444,6 +448,14 @@ static void update_cpustat(void) old_stat = __this_cpu_read(cpustat_old[i]); new_stat = get_16bit_precision(cpustat[tracked_stats[i]]); util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16); + /* + * Since we use 16-bit precision, the raw data will undergo + * integer division, which may sometimes result in data loss, + * and then result might exceed 100%. To avoid confusion, + * we enforce a 100% display cap when calculations exceed this threshold. + */ + if (util > 100) + util = 100; __this_cpu_write(cpustat_util[tail][i], util); __this_cpu_write(cpustat_old[i], new_stat); } @@ -455,17 +467,17 @@ static void print_cpustat(void) { int i, group; u8 tail = __this_cpu_read(cpustat_tail); - u64 sample_period_second = sample_period; + u64 sample_period_msecond = sample_period; - do_div(sample_period_second, NSEC_PER_SEC); + do_div(sample_period_msecond, NSEC_PER_MSEC); /* * Outputting the "watchdog" prefix on every line is redundant and not * concise, and the original alarm information is sufficient for * positioning in logs, hence here printk() is used instead of pr_crit(). 
*/ - printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n", - smp_processor_id(), sample_period_second); + printk(KERN_CRIT "CPU#%d Utilization every %llums during lockup:\n", + smp_processor_id(), sample_period_msecond); for (i = 0; i < NUM_SAMPLE_PERIODS; i++) { group = (tail + i) % NUM_SAMPLE_PERIODS; @@ -740,6 +752,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (!watchdog_enabled) return HRTIMER_NORESTART; + /* + * pass the buddy check if a panic is in process + */ + if (panic_in_progress()) + return HRTIMER_NORESTART; + watchdog_hardlockup_kick(); /* kick the softlockup detector */ diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 9c58f5b4381d..d3ca70e3c256 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "NMI watchdog: " fmt +#include <linux/panic.h> #include <linux/nmi.h> #include <linux/atomic.h> #include <linux/module.h> @@ -108,6 +109,9 @@ static void watchdog_overflow_callback(struct perf_event *event, /* Ensure the watchdog never gets throttled */ event->hw.interrupts = 0; + if (panic_in_progress()) + return; + if (!watchdog_check_timestamp()) return; |
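The panic.c, printk/nbcon.c, printk/printk.c and printk_ringbuffer.c hunks above retire the printk-local this_cpu_in_panic()/other_cpu_in_panic() helpers and the open-coded atomic_try_cmpxchg() in vpanic() in favour of shared panic_on_this_cpu()/panic_on_other_cpu()/panic_try_start() helpers reached via <linux/panic.h>. Their definitions are not part of this diff; the sketch below only restates the semantics of the code being removed, assuming the new helpers keep those semantics unchanged.

#include <linux/atomic.h>
#include <linux/smp.h>		/* panic_cpu, PANIC_CPU_INVALID (assumed location) */

/* Sketch only: behaviour inferred from the removed printk.c/vpanic() code. */
static inline bool panic_in_progress(void)
{
	return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
}

/* True if the panic is being handled on the current CPU. */
static inline bool panic_on_this_cpu(void)
{
	/*
	 * raw_smp_processor_id() is fine here: once panic_cpu is set, a task
	 * can neither migrate onto nor away from that CPU.
	 */
	return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id());
}

/* True if another CPU owns the panic; yield any printing resources to it. */
static inline bool panic_on_other_cpu(void)
{
	return panic_in_progress() && !panic_on_this_cpu();
}

/* Claim panic ownership for this CPU; true means the caller may proceed. */
static inline bool panic_try_start(void)
{
	int old_cpu = PANIC_CPU_INVALID;

	/* atomic_try_cmpxchg() updates old_cpu on failure. */
	return atomic_try_cmpxchg(&panic_cpu, &old_cpu, raw_smp_processor_id());
}

With helpers of this shape, nbcon_context_try_acquire() no longer needs its local smp_processor_id() read, and vpanic() keeps the same flow: proceed after a successful panic_try_start(), otherwise self-stop only when the panic belongs to another CPU.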
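The panic_print hunk replaces a plain core_param() with kernel_param_ops wrappers so that every read or write of the parameter goes through panic_print_deprecated(). That helper is referenced but not defined in this diff; assuming it only needs to nag once per boot, and that the replacement is the sys_info interface suggested by the SYS_INFO_ALL_CPU_BT flag, a minimal form could be:

/* Hypothetical sketch, not part of this diff. */
static void panic_print_deprecated(void)
{
	pr_warn_once("panic_print is deprecated, use the sys_info interface instead\n");
}

Routing the parameter through __core_param_cb(), as the hunk does, keeps the existing panic_print command-line and sysfs surface working while steering users away from it.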
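The prlimit64 hunk takes tasklist_lock only when the target task is in a different thread group, so tsk->group_leader cannot be freed or changed under do_prlimit() by group exit or de_thread(); same-group callers stay lock-free. A minimal userspace exercise of that cross-process path, through the glibc prlimit() wrapper (the PID and resource are arbitrary examples), might look like this:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/resource.h>

int main(int argc, char **argv)
{
	struct rlimit old;
	pid_t pid;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	pid = (pid_t)atoi(argv[1]);

	/* Read-only query: new_limit == NULL, old_limit != NULL. */
	if (prlimit(pid, RLIMIT_NOFILE, NULL, &old)) {
		perror("prlimit");
		return 1;
	}
	printf("pid %d RLIMIT_NOFILE: soft=%llu hard=%llu\n", (int)pid,
	       (unsigned long long)old.rlim_cur,
	       (unsigned long long)old.rlim_max);
	return 0;
}

When the PID names a task outside the caller's thread group, a request like this is exactly the path that now runs under read_lock(&tasklist_lock) in the kernel.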
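The prctl hunk moves the PR_SET_PDEATHSIG store under tasklist_lock so it cannot race with reparenting, matching the guarantees spelled out in the new comment. The userspace interface is unchanged; the usual pattern of requesting a signal on parent death and then re-checking for a parent that already exited is sketched below.

#include <signal.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

int main(void)
{
	/* Ask the kernel to deliver SIGTERM when the parent of this task dies. */
	if (prctl(PR_SET_PDEATHSIG, SIGTERM) == -1) {
		perror("prctl(PR_SET_PDEATHSIG)");
		return 1;
	}

	/*
	 * The setting only helps while the parent is alive; if it exited
	 * before the prctl() call, this task was already reparented and no
	 * signal will arrive for that earlier death.
	 */
	if (getppid() == 1)
		fprintf(stderr, "parent already gone\n");

	return 0;
}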
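The trace.c hunk adapts trace_automount() to a vfs_parse_fs_string() that takes the value as a plain NUL-terminated string and derives the length itself. A self-contained sketch of the same mount sequence, modelled on trace_automount() but using a plain fs_context_for_mount() so it needs no mountpoint dentry, is shown below; treat it as an illustration of the new call signature rather than a drop-in replacement.

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/mount.h>

static struct vfsmount *tracefs_mount_sketch(void)
{
	struct file_system_type *type = get_fs_type("tracefs");
	struct fs_context *fc;
	struct vfsmount *mnt;
	int ret;

	if (!type)
		return ERR_PTR(-ENODEV);

	fc = fs_context_for_mount(type, 0);
	put_filesystem(type);
	if (IS_ERR(fc))
		return ERR_CAST(fc);

	/* The explicit strlen() argument is gone; the string is taken as-is. */
	ret = vfs_parse_fs_string(fc, "source", "tracefs");
	mnt = ret ? ERR_PTR(ret) : fc_mount(fc);

	put_fs_context(fc);
	return mnt;
}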
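The watchdog.c hunks switch get_16bit_precision() from truncation to round-to-nearest (adding half of 2^24 ns before the shift) and clamp the derived utilization at 100%, because the 16-bit quantization can make DIV_ROUND_UP() overshoot slightly. The standalone demo below (plain userspace C, helper names invented for illustration) shows a value where truncation and rounding diverge.

#include <stdint.h>
#include <stdio.h>

/* One 16-bit unit is 2^24 ns ~= 16.8 ms, mirroring get_16bit_precision(). */
static uint16_t to_16bit_trunc(uint64_t ns)
{
	return ns >> 24;
}

static uint16_t to_16bit_round(uint64_t ns)
{
	return (ns + (1ULL << 23)) >> 24;
}

int main(void)
{
	uint64_t ns = 26ULL * 1000 * 1000;	/* 26 ms of CPU time */

	/* 26 ms / 16.8 ms ~= 1.55: truncation drops the fraction, rounding keeps it. */
	printf("trunc=%u round=%u\n", to_16bit_trunc(ns), to_16bit_round(ns));
	/* prints: trunc=1 round=2 */
	return 0;
}

The same quantization is why the new 100% cap exists: after rounding, the per-sample delta can slightly exceed sample_period_16, and the uncapped division would then report a figure just above 100.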