Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.kexec              |  11
-rw-r--r--  kernel/Makefile                   |   1
-rw-r--r--  kernel/acct.c                     |  96
-rw-r--r--  kernel/audit_tree.c               |  12
-rw-r--r--  kernel/bpf/verifier.c             |   7
-rw-r--r--  kernel/crash_core.c               |  30
-rw-r--r--  kernel/crash_core_test.c          | 343
-rw-r--r--  kernel/dma/debug.c                |  82
-rw-r--r--  kernel/dma/debug.h                |  37
-rw-r--r--  kernel/dma/direct.c               |  53
-rw-r--r--  kernel/dma/direct.h               |  57
-rw-r--r--  kernel/dma/mapping.c              | 112
-rw-r--r--  kernel/dma/ops_helpers.c          |   6
-rw-r--r--  kernel/dma/swiotlb.c              |   2
-rw-r--r--  kernel/fork.c                     |  18
-rw-r--r--  kernel/hung_task.c                |  78
-rw-r--r--  kernel/kallsyms_selftest.c        |   2
-rw-r--r--  kernel/kcov.c                     |   9
-rw-r--r--  kernel/kexec_core.c               |   1
-rw-r--r--  kernel/kexec_file.c               |   1
-rw-r--r--  kernel/kexec_handover.c           |  22
-rw-r--r--  kernel/panic.c                    | 129
-rw-r--r--  kernel/printk/internal.h          |   1
-rw-r--r--  kernel/printk/nbcon.c             |  14
-rw-r--r--  kernel/printk/printk.c            |  37
-rw-r--r--  kernel/printk/printk_ringbuffer.c |   2
-rw-r--r--  kernel/sys.c                      |  32
-rw-r--r--  kernel/trace/bpf_trace.c          |   2
-rw-r--r--  kernel/trace/trace.c              |   3
-rw-r--r--  kernel/watchdog.c                 |  28
-rw-r--r--  kernel/watchdog_perf.c            |   4
31 files changed, 823 insertions, 409 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 1224dd937df0..422270d64820 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -148,6 +148,17 @@ config CRASH_DM_CRYPT_CONFIGS
CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that
is required to be built-in.
+config CRASH_DUMP_KUNIT_TEST
+ tristate "Unit Tests for kernel crash dumps" if !KUNIT_ALL_TESTS
+ depends on CRASH_DUMP && KUNIT
+ default KUNIT_ALL_TESTS
+ help
+	  This option builds KUnit unit tests for kernel crash dumps. The
+	  tests verify the correctness of the covered functions and guard
+	  against regressions.
+
+ If unsure, say N.
+
config CRASH_HOTPLUG
bool "Update the crash elfcorehdr on system configuration changes"
default y
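
A minimal way to exercise the new option, assuming the stock KUnit wrapper and
that the remaining CRASH_DUMP dependencies are satisfied (the fragment below is
illustrative, not part of the patch):

	# .kunitconfig fragment
	CONFIG_KUNIT=y
	CONFIG_CRASH_DUMP=y
	CONFIG_CRASH_DUMP_KUNIT_TEST=y

	# run only the suite added by this series
	./tools/testing/kunit/kunit.py run 'crash_exclude_mem_range_tests'
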
diff --git a/kernel/Makefile b/kernel/Makefile
index 41751834e764..df3dd8291bb6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -78,6 +78,7 @@ obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
obj-$(CONFIG_CRASH_DUMP) += crash_core.o
obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o
+obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 6520baa13669..61630110e29d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -44,19 +44,14 @@
* a struct file opened for write. Fixed. 2/6/2000, AV.
*/
-#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/acct.h>
#include <linux/capability.h>
-#include <linux/file.h>
#include <linux/tty.h>
-#include <linux/security.h>
-#include <linux/vfs.h>
+#include <linux/statfs.h>
#include <linux/jiffies.h>
-#include <linux/times.h>
#include <linux/syscalls.h>
-#include <linux/mount.h>
-#include <linux/uaccess.h>
+#include <linux/namei.h>
#include <linux/sched/cputime.h>
#include <asm/div64.h>
@@ -217,84 +212,70 @@ static void close_work(struct work_struct *work)
complete(&acct->done);
}
-static int acct_on(struct filename *pathname)
+DEFINE_FREE(fput_sync, struct file *, if (!IS_ERR_OR_NULL(_T)) __fput_sync(_T))
+static int acct_on(const char __user *name)
{
- struct file *file;
- struct vfsmount *mnt, *internal;
+ /* Difference from BSD - they don't do O_APPEND */
+ const int open_flags = O_WRONLY|O_APPEND|O_LARGEFILE;
struct pid_namespace *ns = task_active_pid_ns(current);
+ struct filename *pathname __free(putname) = getname(name);
+ struct file *original_file __free(fput) = NULL; // in that order
+ struct path internal __free(path_put) = {}; // in that order
+ struct file *file __free(fput_sync) = NULL; // in that order
struct bsd_acct_struct *acct;
+ struct vfsmount *mnt;
struct fs_pin *old;
- int err;
- acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
- if (!acct)
- return -ENOMEM;
+ if (IS_ERR(pathname))
+ return PTR_ERR(pathname);
+ original_file = file_open_name(pathname, open_flags, 0);
+ if (IS_ERR(original_file))
+ return PTR_ERR(original_file);
- /* Difference from BSD - they don't do O_APPEND */
- file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
- if (IS_ERR(file)) {
- kfree(acct);
+ mnt = mnt_clone_internal(&original_file->f_path);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ internal.mnt = mnt;
+ internal.dentry = dget(mnt->mnt_root);
+
+ file = dentry_open(&internal, open_flags, current_cred());
+ if (IS_ERR(file))
return PTR_ERR(file);
- }
- if (!S_ISREG(file_inode(file)->i_mode)) {
- kfree(acct);
- filp_close(file, NULL);
+ if (!S_ISREG(file_inode(file)->i_mode))
return -EACCES;
- }
	/* Exclude kernel internal filesystems. */
- if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) {
- kfree(acct);
- filp_close(file, NULL);
+ if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT))
return -EINVAL;
- }
/* Exclude procfs and sysfs. */
- if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) {
- kfree(acct);
- filp_close(file, NULL);
+ if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE)
return -EINVAL;
- }
- if (!(file->f_mode & FMODE_CAN_WRITE)) {
- kfree(acct);
- filp_close(file, NULL);
+ if (!(file->f_mode & FMODE_CAN_WRITE))
return -EIO;
- }
- internal = mnt_clone_internal(&file->f_path);
- if (IS_ERR(internal)) {
- kfree(acct);
- filp_close(file, NULL);
- return PTR_ERR(internal);
- }
- err = mnt_get_write_access(internal);
- if (err) {
- mntput(internal);
- kfree(acct);
- filp_close(file, NULL);
- return err;
- }
- mnt = file->f_path.mnt;
- file->f_path.mnt = internal;
+
+ acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+ if (!acct)
+ return -ENOMEM;
atomic_long_set(&acct->count, 1);
init_fs_pin(&acct->pin, acct_pin_kill);
- acct->file = file;
+ acct->file = no_free_ptr(file);
acct->needcheck = jiffies;
acct->ns = ns;
mutex_init(&acct->lock);
INIT_WORK(&acct->work, close_work);
init_completion(&acct->done);
mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
- pin_insert(&acct->pin, mnt);
+ pin_insert(&acct->pin, original_file->f_path.mnt);
rcu_read_lock();
old = xchg(&ns->bacct, &acct->pin);
mutex_unlock(&acct->lock);
pin_kill(old);
- mnt_put_write_access(mnt);
- mntput(mnt);
return 0;
}
@@ -319,14 +300,9 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
return -EPERM;
if (name) {
- struct filename *tmp = getname(name);
-
- if (IS_ERR(tmp))
- return PTR_ERR(tmp);
mutex_lock(&acct_on_mutex);
- error = acct_on(tmp);
+ error = acct_on(name);
mutex_unlock(&acct_on_mutex);
- putname(tmp);
} else {
rcu_read_lock();
pin_kill(task_active_pid_ns(current)->bacct);
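
The acct_on() rewrite above leans on the scope-based cleanup helpers from
<linux/cleanup.h>: __free() attaches a destructor that runs when the variable
goes out of scope (in reverse declaration order, hence the "in that order"
comments), and no_free_ptr() transfers ownership out of the scope. A minimal
standalone sketch of the pattern; kfree_buf, fill() and consume() are
illustrative names, not taken from the patch:

	#include <linux/cleanup.h>
	#include <linux/slab.h>

	/* free expression runs automatically at end of scope */
	DEFINE_FREE(kfree_buf, void *, if (_T) kfree(_T))

	static int example(void)
	{
		void *buf __free(kfree_buf) = kmalloc(64, GFP_KERNEL);

		if (!buf)
			return -ENOMEM;
		if (!fill(buf))			/* early return frees buf */
			return -EIO;

		/* suppress the cleanup and hand ownership onward */
		return consume(no_free_ptr(buf));
	}
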
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1605df0a171e..fda6beb041e0 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -680,7 +680,7 @@ void audit_trim_trees(void)
struct audit_tree *tree;
struct path path;
struct audit_node *node;
- struct path *paths;
+ const struct path *paths;
struct path array[16];
int err;
@@ -703,7 +703,7 @@ void audit_trim_trees(void)
struct audit_chunk *chunk = find_chunk(node);
		/* this could be NULL if the watch is dying elsewhere... */
node->index |= 1U<<31;
- for (struct path *p = paths; p->dentry; p++) {
+ for (const struct path *p = paths; p->dentry; p++) {
struct inode *inode = p->dentry->d_inode;
if (inode_to_key(inode) == chunk->key) {
node->index &= ~(1U<<31);
@@ -742,9 +742,9 @@ void audit_put_tree(struct audit_tree *tree)
put_tree(tree);
}
-static int tag_mounts(struct path *paths, struct audit_tree *tree)
+static int tag_mounts(const struct path *paths, struct audit_tree *tree)
{
- for (struct path *p = paths; p->dentry; p++) {
+ for (const struct path *p = paths; p->dentry; p++) {
int err = tag_chunk(p->dentry->d_inode, tree);
if (err)
return err;
@@ -807,7 +807,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
struct audit_tree *seed = rule->tree, *tree;
struct path path;
struct path array[16];
- struct path *paths;
+ const struct path *paths;
int err;
rule->tree = NULL;
@@ -879,7 +879,7 @@ int audit_tag_tree(char *old, char *new)
int failed = 0;
struct path path1, path2;
struct path array[16];
- struct path *paths;
+ const struct path *paths;
int err;
err = kern_path(new, 0, &path2);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 73bba397672a..ff40e5e65c43 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15645,7 +15645,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- if (opcode == BPF_NEG) {
+ if (opcode == BPF_NEG &&
+ regs[insn->dst_reg].type == SCALAR_VALUE) {
err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
err = err ?: adjust_scalar_min_max_vals(env, insn,
&regs[insn->dst_reg],
@@ -15803,7 +15804,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else { /* all other ALU ops: and, sub, xor, add, ... */
if (BPF_SRC(insn->code) == BPF_X) {
- if (insn->imm != 0 || insn->off > 1 ||
+ if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) ||
(insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
@@ -15813,7 +15814,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
} else {
- if (insn->src_reg != BPF_REG_0 || insn->off > 1 ||
+ if (insn->src_reg != BPF_REG_0 || (insn->off != 0 && insn->off != 1) ||
(insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) {
verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index a4ef79591eb2..3b1c43382eec 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -22,6 +22,7 @@
#include <linux/btf.h>
#include <linux/objtool.h>
#include <linux/delay.h>
+#include <linux/panic.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -143,17 +144,7 @@ STACK_FRAME_NON_STANDARD(__crash_kexec);
__bpf_kfunc void crash_kexec(struct pt_regs *regs)
{
- int old_cpu, this_cpu;
-
- /*
- * Only one CPU is allowed to execute the crash_kexec() code as with
- * panic(). Otherwise parallel calls of panic() and crash_kexec()
- * may stop each other. To exclude them, we use panic_cpu here too.
- */
- old_cpu = PANIC_CPU_INVALID;
- this_cpu = raw_smp_processor_id();
-
- if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
+ if (panic_try_start()) {
/* This is the 1st CPU which comes here, so go ahead. */
__crash_kexec(regs);
@@ -161,7 +152,7 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs)
* Reset panic_cpu to allow another panic()/crash_kexec()
* call.
*/
- atomic_set(&panic_cpu, PANIC_CPU_INVALID);
+ panic_reset();
}
}
@@ -274,6 +265,20 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
return 0;
}
+/**
+ * crash_exclude_mem_range - exclude a memory range from existing ranges
+ * @mem: mem->ranges contains an array of ranges sorted in ascending order
+ * @mstart: the start of the to-be-excluded range
+ * @mend: the end of the to-be-excluded range
+ *
+ * If you are unsure whether a range split will happen, always make sure
+ * mem->max_nr_ranges == mem->nr_ranges + 1 before each call, so the call
+ * cannot fail with -ENOMEM.
+ *
+ * Return: 0 if a memory range is excluded successfully,
+ * -ENOMEM if mem->ranges doesn't have space to hold split ranges
+ */
int crash_exclude_mem_range(struct crash_mem *mem,
unsigned long long mstart, unsigned long long mend)
{
@@ -333,6 +338,7 @@ int crash_exclude_mem_range(struct crash_mem *mem,
return 0;
}
+EXPORT_SYMBOL_GPL(crash_exclude_mem_range);
ssize_t crash_get_memory_size(void)
{
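
Given the documentation added above, a hedged usage sketch of
crash_exclude_mem_range() (values borrowed from the regression test data in the
new test file below; assumes struct crash_mem keeps its flexible ranges[]
array):

	struct crash_mem *mem;
	int ret;

	/* worst case splits one range in two, so size for nr_ranges + 1 */
	mem = kzalloc(struct_size(mem, ranges, 2), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;
	mem->max_nr_ranges = 2;
	mem->nr_ranges = 1;
	mem->ranges[0] = (struct range){ .start = 0x0, .end = 0x9ffff };

	ret = crash_exclude_mem_range(mem, 0x3f000, 0x3ffff);
	/* on success, mem->nr_ranges == 2: [0x0,0x3efff] and [0x40000,0x9ffff] */
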
diff --git a/kernel/crash_core_test.c b/kernel/crash_core_test.c
new file mode 100644
index 000000000000..8aadf6801530
--- /dev/null
+++ b/kernel/crash_core_test.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <kunit/test.h>
+#include <linux/crash_core.h> // For struct crash_mem and struct range
+
+// Helper to create and initialize crash_mem
+static struct crash_mem *create_crash_mem(struct kunit *test, unsigned int max_ranges,
+ unsigned int nr_initial_ranges,
+ const struct range *initial_ranges)
+{
+ struct crash_mem *mem;
+ size_t alloc_size;
+
+ // Check if max_ranges can even hold initial_ranges
+ if (max_ranges < nr_initial_ranges) {
+ kunit_err(test, "max_ranges (%u) < nr_initial_ranges (%u)\n",
+ max_ranges, nr_initial_ranges);
+ return NULL;
+ }
+
+ alloc_size = sizeof(struct crash_mem) + (size_t)max_ranges * sizeof(struct range);
+ mem = kunit_kzalloc(test, alloc_size, GFP_KERNEL);
+ if (!mem) {
+ kunit_err(test, "Failed to allocate crash_mem\n");
+ return NULL;
+ }
+
+ mem->max_nr_ranges = max_ranges;
+ mem->nr_ranges = nr_initial_ranges;
+ if (initial_ranges && nr_initial_ranges > 0) {
+ memcpy(mem->ranges, initial_ranges,
+ nr_initial_ranges * sizeof(struct range));
+ }
+
+ return mem;
+}
+
+// Helper to compare ranges for assertions
+static void assert_ranges_equal(struct kunit *test,
+ const struct range *actual_ranges,
+ unsigned int actual_nr_ranges,
+ const struct range *expected_ranges,
+ unsigned int expected_nr_ranges,
+ const char *case_name)
+{
+ unsigned int i;
+
+ KUNIT_ASSERT_EQ_MSG(test, expected_nr_ranges, actual_nr_ranges,
+ "%s: Number of ranges mismatch.", case_name);
+
+ for (i = 0; i < expected_nr_ranges; i++) {
+ KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].start, actual_ranges[i].start,
+ "%s: Range %u start mismatch.", case_name, i);
+ KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].end, actual_ranges[i].end,
+ "%s: Range %u end mismatch.", case_name, i);
+ }
+}
+
+// Structure for test parameters
+struct exclude_test_param {
+ const char *description;
+ unsigned long long exclude_start;
+ unsigned long long exclude_end;
+ unsigned int initial_max_ranges;
+ const struct range *initial_ranges;
+ unsigned int initial_nr_ranges;
+ const struct range *expected_ranges;
+ unsigned int expected_nr_ranges;
+ int expected_ret;
+};
+
+static void run_exclude_test_case(struct kunit *test, const struct exclude_test_param *params)
+{
+ struct crash_mem *mem;
+ int ret;
+
+ kunit_info(test, "%s", params->description);
+
+ mem = create_crash_mem(test, params->initial_max_ranges,
+ params->initial_nr_ranges, params->initial_ranges);
+ if (!mem)
+ return; // Error already logged by create_crash_mem or kunit_kzalloc
+
+ ret = crash_exclude_mem_range(mem, params->exclude_start, params->exclude_end);
+
+ KUNIT_ASSERT_EQ_MSG(test, params->expected_ret, ret,
+ "%s: Return value mismatch.", params->description);
+
+ if (params->expected_ret == 0) {
+ assert_ranges_equal(test, mem->ranges, mem->nr_ranges,
+ params->expected_ranges, params->expected_nr_ranges,
+ params->description);
+ } else {
+ // If an error is expected, nr_ranges might still be relevant to check
+ // depending on the exact point of failure. For ENOMEM on split,
+ // nr_ranges shouldn't have changed.
+ KUNIT_ASSERT_EQ_MSG(test, params->initial_nr_ranges,
+ mem->nr_ranges,
+ "%s: Number of ranges mismatch on error.",
+ params->description);
+ }
+}
+
+/*
+ * Test Strategy 1: One to-be-excluded range A and one existing range B.
+ *
+ * Exhaust all possibilities of the position of A regarding B.
+ */
+
+static const struct range single_range_b = { .start = 100, .end = 199 };
+
+static const struct exclude_test_param exclude_single_range_test_data[] = {
+ {
+ .description = "1.1: A is left of B, no overlap",
+ .exclude_start = 10, .exclude_end = 50,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.2: A's right boundary touches B's left boundary",
+ .exclude_start = 10, .exclude_end = 99,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.3: A overlaps B's left part",
+ .exclude_start = 50, .exclude_end = 149,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 150, .end = 199 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.4: A is completely inside B",
+ .exclude_start = 120, .exclude_end = 179,
+ .initial_max_ranges = 2, // Needs space for split
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){
+ { .start = 100, .end = 119 },
+ { .start = 180, .end = 199 }
+ },
+ .expected_nr_ranges = 2,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.5: A overlaps B's right part",
+ .exclude_start = 150, .exclude_end = 249,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 100, .end = 149 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.6: A's left boundary touches B's right boundary",
+ .exclude_start = 200, .exclude_end = 250,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.7: A is right of B, no overlap",
+ .exclude_start = 250, .exclude_end = 300,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.8: A completely covers B and extends beyond",
+ .exclude_start = 50, .exclude_end = 250,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.9: A covers B and extends to the left",
+ .exclude_start = 50, .exclude_end = 199, // A ends exactly where B ends
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.10: A covers B and extends to the right",
+ .exclude_start = 100, .exclude_end = 250, // A starts exactly where B starts
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.11: A is identical to B",
+ .exclude_start = 100, .exclude_end = 199,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.12: A is a point, left of B, no overlap",
+ .exclude_start = 10, .exclude_end = 10,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.13: A is a point, at start of B",
+ .exclude_start = 100, .exclude_end = 100,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 101, .end = 199 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.14: A is a point, in middle of B (causes split)",
+ .exclude_start = 150, .exclude_end = 150,
+ .initial_max_ranges = 2, // Needs space for split
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){
+ { .start = 100, .end = 149 },
+ { .start = 151, .end = 199 }
+ },
+ .expected_nr_ranges = 2,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.15: A is a point, at end of B",
+ .exclude_start = 199, .exclude_end = 199,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = (const struct range[]){{ .start = 100, .end = 198 }},
+ .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ {
+ .description = "1.16: A is a point, right of B, no overlap",
+ .exclude_start = 250, .exclude_end = 250,
+ .initial_max_ranges = 1,
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = &single_range_b, .expected_nr_ranges = 1,
+ .expected_ret = 0,
+ },
+ // ENOMEM case for single range split
+ {
+ .description = "1.17: A completely inside B (split), no space (ENOMEM)",
+ .exclude_start = 120, .exclude_end = 179,
+ .initial_max_ranges = 1, // Not enough for split
+ .initial_ranges = &single_range_b, .initial_nr_ranges = 1,
+ .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content
+ .expected_nr_ranges = 1, // Should remain unchanged
+ .expected_ret = -ENOMEM,
+ },
+};
+
+
+static void exclude_single_range_test(struct kunit *test)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(exclude_single_range_test_data); i++) {
+ kunit_log(KERN_INFO, test, "Running: %s", exclude_single_range_test_data[i].description);
+ run_exclude_test_case(test, &exclude_single_range_test_data[i]);
+ // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case
+ }
+}
+
+/*
+ * Test Strategy 2: Regression test.
+ */
+
+static const struct exclude_test_param exclude_range_regression_test_data[] = {
+ // Test data from commit a2e9a95d2190
+ {
+ .description = "2.1: exclude low 1M",
+ .exclude_start = 0, .exclude_end = (1 << 20) - 1,
+ .initial_max_ranges = 3,
+ .initial_ranges = (const struct range[]){
+ { .start = 0, .end = 0x3efff },
+ { .start = 0x3f000, .end = 0x3ffff },
+ { .start = 0x40000, .end = 0x9ffff }
+ },
+ .initial_nr_ranges = 3,
+ .expected_nr_ranges = 0,
+ .expected_ret = 0,
+ },
+ // Test data from https://lore.kernel.org/all/ZXrY7QbXAlxydsSC@MiWiFi-R3L-srv/T/#u
+ {
+ .description = "2.2: when range out of bound",
+ .exclude_start = 100, .exclude_end = 200,
+ .initial_max_ranges = 3,
+ .initial_ranges = (const struct range[]){
+ { .start = 1, .end = 299 },
+ { .start = 401, .end = 1000 },
+ { .start = 1001, .end = 2000 }
+ },
+ .initial_nr_ranges = 3,
+ .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content
+ .expected_nr_ranges = 3, // Should remain unchanged
+ .expected_ret = -ENOMEM
+ },
+
+};
+
+
+static void exclude_range_regression_test(struct kunit *test)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(exclude_range_regression_test_data); i++) {
+ kunit_log(KERN_INFO, test, "Running: %s", exclude_range_regression_test_data[i].description);
+ run_exclude_test_case(test, &exclude_range_regression_test_data[i]);
+ // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case
+ }
+}
+
+/*
+ * KUnit Test Suite
+ */
+static struct kunit_case crash_exclude_mem_range_test_cases[] = {
+ KUNIT_CASE(exclude_single_range_test),
+ KUNIT_CASE(exclude_range_regression_test),
+ {}
+};
+
+static struct kunit_suite crash_exclude_mem_range_suite = {
+ .name = "crash_exclude_mem_range_tests",
+ .test_cases = crash_exclude_mem_range_test_cases,
+ // .init and .exit can be NULL if not needed globally for the suite
+};
+
+kunit_test_suite(crash_exclude_mem_range_suite);
+
+MODULE_DESCRIPTION("crash dump KUnit test suite");
+MODULE_LICENSE("GPL");
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index b82399437db0..1e5c64cb6a42 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -38,8 +38,8 @@ enum {
dma_debug_single,
dma_debug_sg,
dma_debug_coherent,
- dma_debug_resource,
dma_debug_noncoherent,
+ dma_debug_phy,
};
enum map_err_types {
@@ -141,8 +141,8 @@ static const char *type2name[] = {
[dma_debug_single] = "single",
[dma_debug_sg] = "scatter-gather",
[dma_debug_coherent] = "coherent",
- [dma_debug_resource] = "resource",
[dma_debug_noncoherent] = "noncoherent",
+ [dma_debug_phy] = "phy",
};
static const char *dir2name[] = {
@@ -1054,17 +1054,16 @@ static void check_unmap(struct dma_debug_entry *ref)
dma_entry_free(entry);
}
-static void check_for_stack(struct device *dev,
- struct page *page, size_t offset)
+static void check_for_stack(struct device *dev, phys_addr_t phys)
{
void *addr;
struct vm_struct *stack_vm_area = task_stack_vm_area(current);
if (!stack_vm_area) {
/* Stack is direct-mapped. */
- if (PageHighMem(page))
+ if (PhysHighMem(phys))
return;
- addr = page_address(page) + offset;
+ addr = phys_to_virt(phys);
if (object_is_on_stack(addr))
err_printk(dev, NULL, "device driver maps memory from stack [addr=%p]\n", addr);
} else {
@@ -1072,10 +1071,12 @@ static void check_for_stack(struct device *dev,
int i;
for (i = 0; i < stack_vm_area->nr_pages; i++) {
- if (page != stack_vm_area->pages[i])
+ if (__phys_to_pfn(phys) !=
+ page_to_pfn(stack_vm_area->pages[i]))
continue;
- addr = (u8 *)current->stack + i * PAGE_SIZE + offset;
+ addr = (u8 *)current->stack + i * PAGE_SIZE +
+ (phys % PAGE_SIZE);
err_printk(dev, NULL, "device driver maps memory from stack [probable addr=%p]\n", addr);
break;
}
@@ -1204,9 +1205,8 @@ void debug_dma_map_single(struct device *dev, const void *addr,
}
EXPORT_SYMBOL(debug_dma_map_single);
-void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
- size_t size, int direction, dma_addr_t dma_addr,
- unsigned long attrs)
+void debug_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
+ int direction, dma_addr_t dma_addr, unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1221,19 +1221,18 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
return;
entry->dev = dev;
- entry->type = dma_debug_single;
- entry->paddr = page_to_phys(page) + offset;
+ entry->type = dma_debug_phy;
+ entry->paddr = phys;
entry->dev_addr = dma_addr;
entry->size = size;
entry->direction = direction;
entry->map_err_type = MAP_ERR_NOT_CHECKED;
- check_for_stack(dev, page, offset);
+ if (!(attrs & DMA_ATTR_MMIO)) {
+ check_for_stack(dev, phys);
- if (!PageHighMem(page)) {
- void *addr = page_address(page) + offset;
-
- check_for_illegal_area(dev, addr, size);
+ if (!PhysHighMem(phys))
+ check_for_illegal_area(dev, phys_to_virt(phys), size);
}
add_dma_entry(entry, attrs);
@@ -1277,11 +1276,11 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
}
EXPORT_SYMBOL(debug_dma_mapping_error);
-void debug_dma_unmap_page(struct device *dev, dma_addr_t dma_addr,
+void debug_dma_unmap_phys(struct device *dev, dma_addr_t dma_addr,
size_t size, int direction)
{
struct dma_debug_entry ref = {
- .type = dma_debug_single,
+ .type = dma_debug_phy,
.dev = dev,
.dev_addr = dma_addr,
.size = size,
@@ -1305,7 +1304,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
return;
for_each_sg(sg, s, nents, i) {
- check_for_stack(dev, sg_page(s), s->offset);
+ check_for_stack(dev, sg_phys(s));
if (!PageHighMem(sg_page(s)))
check_for_illegal_area(dev, sg_virt(s), s->length);
}
@@ -1445,47 +1444,6 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
check_unmap(&ref);
}
-void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
- int direction, dma_addr_t dma_addr,
- unsigned long attrs)
-{
- struct dma_debug_entry *entry;
-
- if (unlikely(dma_debug_disabled()))
- return;
-
- entry = dma_entry_alloc();
- if (!entry)
- return;
-
- entry->type = dma_debug_resource;
- entry->dev = dev;
- entry->paddr = addr;
- entry->size = size;
- entry->dev_addr = dma_addr;
- entry->direction = direction;
- entry->map_err_type = MAP_ERR_NOT_CHECKED;
-
- add_dma_entry(entry, attrs);
-}
-
-void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
- size_t size, int direction)
-{
- struct dma_debug_entry ref = {
- .type = dma_debug_resource,
- .dev = dev,
- .dev_addr = dma_addr,
- .size = size,
- .direction = direction,
- };
-
- if (unlikely(dma_debug_disabled()))
- return;
-
- check_unmap(&ref);
-}
-
void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
size_t size, int direction)
{
diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h
index 48757ca13f31..da7be0bddcf6 100644
--- a/kernel/dma/debug.h
+++ b/kernel/dma/debug.h
@@ -9,12 +9,11 @@
#define _KERNEL_DMA_DEBUG_H
#ifdef CONFIG_DMA_API_DEBUG
-extern void debug_dma_map_page(struct device *dev, struct page *page,
- size_t offset, size_t size,
- int direction, dma_addr_t dma_addr,
+extern void debug_dma_map_phys(struct device *dev, phys_addr_t phys,
+ size_t size, int direction, dma_addr_t dma_addr,
unsigned long attrs);
-extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
+extern void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr,
size_t size, int direction);
extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
@@ -31,14 +30,6 @@ extern void debug_dma_alloc_coherent(struct device *dev, size_t size,
extern void debug_dma_free_coherent(struct device *dev, size_t size,
void *virt, dma_addr_t addr);
-extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
- size_t size, int direction,
- dma_addr_t dma_addr,
- unsigned long attrs);
-
-extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
- size_t size, int direction);
-
extern void debug_dma_sync_single_for_cpu(struct device *dev,
dma_addr_t dma_handle, size_t size,
int direction);
@@ -62,14 +53,13 @@ extern void debug_dma_free_pages(struct device *dev, struct page *page,
size_t size, int direction,
dma_addr_t dma_addr);
#else /* CONFIG_DMA_API_DEBUG */
-static inline void debug_dma_map_page(struct device *dev, struct page *page,
- size_t offset, size_t size,
- int direction, dma_addr_t dma_addr,
- unsigned long attrs)
+static inline void debug_dma_map_phys(struct device *dev, phys_addr_t phys,
+ size_t size, int direction,
+ dma_addr_t dma_addr, unsigned long attrs)
{
}
-static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
+static inline void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr,
size_t size, int direction)
{
}
@@ -97,19 +87,6 @@ static inline void debug_dma_free_coherent(struct device *dev, size_t size,
{
}
-static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
- size_t size, int direction,
- dma_addr_t dma_addr,
- unsigned long attrs)
-{
-}
-
-static inline void debug_dma_unmap_resource(struct device *dev,
- dma_addr_t dma_addr, size_t size,
- int direction)
-{
-}
-
static inline void debug_dma_sync_single_for_cpu(struct device *dev,
dma_addr_t dma_handle,
size_t size, int direction)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 24c359d9c879..1f9ee9759426 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -120,7 +120,7 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp_t gfp, bool allow_highmem)
{
int node = dev_to_node(dev);
- struct page *page = NULL;
+ struct page *page;
u64 phys_limit;
WARN_ON_ONCE(!PAGE_ALIGNED(size));
@@ -131,30 +131,25 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
page = dma_alloc_contiguous(dev, size, gfp);
if (page) {
- if (!dma_coherent_ok(dev, page_to_phys(page), size) ||
- (!allow_highmem && PageHighMem(page))) {
- dma_free_contiguous(dev, page, size);
- page = NULL;
- }
+ if (dma_coherent_ok(dev, page_to_phys(page), size) &&
+ (allow_highmem || !PageHighMem(page)))
+ return page;
+
+ dma_free_contiguous(dev, page, size);
}
-again:
- if (!page)
- page = alloc_pages_node(node, gfp, get_order(size));
- if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+
+ while ((page = alloc_pages_node(node, gfp, get_order(size)))
+ && !dma_coherent_ok(dev, page_to_phys(page), size)) {
__free_pages(page, get_order(size));
- page = NULL;
if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
phys_limit < DMA_BIT_MASK(64) &&
- !(gfp & (GFP_DMA32 | GFP_DMA))) {
+ !(gfp & (GFP_DMA32 | GFP_DMA)))
gfp |= GFP_DMA32;
- goto again;
- }
-
- if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) {
+ else if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA))
gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
- goto again;
- }
+ else
+ return NULL;
}
return page;
@@ -453,7 +448,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
if (sg_dma_is_bus_address(sg))
sg_dma_unmark_bus_address(sg);
else
- dma_direct_unmap_page(dev, sg->dma_address,
+ dma_direct_unmap_phys(dev, sg->dma_address,
sg_dma_len(sg), dir, attrs);
}
}
@@ -476,8 +471,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
*/
break;
case PCI_P2PDMA_MAP_NONE:
- sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
- sg->offset, sg->length, dir, attrs);
+ sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
+ sg->length, dir, attrs);
if (sg->dma_address == DMA_MAPPING_ERROR) {
ret = -EIO;
goto out_unmap;
@@ -502,22 +497,6 @@ out_unmap:
return ret;
}
-dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
- size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
- dma_addr_t dma_addr = paddr;
-
- if (unlikely(!dma_capable(dev, dma_addr, size, false))) {
- dev_err_once(dev,
- "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
- &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
- WARN_ON_ONCE(1);
- return DMA_MAPPING_ERROR;
- }
-
- return dma_addr;
-}
-
int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index d2c0b7e632fc..da2fadf45bcd 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -80,42 +80,57 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
arch_dma_mark_clean(paddr, size);
}
-static inline dma_addr_t dma_direct_map_page(struct device *dev,
- struct page *page, unsigned long offset, size_t size,
- enum dma_data_direction dir, unsigned long attrs)
+static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
{
- phys_addr_t phys = page_to_phys(page) + offset;
- dma_addr_t dma_addr = phys_to_dma(dev, phys);
+ dma_addr_t dma_addr;
if (is_swiotlb_force_bounce(dev)) {
- if (is_pci_p2pdma_page(page))
- return DMA_MAPPING_ERROR;
+ if (attrs & DMA_ATTR_MMIO)
+ goto err_overflow;
+
return swiotlb_map(dev, phys, size, dir, attrs);
}
- if (unlikely(!dma_capable(dev, dma_addr, size, true)) ||
- dma_kmalloc_needs_bounce(dev, size, dir)) {
- if (is_pci_p2pdma_page(page))
- return DMA_MAPPING_ERROR;
- if (is_swiotlb_active(dev))
- return swiotlb_map(dev, phys, size, dir, attrs);
-
- dev_WARN_ONCE(dev, 1,
- "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
- &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
- return DMA_MAPPING_ERROR;
+ if (attrs & DMA_ATTR_MMIO) {
+ dma_addr = phys;
+ if (unlikely(!dma_capable(dev, dma_addr, size, false)))
+ goto err_overflow;
+ } else {
+ dma_addr = phys_to_dma(dev, phys);
+ if (unlikely(!dma_capable(dev, dma_addr, size, true)) ||
+ dma_kmalloc_needs_bounce(dev, size, dir)) {
+ if (is_swiotlb_active(dev))
+ return swiotlb_map(dev, phys, size, dir, attrs);
+
+ goto err_overflow;
+ }
}
- if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ if (!dev_is_dma_coherent(dev) &&
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
arch_sync_dma_for_device(phys, size, dir);
return dma_addr;
+
+err_overflow:
+ dev_WARN_ONCE(
+ dev, 1,
+ "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
+ &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
+ return DMA_MAPPING_ERROR;
}
-static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
+static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
- phys_addr_t phys = dma_to_phys(dev, addr);
+ phys_addr_t phys;
+
+ if (attrs & DMA_ATTR_MMIO)
+ /* nothing to do: uncached and no swiotlb */
+ return;
+ phys = dma_to_phys(dev, addr);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 56de28a3b179..fe7472f13b10 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -152,11 +152,11 @@ static inline bool dma_map_direct(struct device *dev,
return dma_go_direct(dev, *dev->dma_mask, ops);
}
-dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
- size_t offset, size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
+ bool is_mmio = attrs & DMA_ATTR_MMIO;
dma_addr_t addr;
BUG_ON(!valid_dma_direction(dir));
@@ -165,36 +165,81 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
return DMA_MAPPING_ERROR;
if (dma_map_direct(dev, ops) ||
- arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
- addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
+ (!is_mmio && arch_dma_map_phys_direct(dev, phys + size)))
+ addr = dma_direct_map_phys(dev, phys, size, dir, attrs);
else if (use_dma_iommu(dev))
- addr = iommu_dma_map_page(dev, page, offset, size, dir, attrs);
- else
+ addr = iommu_dma_map_phys(dev, phys, size, dir, attrs);
+ else if (is_mmio) {
+ if (!ops->map_resource)
+ return DMA_MAPPING_ERROR;
+
+ addr = ops->map_resource(dev, phys, size, dir, attrs);
+ } else {
+ struct page *page = phys_to_page(phys);
+ size_t offset = offset_in_page(phys);
+
+ /*
+ * The dma_ops API contract for ops->map_page() requires
+ * kmappable memory, while ops->map_resource() does not.
+ */
addr = ops->map_page(dev, page, offset, size, dir, attrs);
- kmsan_handle_dma(page, offset, size, dir);
- trace_dma_map_page(dev, page_to_phys(page) + offset, addr, size, dir,
- attrs);
- debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
+ }
+
+ if (!is_mmio)
+ kmsan_handle_dma(phys, size, dir);
+ trace_dma_map_phys(dev, phys, addr, size, dir, attrs);
+ debug_dma_map_phys(dev, phys, size, dir, addr, attrs);
return addr;
}
+EXPORT_SYMBOL_GPL(dma_map_phys);
+
+dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
+ size_t offset, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ phys_addr_t phys = page_to_phys(page) + offset;
+
+ if (unlikely(attrs & DMA_ATTR_MMIO))
+ return DMA_MAPPING_ERROR;
+
+ if (IS_ENABLED(CONFIG_DMA_API_DEBUG) &&
+ WARN_ON_ONCE(is_zone_device_page(page)))
+ return DMA_MAPPING_ERROR;
+
+ return dma_map_phys(dev, phys, size, dir, attrs);
+}
EXPORT_SYMBOL(dma_map_page_attrs);
-void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
+void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size,
enum dma_data_direction dir, unsigned long attrs)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
+ bool is_mmio = attrs & DMA_ATTR_MMIO;
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops) ||
- arch_dma_unmap_page_direct(dev, addr + size))
- dma_direct_unmap_page(dev, addr, size, dir, attrs);
+ (!is_mmio && arch_dma_unmap_phys_direct(dev, addr + size)))
+ dma_direct_unmap_phys(dev, addr, size, dir, attrs);
else if (use_dma_iommu(dev))
- iommu_dma_unmap_page(dev, addr, size, dir, attrs);
- else
+ iommu_dma_unmap_phys(dev, addr, size, dir, attrs);
+ else if (is_mmio) {
+ if (ops->unmap_resource)
+ ops->unmap_resource(dev, addr, size, dir, attrs);
+ } else
ops->unmap_page(dev, addr, size, dir, attrs);
- trace_dma_unmap_page(dev, addr, size, dir, attrs);
- debug_dma_unmap_page(dev, addr, size, dir);
+ trace_dma_unmap_phys(dev, addr, size, dir, attrs);
+ debug_dma_unmap_phys(dev, addr, size, dir);
+}
+EXPORT_SYMBOL_GPL(dma_unmap_phys);
+
+void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ if (unlikely(attrs & DMA_ATTR_MMIO))
+ return;
+
+ dma_unmap_phys(dev, addr, size, dir, attrs);
}
EXPORT_SYMBOL(dma_unmap_page_attrs);
@@ -321,41 +366,18 @@ EXPORT_SYMBOL(dma_unmap_sg_attrs);
dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
- dma_addr_t addr = DMA_MAPPING_ERROR;
-
- BUG_ON(!valid_dma_direction(dir));
-
- if (WARN_ON_ONCE(!dev->dma_mask))
+ if (IS_ENABLED(CONFIG_DMA_API_DEBUG) &&
+ WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
return DMA_MAPPING_ERROR;
- if (dma_map_direct(dev, ops))
- addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
- else if (use_dma_iommu(dev))
- addr = iommu_dma_map_resource(dev, phys_addr, size, dir, attrs);
- else if (ops->map_resource)
- addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
-
- trace_dma_map_resource(dev, phys_addr, addr, size, dir, attrs);
- debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs);
- return addr;
+ return dma_map_phys(dev, phys_addr, size, dir, attrs | DMA_ATTR_MMIO);
}
EXPORT_SYMBOL(dma_map_resource);
void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
enum dma_data_direction dir, unsigned long attrs)
{
- const struct dma_map_ops *ops = get_dma_ops(dev);
-
- BUG_ON(!valid_dma_direction(dir));
- if (dma_map_direct(dev, ops))
- ; /* nothing to do: uncached and no swiotlb */
- else if (use_dma_iommu(dev))
- iommu_dma_unmap_resource(dev, addr, size, dir, attrs);
- else if (ops->unmap_resource)
- ops->unmap_resource(dev, addr, size, dir, attrs);
- trace_dma_unmap_resource(dev, addr, size, dir, attrs);
- debug_dma_unmap_resource(dev, addr, size, dir);
+ dma_unmap_phys(dev, addr, size, dir, attrs | DMA_ATTR_MMIO);
}
EXPORT_SYMBOL(dma_unmap_resource);
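
With the change above, dma_map_resource()/dma_unmap_resource() become thin
wrappers that tag the mapping with DMA_ATTR_MMIO and defer to
dma_map_phys()/dma_unmap_phys(). A hedged caller-side sketch (bar_phys and len
are illustrative):

	dma_addr_t handle;

	handle = dma_map_resource(dev, bar_phys, len, DMA_TO_DEVICE, 0);
	if (dma_mapping_error(dev, handle))
		return -EIO;
	/* ...hand "handle" to the peer device... */
	dma_unmap_resource(dev, handle, len, DMA_TO_DEVICE, 0);

	/* equivalently, after this patch, a caller that tracks the
	 * MMIO-ness itself could use the new entry point directly: */
	handle = dma_map_phys(dev, bar_phys, len, DMA_TO_DEVICE, DMA_ATTR_MMIO);
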
diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c
index 9afd569eadb9..6f9d604d9d40 100644
--- a/kernel/dma/ops_helpers.c
+++ b/kernel/dma/ops_helpers.c
@@ -72,8 +72,8 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size,
return NULL;
if (use_dma_iommu(dev))
- *dma_handle = iommu_dma_map_page(dev, page, 0, size, dir,
- DMA_ATTR_SKIP_CPU_SYNC);
+ *dma_handle = iommu_dma_map_phys(dev, page_to_phys(page), size,
+ dir, DMA_ATTR_SKIP_CPU_SYNC);
else
*dma_handle = ops->map_page(dev, page, 0, size, dir,
DMA_ATTR_SKIP_CPU_SYNC);
@@ -92,7 +92,7 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page,
const struct dma_map_ops *ops = get_dma_ops(dev);
if (use_dma_iommu(dev))
- iommu_dma_unmap_page(dev, dma_handle, size, dir,
+ iommu_dma_unmap_phys(dev, dma_handle, size, dir,
DMA_ATTR_SKIP_CPU_SYNC);
else if (ops->unmap_page)
ops->unmap_page(dev, dma_handle, size, dir,
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index abcf3fa63a56..0d37da3d95b6 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1209,7 +1209,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
nslabs = nr_slots(alloc_size);
phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit);
pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit,
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
if (!pool)
return -1;
diff --git a/kernel/fork.c b/kernel/fork.c
index f1688b3e79a6..3da0f08615a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2132,9 +2132,7 @@ __latent_entropy struct task_struct *copy_process(
p->pagefault_disabled = 0;
-#ifdef CONFIG_LOCKDEP
lockdep_init_task(p);
-#endif
p->blocked_on = NULL; /* not blocked yet */
@@ -2547,11 +2545,9 @@ struct task_struct * __init fork_idle(int cpu)
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
- CLONE_IO;
+ CLONE_IO|CLONE_VM|CLONE_UNTRACED;
struct kernel_clone_args args = {
- .flags = ((lower_32_bits(flags) | CLONE_VM |
- CLONE_UNTRACED) & ~CSIGNAL),
- .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .flags = flags,
.fn = fn,
.fn_arg = arg,
.io_thread = 1,
@@ -2663,9 +2659,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
unsigned long flags)
{
struct kernel_clone_args args = {
- .flags = ((lower_32_bits(flags) | CLONE_VM |
- CLONE_UNTRACED) & ~CSIGNAL),
- .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
.fn = fn,
.fn_arg = arg,
.name = name,
@@ -2681,9 +2676,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
struct kernel_clone_args args = {
- .flags = ((lower_32_bits(flags) | CLONE_VM |
- CLONE_UNTRACED) & ~CSIGNAL),
- .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
.fn = fn,
.fn_arg = arg,
};
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 8708a1205f82..b2c1f14b8129 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -95,9 +95,41 @@ static struct notifier_block panic_block = {
.notifier_call = hung_task_panic,
};
+static bool task_is_hung(struct task_struct *t, unsigned long timeout)
+{
+ unsigned long switch_count = t->nvcsw + t->nivcsw;
+ unsigned int state = READ_ONCE(t->__state);
+
+ /*
+ * skip the TASK_KILLABLE tasks -- these can be killed
+ * skip the TASK_IDLE tasks -- those are genuinely idle
+	 * skip the TASK_FROZEN tasks -- they are legitimately stopped from scheduling by the freezer
+ */
+ if (!(state & TASK_UNINTERRUPTIBLE) ||
+ (state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN)))
+ return false;
+
+ /*
+	 * When a freshly created task is scheduled once and changes its state
+	 * to TASK_UNINTERRUPTIBLE without ever having been switched out, it
+	 * mustn't be checked.
+ */
+ if (unlikely(!switch_count))
+ return false;
+
+ if (switch_count != t->last_switch_count) {
+ t->last_switch_count = switch_count;
+ t->last_switch_time = jiffies;
+ return false;
+ }
+ if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
+ return false;
+
+ return true;
+}
#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
-static void debug_show_blocker(struct task_struct *task)
+static void debug_show_blocker(struct task_struct *task, unsigned long timeout)
{
struct task_struct *g, *t;
unsigned long owner, blocker, blocker_type;
@@ -174,41 +206,21 @@ static void debug_show_blocker(struct task_struct *task)
t->pid, rwsem_blocked_by);
break;
}
- sched_show_task(t);
+		/* Avoid a duplicated task dump: skip if the task is also hung. */
+ if (!task_is_hung(t, timeout))
+ sched_show_task(t);
return;
}
}
#else
-static inline void debug_show_blocker(struct task_struct *task)
+static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout)
{
}
#endif
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
- unsigned long switch_count = t->nvcsw + t->nivcsw;
-
- /*
- * Ensure the task is not frozen.
- * Also, skip vfork and any other user process that freezer should skip.
- */
- if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN))
- return;
-
- /*
- * When a freshly created task is scheduled once, changes its state to
- * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
- * musn't be checked.
- */
- if (unlikely(!switch_count))
- return;
-
- if (switch_count != t->last_switch_count) {
- t->last_switch_count = switch_count;
- t->last_switch_time = jiffies;
- return;
- }
- if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
+ if (!task_is_hung(t, timeout))
return;
/*
@@ -243,7 +255,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
- debug_show_blocker(t);
+ debug_show_blocker(t, timeout);
hung_task_show_lock = true;
if (sysctl_hung_task_all_cpu_backtrace)
@@ -299,7 +311,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
hung_task_show_lock = false;
rcu_read_lock();
for_each_process_thread(g, t) {
- unsigned int state;
if (!max_count--)
goto unlock;
@@ -308,15 +319,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
goto unlock;
last_break = jiffies;
}
- /*
- * skip the TASK_KILLABLE tasks -- these can be killed
- * skip the TASK_IDLE tasks -- those are genuinely idle
- */
- state = READ_ONCE(t->__state);
- if ((state & TASK_UNINTERRUPTIBLE) &&
- !(state & TASK_WAKEKILL) &&
- !(state & TASK_NOLOAD))
- check_hung_task(t, timeout);
+
+ check_hung_task(t, timeout);
}
unlock:
rcu_read_unlock();
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index cf4af5728307..2b082a7e24a2 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -264,7 +264,7 @@ static int test_kallsyms_basic_function(void)
char namebuf[KSYM_NAME_LEN];
struct test_stat *stat, *stat2;
- stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL);
+ stat = kmalloc_array(2, sizeof(*stat), GFP_KERNEL);
if (!stat)
return -ENOMEM;
stat2 = stat + 1;
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 1d85597057e1..6563141f5de9 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -978,6 +978,15 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area,
memcpy(dst_entries, src_entries, bytes_to_move);
entries_moved = bytes_to_move >> entry_size_log;
+ /*
+ * A write memory barrier is required here, to ensure
+ * that the writes from the memcpy() are visible before
+ * the count is updated. Without this, it is possible for
+ * a user to observe a new count value but stale
+ * coverage data.
+ */
+ smp_wmb();
+
switch (mode) {
case KCOV_MODE_TRACE_PC:
WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved);
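
The smp_wmb() added above needs a pairing read barrier in whatever consumes the
coverage area. A generic sketch of the pairing, not taken from the patch
(count_ptr and read_entries() are illustrative):

	/* writer, as in kcov_move_area(): publish data, then count */
	memcpy(dst_entries, src_entries, bytes_to_move);
	smp_wmb();
	WRITE_ONCE(*count_ptr, new_count);

	/* reader: load count, then data */
	n = READ_ONCE(*count_ptr);
	smp_rmb();	/* pairs with the smp_wmb() above */
	read_entries(entries, n);
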
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 31203f0bacaf..fa00b239c5d9 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -233,7 +233,6 @@ struct kimage *do_kimage_alloc_init(void)
if (!image)
return NULL;
- image->head = 0;
image->entry = &image->head;
image->last_entry = &image->head;
image->control_page = ~0; /* By default this does not apply */
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 91d46502a817..eb62a9794242 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -255,6 +255,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
}
image->no_cma = !!(flags & KEXEC_FILE_NO_CMA);
+ image->force_dtb = flags & KEXEC_FILE_FORCE_DTB;
if (cmdline_len) {
image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index 555488eb1a18..5083c68c3a4e 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -988,6 +988,26 @@ static const void *kho_get_fdt(void)
}
/**
+ * is_kho_boot - check if current kernel was booted via KHO-enabled
+ * kexec
+ *
+ * This function checks if the current kernel was loaded through a kexec
+ * operation with KHO enabled, by verifying that a valid KHO FDT
+ * was passed.
+ *
+ * Note: This function returns reliable results only after
+ * kho_populate() has been called during early boot. Before that,
+ * it may return false even if KHO data is present.
+ *
+ * Return: true if booted via KHO-enabled kexec, false otherwise
+ */
+bool is_kho_boot(void)
+{
+ return !!kho_get_fdt();
+}
+EXPORT_SYMBOL_GPL(is_kho_boot);
+
+/**
* kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
* @name: the name of the sub FDT passed to kho_add_subtree().
* @phys: if found, the physical address of the sub FDT is stored in @phys.
@@ -1269,7 +1289,7 @@ int kho_fill_kimage(struct kimage *image)
int err = 0;
struct kexec_buf scratch;
- if (!kho_enable)
+ if (!kho_out.finalized)
return 0;
image->kho.fdt = page_to_phys(kho_out.ser.fdt);
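
A hedged sketch of how a consumer might combine the new is_kho_boot() with the
kho_retrieve_subtree() documented above ("mydata" is a made-up sub FDT name,
and the zero-on-success return convention is assumed):

	phys_addr_t fdt_phys;

	if (!is_kho_boot())
		return 0;	/* cold boot: nothing was handed over */

	if (kho_retrieve_subtree("mydata", &fdt_phys))
		return -ENOENT;
	/* ...map fdt_phys and restore the preserved state... */
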
diff --git a/kernel/panic.c b/kernel/panic.c
index 72fcbb5a071b..24cc3eec1805 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -53,7 +53,7 @@ static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace;
#define sysctl_oops_all_cpu_backtrace 0
#endif /* CONFIG_SMP */
-int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
+int panic_on_oops = IS_ENABLED(CONFIG_PANIC_ON_OOPS);
static unsigned long tainted_mask =
IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
static int pause_on_oops;
@@ -67,6 +67,7 @@ static unsigned int warn_limit __read_mostly;
static bool panic_console_replay;
bool panic_triggering_all_cpu_backtrace;
+static bool panic_this_cpu_backtrace_printed;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -77,6 +78,11 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
+static void panic_print_deprecated(void)
+{
+ pr_info_once("Kernel: The 'panic_print' parameter is now deprecated. Please use 'panic_sys_info' and 'panic_console_replay' instead.\n");
+}
+
#ifdef CONFIG_SYSCTL
/*
@@ -125,7 +131,7 @@ static int proc_taint(const struct ctl_table *table, int write,
static int sysctl_panic_print_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n");
+ panic_print_deprecated();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
@@ -294,6 +300,59 @@ void __weak crash_smp_send_stop(void)
atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
+bool panic_try_start(void)
+{
+ int old_cpu, this_cpu;
+
+ /*
+ * Only one CPU is allowed to execute the crash_kexec() code as with
+ * panic(). Otherwise parallel calls of panic() and crash_kexec()
+ * may stop each other. To exclude them, we use panic_cpu here too.
+ */
+ old_cpu = PANIC_CPU_INVALID;
+ this_cpu = raw_smp_processor_id();
+
+ return atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu);
+}
+EXPORT_SYMBOL(panic_try_start);
+
+void panic_reset(void)
+{
+ atomic_set(&panic_cpu, PANIC_CPU_INVALID);
+}
+EXPORT_SYMBOL(panic_reset);
+
+bool panic_in_progress(void)
+{
+ return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
+}
+EXPORT_SYMBOL(panic_in_progress);
+
+/* Return true if a panic is in progress on the current CPU. */
+bool panic_on_this_cpu(void)
+{
+ /*
+ * We can use raw_smp_processor_id() here because it is impossible for
+ * the task to be migrated to the panic_cpu, or away from it. If
+ * panic_cpu has already been set, and we're not currently executing on
+ * that CPU, then we never will be.
+ */
+ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id());
+}
+EXPORT_SYMBOL(panic_on_this_cpu);
+
+/*
+ * Return true if a panic is in progress on a remote CPU.
+ *
+ * On true, the local CPU should immediately release any printing resources
+ * that may be needed by the panic CPU.
+ */
+bool panic_on_other_cpu(void)
+{
+ return (panic_in_progress() && !panic_on_this_cpu());
+}
+EXPORT_SYMBOL(panic_on_other_cpu);
+
/*
* A variant of panic() called from NMI context. We return if we've already
* panicked on this CPU. If another CPU already panicked, loop in
@@ -302,15 +361,9 @@ atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
*/
void nmi_panic(struct pt_regs *regs, const char *msg)
{
- int old_cpu, this_cpu;
-
- old_cpu = PANIC_CPU_INVALID;
- this_cpu = raw_smp_processor_id();
-
- /* atomic_try_cmpxchg updates old_cpu on failure */
- if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu))
+ if (panic_try_start())
panic("%s", msg);
- else if (old_cpu != this_cpu)
+ else if (panic_on_other_cpu())
nmi_panic_self_stop(regs);
}
EXPORT_SYMBOL(nmi_panic);
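
The exported helpers give panic-path code a common idiom; a sketch mirroring
the crash_kexec() conversion earlier in this diff (do_one_shot_panic_work() is
illustrative):

	if (panic_try_start()) {
		/* first CPU to get here owns the panic */
		do_one_shot_panic_work();
		/* reset to allow another panic()/crash_kexec() call */
		panic_reset();
	} else if (panic_on_other_cpu()) {
		/* a remote CPU owns the panic: back off */
	}
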
@@ -328,6 +381,19 @@ void check_panic_on_warn(const char *origin)
origin, limit);
}
+static void panic_trigger_all_cpu_backtrace(void)
+{
+	/* Temporarily allow non-panic CPUs to write their backtraces. */
+ panic_triggering_all_cpu_backtrace = true;
+
+ if (panic_this_cpu_backtrace_printed)
+ trigger_allbutcpu_cpu_backtrace(raw_smp_processor_id());
+ else
+ trigger_all_cpu_backtrace();
+
+ panic_triggering_all_cpu_backtrace = false;
+}
+
/*
* Helper that triggers the NMI backtrace (if set in panic_print)
* and then performs the secondary CPUs shutdown - we cannot have
@@ -335,12 +401,8 @@ void check_panic_on_warn(const char *origin)
*/
static void panic_other_cpus_shutdown(bool crash_kexec)
{
- if (panic_print & SYS_INFO_ALL_CPU_BT) {
- /* Temporary allow non-panic CPUs to write their backtraces. */
- panic_triggering_all_cpu_backtrace = true;
- trigger_all_cpu_backtrace();
- panic_triggering_all_cpu_backtrace = false;
- }
+ if (panic_print & SYS_INFO_ALL_CPU_BT)
+ panic_trigger_all_cpu_backtrace();
/*
* Note that smp_send_stop() is the usual SMP shutdown function,
@@ -368,7 +430,6 @@ void vpanic(const char *fmt, va_list args)
static char buf[1024];
long i, i_next = 0, len;
int state = 0;
- int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
if (panic_on_warn) {
@@ -405,13 +466,10 @@ void vpanic(const char *fmt, va_list args)
* `old_cpu == this_cpu' means we came from nmi_panic() which sets
* panic_cpu to this CPU. In this case, this is also the 1st CPU.
*/
- old_cpu = PANIC_CPU_INVALID;
- this_cpu = raw_smp_processor_id();
-
-	/* atomic_try_cmpxchg updates old_cpu on failure */
- if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
+ if (panic_try_start()) {
/* go ahead */
- } else if (old_cpu != this_cpu)
+ } else if (panic_on_other_cpu())
panic_smp_self_stop();
console_verbose();
@@ -422,13 +480,15 @@ void vpanic(const char *fmt, va_list args)
buf[len - 1] = '\0';
pr_emerg("Kernel panic - not syncing: %s\n", buf);
-#ifdef CONFIG_DEBUG_BUGVERBOSE
/*
* Avoid nested stack-dumping if a panic occurs during oops processing
*/
- if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
+ if (test_taint(TAINT_DIE) || oops_in_progress > 1) {
+ panic_this_cpu_backtrace_printed = true;
+ } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) {
dump_stack();
-#endif
+ panic_this_cpu_backtrace_printed = true;
+ }
/*
* If kgdb is enabled, give it a chance to run before we stop all
@@ -937,12 +997,29 @@ EXPORT_SYMBOL(__stack_chk_fail);
#endif
core_param(panic, panic_timeout, int, 0644);
-core_param(panic_print, panic_print, ulong, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
core_param(panic_on_warn, panic_on_warn, int, 0644);
core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
core_param(panic_console_replay, panic_console_replay, bool, 0644);
+static int panic_print_set(const char *val, const struct kernel_param *kp)
+{
+ panic_print_deprecated();
+ return param_set_ulong(val, kp);
+}
+
+static int panic_print_get(char *val, const struct kernel_param *kp)
+{
+ panic_print_deprecated();
+ return param_get_ulong(val, kp);
+}
+
+static const struct kernel_param_ops panic_print_ops = {
+ .set = panic_print_set,
+ .get = panic_print_get,
+};
+__core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644);
+
static int __init oops_setup(char *s)
{
if (!s)
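
The election that panic_try_start() wraps can be exercised outside the
kernel. A minimal C11 sketch, assuming atomic_compare_exchange_strong()
in place of the kernel's atomic_try_cmpxchg() and explicit cpu arguments
in place of raw_smp_processor_id():

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define PANIC_CPU_INVALID (-1)

    static _Atomic int panic_cpu = PANIC_CPU_INVALID;

    /* Mirrors panic_try_start(): only the first caller wins. */
    static bool try_start(int this_cpu)
    {
            int old_cpu = PANIC_CPU_INVALID;

            /* Like atomic_try_cmpxchg(), old_cpu is updated on failure. */
            return atomic_compare_exchange_strong(&panic_cpu, &old_cpu, this_cpu);
    }

    static bool in_progress(void)
    {
            return atomic_load(&panic_cpu) != PANIC_CPU_INVALID;
    }

    static bool on_this_cpu(int this_cpu)
    {
            return atomic_load(&panic_cpu) == this_cpu;
    }

    static bool on_other_cpu(int this_cpu)
    {
            return in_progress() && !on_this_cpu(this_cpu);
    }

    int main(void)
    {
            printf("cpu0 wins: %d\n", try_start(0));        /* 1: first caller */
            printf("cpu1 wins: %d\n", try_start(1));        /* 0: already claimed */
            printf("cpu1 sees remote panic: %d\n", on_other_cpu(1)); /* 1 */
            printf("cpu0 re-entry: %d\n", on_this_cpu(0));  /* 1: nmi_panic() case */
            return 0;
    }

The losing-but-matching case is the nmi_panic() path above: the cmpxchg
fails, yet panic_on_this_cpu() holds, so vpanic() proceeds on the same
CPU instead of self-stopping.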
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index ef282001f200..f72bbfa266d6 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -332,7 +332,6 @@ struct printk_message {
unsigned long dropped;
};
-bool other_cpu_in_panic(void);
bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
bool is_extended, bool may_supress);
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index 646801813415..558ef3177976 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -12,6 +12,7 @@
#include <linux/irqflags.h>
#include <linux/kthread.h>
#include <linux/minmax.h>
+#include <linux/panic.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/slab.h>
@@ -254,7 +255,7 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
* opportunity to perform any necessary cleanup if they were
* interrupted by the panic CPU while printing.
*/
- if (other_cpu_in_panic() &&
+ if (panic_on_other_cpu() &&
(!is_reacquire || cur->unsafe_takeover)) {
return -EPERM;
}
@@ -309,7 +310,7 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
* Event #2 implies the new context is PANIC.
* Event #3 occurs when panic() has flushed the console.
* Event #4 occurs when a non-panic CPU reacquires.
- * Event #5 is not possible due to the other_cpu_in_panic() check
+ * Event #5 is not possible due to the panic_on_other_cpu() check
* in nbcon_context_try_acquire_handover().
*/
@@ -348,7 +349,7 @@ static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt,
struct nbcon_state new;
/* Note that the caller must still remove the request! */
- if (other_cpu_in_panic())
+ if (panic_on_other_cpu())
return -EPERM;
/*
@@ -446,7 +447,7 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
* nbcon_waiter_matches(). In particular, the assumption that
* lower priorities are ignored during panic.
*/
- if (other_cpu_in_panic())
+ if (panic_on_other_cpu())
return -EPERM;
/* Handover is not possible on the same CPU. */
@@ -589,7 +590,6 @@ static struct printk_buffers panic_nbcon_pbufs;
*/
static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire)
{
- unsigned int cpu = smp_processor_id();
struct console *con = ctxt->console;
struct nbcon_state cur;
int err;
@@ -614,7 +614,7 @@ out:
/* Acquire succeeded. */
/* Assign the appropriate buffer for this context. */
- if (atomic_read(&panic_cpu) == cpu)
+ if (panic_on_this_cpu())
ctxt->pbufs = &panic_nbcon_pbufs;
else
ctxt->pbufs = con->pbufs;
@@ -1394,7 +1394,7 @@ enum nbcon_prio nbcon_get_default_prio(void)
{
unsigned int *cpu_emergency_nesting;
- if (this_cpu_in_panic())
+ if (panic_on_this_cpu())
return NBCON_PRIO_PANIC;
cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
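
The buffer assignment above gives the panic CPU a dedicated static
buffer. A userspace model of that decision, with simplified stand-in
types rather than the kernel's real struct layouts:

    #include <stdio.h>

    struct printk_buffers { char outbuf[128]; };
    struct console { struct printk_buffers *pbufs; };
    struct nbcon_context {
            struct console *console;
            struct printk_buffers *pbufs;
    };

    /* One static buffer reserved for the panic CPU (panic_nbcon_pbufs). */
    static struct printk_buffers panic_pbufs;

    static void assign_buffers(struct nbcon_context *ctxt, int on_panic_cpu)
    {
            /*
             * The panic CPU may have interrupted another context
             * mid-print, so the per-console buffer could be in an
             * inconsistent state; write to the static buffer instead.
             */
            ctxt->pbufs = on_panic_cpu ? &panic_pbufs : ctxt->console->pbufs;
    }

    int main(void)
    {
            struct printk_buffers conbuf;
            struct console con = { .pbufs = &conbuf };
            struct nbcon_context ctxt = { .console = &con, .pbufs = NULL };

            assign_buffers(&ctxt, 1);
            printf("panic cpu uses static buffer: %d\n",
                   ctxt.pbufs == &panic_pbufs);
            return 0;
    }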
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 0efbcdda9aab..5aee9ffb16b9 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -48,6 +48,7 @@
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
+#include <linux/panic.h>
#include <linux/uaccess.h>
#include <asm/sections.h>
@@ -345,34 +346,6 @@ static void __up_console_sem(unsigned long ip)
}
#define up_console_sem() __up_console_sem(_RET_IP_)
-static bool panic_in_progress(void)
-{
- return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
-}
-
-/* Return true if a panic is in progress on the current CPU. */
-bool this_cpu_in_panic(void)
-{
- /*
- * We can use raw_smp_processor_id() here because it is impossible for
- * the task to be migrated to the panic_cpu, or away from it. If
- * panic_cpu has already been set, and we're not currently executing on
- * that CPU, then we never will be.
- */
- return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id());
-}
-
-/*
- * Return true if a panic is in progress on a remote CPU.
- *
- * On true, the local CPU should immediately release any printing resources
- * that may be needed by the panic CPU.
- */
-bool other_cpu_in_panic(void)
-{
- return (panic_in_progress() && !this_cpu_in_panic());
-}
-
/*
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
@@ -2407,7 +2380,7 @@ asmlinkage int vprintk_emit(int facility, int level,
* non-panic CPUs are generating any messages, they will be
* silently dropped.
*/
- if (other_cpu_in_panic() &&
+ if (panic_on_other_cpu() &&
!debug_non_panic_cpus &&
!panic_triggering_all_cpu_backtrace)
return 0;
@@ -2843,7 +2816,7 @@ void console_lock(void)
might_sleep();
/* On panic, the console_lock must be left to the panic cpu. */
- while (other_cpu_in_panic())
+ while (panic_on_other_cpu())
msleep(1000);
down_console_sem();
@@ -2863,7 +2836,7 @@ EXPORT_SYMBOL(console_lock);
int console_trylock(void)
{
/* On panic, the console_lock must be left to the panic cpu. */
- if (other_cpu_in_panic())
+ if (panic_on_other_cpu())
return 0;
if (down_trylock_console_sem())
return 0;
@@ -3243,7 +3216,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
any_progress = true;
/* Allow panic_cpu to take over the consoles safely. */
- if (other_cpu_in_panic())
+ if (panic_on_other_cpu())
goto abandon;
if (do_cond_resched)
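
The "leave the console lock to the panic cpu" rule reduces to a guarded
trylock. A hedged sketch with a pthread mutex standing in for the
console semaphore and an explicit cpu argument standing in for the
running CPU:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t console_sem = PTHREAD_MUTEX_INITIALIZER;
    static _Atomic int panic_cpu = -1;

    static bool on_other_cpu(int this_cpu)
    {
            int p = atomic_load(&panic_cpu);

            return p != -1 && p != this_cpu;
    }

    /* Mirrors console_trylock(): never contend with a remote panic CPU. */
    static int try_console_lock(int this_cpu)
    {
            if (on_other_cpu(this_cpu))
                    return 0;
            return pthread_mutex_trylock(&console_sem) == 0;
    }

    int main(void)
    {
            atomic_store(&panic_cpu, 0);    /* CPU 0 starts panicking */
            printf("cpu1 lock: %d\n", try_console_lock(1)); /* 0: backs off */
            printf("cpu0 lock: %d\n", try_console_lock(0)); /* 1: panic CPU */
            return 0;
    }

Build with -pthread; the point is only the ordering of the checks, not
the locking primitive.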
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index d9fb053cff67..e2a1b2d34d2b 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -2143,7 +2143,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
* But it would have the sequence number returned
* by "prb_next_reserve_seq() - 1".
*/
- if (this_cpu_in_panic() &&
+ if (panic_on_this_cpu() &&
(!debug_non_panic_cpus || legacy_allow_panic_sync) &&
((*seq + 1) < prb_next_reserve_seq(rb))) {
(*seq)++;
diff --git a/kernel/sys.c b/kernel/sys.c
index a46d9b75880b..8b58eece4e58 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1734,6 +1734,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
struct rlimit old, new;
struct task_struct *tsk;
unsigned int checkflags = 0;
+ bool need_tasklist;
int ret;
if (old_rlim)
@@ -1760,8 +1761,25 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
get_task_struct(tsk);
rcu_read_unlock();
- ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
- old_rlim ? &old : NULL);
+ need_tasklist = !same_thread_group(tsk, current);
+ if (need_tasklist) {
+ /*
+ * Ensure we can't race with group exit or de_thread(),
+ * so tsk->group_leader can't be freed or changed until
+ * read_unlock(tasklist_lock) below.
+ */
+ read_lock(&tasklist_lock);
+ if (!pid_alive(tsk))
+ ret = -ESRCH;
+ }
+
+ if (!ret) {
+ ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
+ old_rlim ? &old : NULL);
+ }
+
+ if (need_tasklist)
+ read_unlock(&tasklist_lock);
if (!ret && old_rlim) {
rlim_to_rlim64(&old, &old64);
@@ -2515,7 +2533,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = -EINVAL;
break;
}
+ /*
+ * Ensure that either:
+ *
+ * 1. Subsequent getppid() calls reflect the parent process having died.
+ * 2. forget_original_parent() will send the new me->pdeath_signal.
+ *
+	 * Also prevent the read of me->pdeath_signal in (2) from being a data race.
+ */
+ read_lock(&tasklist_lock);
me->pdeath_signal = arg2;
+ read_unlock(&tasklist_lock);
break;
case PR_GET_PDEATHSIG:
error = put_user(me->pdeath_signal, (int __user *)arg2);
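
The prlimit64() path changed above is reachable from userspace through
glibc's prlimit() wrapper. A small read-only caller (the target pid
comes from argv and defaults to the calling process):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/resource.h>

    int main(int argc, char **argv)
    {
            pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : 0; /* 0: self */
            struct rlimit old;

            /* NULL new limit: read-only, exercising the old_rlim branch. */
            if (prlimit(pid, RLIMIT_NOFILE, NULL, &old)) {
                    perror("prlimit");
                    return 1;
            }
            printf("pid %d RLIMIT_NOFILE: soft=%llu hard=%llu\n", (int)pid,
                   (unsigned long long)old.rlim_cur,
                   (unsigned long long)old.rlim_max);
            return 0;
    }

Passing a non-zero pid of another process is the cross-thread-group case
that now takes tasklist_lock.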
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 8f23f5273bab..4f87c16d915a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -899,7 +899,7 @@ const struct bpf_func_proto bpf_send_signal_thread_proto = {
.arg1_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz)
+BPF_CALL_3(bpf_d_path, const struct path *, path, char *, buf, u32, sz)
{
struct path copy;
long len;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b3c94fbaf002..156e7e0bf559 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -10211,8 +10211,7 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n");
- ret = vfs_parse_fs_string(fc, "source",
- "tracefs", strlen("tracefs"));
+ ret = vfs_parse_fs_string(fc, "source", "tracefs");
if (!ret)
mnt = fc_mount(fc);
else
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 80b56c002c7f..5b62d1002783 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -425,7 +425,11 @@ static DEFINE_PER_CPU(u8, cpustat_tail);
*/
static u16 get_16bit_precision(u64 data_ns)
{
- return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */
+	/*
+	 * 2^24 ns ~= 16.8 ms. Adding half of the divisor (1 << 23) before
+	 * shifting rounds to the nearest 16.8 ms instead of truncating.
+	 */
+ return (data_ns + (1 << 23)) >> 24LL;
}
static void update_cpustat(void)
@@ -444,6 +448,14 @@ static void update_cpustat(void)
old_stat = __this_cpu_read(cpustat_old[i]);
new_stat = get_16bit_precision(cpustat[tracked_stats[i]]);
util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16);
+		/*
+		 * The statistics are kept with 16-bit precision, so the
+		 * intermediate integer divisions lose accuracy and the
+		 * computed utilization can occasionally exceed 100%. To
+		 * avoid confusing output, cap the displayed value at 100%.
+		 */
+ if (util > 100)
+ util = 100;
__this_cpu_write(cpustat_util[tail][i], util);
__this_cpu_write(cpustat_old[i], new_stat);
}
@@ -455,17 +467,17 @@ static void print_cpustat(void)
{
int i, group;
u8 tail = __this_cpu_read(cpustat_tail);
- u64 sample_period_second = sample_period;
+ u64 sample_period_msecond = sample_period;
- do_div(sample_period_second, NSEC_PER_SEC);
+ do_div(sample_period_msecond, NSEC_PER_MSEC);
/*
* Outputting the "watchdog" prefix on every line is redundant and not
* concise, and the original alarm information is sufficient for
* positioning in logs, hence here printk() is used instead of pr_crit().
*/
- printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n",
- smp_processor_id(), sample_period_second);
+ printk(KERN_CRIT "CPU#%d Utilization every %llums during lockup:\n",
+ smp_processor_id(), sample_period_msecond);
for (i = 0; i < NUM_SAMPLE_PERIODS; i++) {
group = (tail + i) % NUM_SAMPLE_PERIODS;
@@ -740,6 +752,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
if (!watchdog_enabled)
return HRTIMER_NORESTART;
+ /*
+	 * Skip all lockup checks and stop the timer if a panic is in progress.
+ */
+ if (panic_in_progress())
+ return HRTIMER_NORESTART;
+
watchdog_hardlockup_kick();
/* kick the softlockup detector */
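
The rounding change in get_16bit_precision() can be checked in
isolation; a userspace sketch comparing the old truncation with the new
round-to-nearest behaviour for the 2^24 ns (~16.8 ms) buckets:

    #include <stdint.h>
    #include <stdio.h>

    /* Old behaviour: truncate down to 2^24 ns (~16.8 ms) buckets. */
    static uint16_t trunc16(uint64_t data_ns)
    {
            return data_ns >> 24;
    }

    /* New behaviour: add half a bucket (1 << 23) so the shift rounds. */
    static uint16_t round16(uint64_t data_ns)
    {
            return (data_ns + (1 << 23)) >> 24;
    }

    int main(void)
    {
            /* 16.5 ms is closer to one bucket (~16.8 ms) than to zero. */
            uint64_t ns = 16500000;

            printf("truncated: %u\n", (unsigned)trunc16(ns)); /* 0 */
            printf("rounded:   %u\n", (unsigned)round16(ns)); /* 1 */
            return 0;
    }

This independent rounding of the two samples is also why update_cpustat()
now caps util at 100: a difference of rounded values can slightly
overshoot the (also rounded) sample period.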
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 9c58f5b4381d..d3ca70e3c256 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -12,6 +12,7 @@
#define pr_fmt(fmt) "NMI watchdog: " fmt
+#include <linux/panic.h>
#include <linux/nmi.h>
#include <linux/atomic.h>
#include <linux/module.h>
@@ -108,6 +109,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;
+ if (panic_in_progress())
+ return;
+
if (!watchdog_check_timestamp())
return;