summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/auditfilter.c2
-rw-r--r--kernel/bpf/bpf_iter.c73
-rw-r--r--kernel/bpf/cgroup.c2
-rw-r--r--kernel/bpf/core.c2
-rw-r--r--kernel/bpf/cpumap.c2
-rw-r--r--kernel/bpf/map_iter.c37
-rw-r--r--kernel/bpf/stackmap.c24
-rw-r--r--kernel/bpf/syscall.c4
-rw-r--r--kernel/bpf/task_iter.c6
-rw-r--r--kernel/bpf/verifier.c6
-rw-r--r--kernel/capability.c2
-rw-r--r--kernel/compat.c6
-rw-r--r--kernel/crash_core.c50
-rw-r--r--kernel/debug/gdbstub.c6
-rw-r--r--kernel/debug/kdb/kdb_keyboard.c4
-rw-r--r--kernel/debug/kdb/kdb_support.c6
-rw-r--r--kernel/dma/Kconfig8
-rw-r--r--kernel/dma/debug.c55
-rw-r--r--kernel/dma/direct.c13
-rw-r--r--kernel/dma/pool.c147
-rw-r--r--kernel/entry/common.c3
-rw-r--r--kernel/events/callchain.c5
-rw-r--r--kernel/events/core.c11
-rw-r--r--kernel/events/uprobes.c10
-rw-r--r--kernel/exit.c18
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/futex.c6
-rw-r--r--kernel/irq/handle.c2
-rw-r--r--kernel/irq/manage.c11
-rw-r--r--kernel/irq/matrix.c7
-rw-r--r--kernel/irq/pm.c8
-rw-r--r--kernel/kallsyms.c4
-rw-r--r--kernel/kcov.c6
-rw-r--r--kernel/kexec_file.c41
-rw-r--r--kernel/kmod.c5
-rw-r--r--kernel/kthread.c5
-rw-r--r--kernel/locking/lockdep.c18
-rw-r--r--kernel/locking/lockdep_proc.c2
-rw-r--r--kernel/module.c60
-rw-r--r--kernel/panic.c4
-rw-r--r--kernel/power/hibernate.c99
-rw-r--r--kernel/power/qos.c4
-rw-r--r--kernel/relay.c1
-rw-r--r--kernel/sched/core.c17
-rw-r--r--kernel/sched/idle.c25
-rw-r--r--kernel/sched/sched.h2
-rw-r--r--kernel/sched/topology.c6
-rw-r--r--kernel/signal.c18
-rw-r--r--kernel/stacktrace.c5
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c9
-rw-r--r--kernel/sysctl_binary.c171
-rw-r--r--kernel/task_work.c8
-rw-r--r--kernel/time/Kconfig9
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/hrtimer.c15
-rw-r--r--kernel/time/posix-cpu-timers.c216
-rw-r--r--kernel/time/posix-timers.c4
-rw-r--r--kernel/time/sched_clock.c2
-rw-r--r--kernel/time/tick-broadcast.c2
-rw-r--r--kernel/time/timekeeping.c21
-rw-r--r--kernel/time/timekeeping_internal.h11
-rw-r--r--kernel/time/timer.c3
-rw-r--r--kernel/time/vsyscall.c41
-rw-r--r--kernel/trace/blktrace.c2
-rw-r--r--kernel/trace/bpf_trace.c2
-rw-r--r--kernel/trace/trace_events_filter.c4
-rw-r--r--kernel/umh.c29
-rw-r--r--kernel/watch_queue.c8
71 files changed, 854 insertions, 572 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 5350fd292910..9a20016d4900 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,7 +5,7 @@
obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o softirq.o resource.o \
- sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
+ sysctl.o capability.o ptrace.o user.o \
signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
@@ -36,7 +36,7 @@ KCOV_INSTRUMENT_stacktrace.o := n
KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
KCSAN_SANITIZE_kcov.o := n
-CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
# cond_syscall is currently not LTO compatible
CFLAGS_sys_ni.o = $(DISABLE_LTO)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a10e2997aa6c..333b3bcfc545 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -681,7 +681,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->values[i] = AUDIT_UID_UNSET;
break;
}
- /* fall through - if set */
+ fallthrough; /* if set */
default:
data->values[i] = f->val;
}
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 363b9cafc2d8..8faa2ce89396 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -67,6 +67,9 @@ static void bpf_iter_done_stop(struct seq_file *seq)
iter_priv->done_stop = true;
}
+/* maximum visited objects before bailing out */
+#define MAX_ITER_OBJECTS 1000000
+
/* bpf_seq_read, a customized and simpler version for bpf iterator.
* no_llseek is assumed for this file.
* The following are differences from seq_read():
@@ -79,7 +82,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
{
struct seq_file *seq = file->private_data;
size_t n, offs, copied = 0;
- int err = 0;
+ int err = 0, num_objs = 0;
void *p;
mutex_lock(&seq->lock);
@@ -135,6 +138,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
while (1) {
loff_t pos = seq->index;
+ num_objs++;
offs = seq->count;
p = seq->op->next(seq, p, &seq->index);
if (pos == seq->index) {
@@ -153,6 +157,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
if (seq->count >= size)
break;
+ if (num_objs >= MAX_ITER_OBJECTS) {
+ if (offs == 0) {
+ err = -EAGAIN;
+ seq->op->stop(seq, p);
+ goto done;
+ }
+ break;
+ }
+
err = seq->op->show(seq, p);
if (err > 0) {
bpf_iter_dec_seq_num(seq);
@@ -338,8 +351,8 @@ static void bpf_iter_link_release(struct bpf_link *link)
struct bpf_iter_link *iter_link =
container_of(link, struct bpf_iter_link, link);
- if (iter_link->aux.map)
- bpf_map_put_with_uref(iter_link->aux.map);
+ if (iter_link->tinfo->reg_info->detach_target)
+ iter_link->tinfo->reg_info->detach_target(&iter_link->aux);
}
static void bpf_iter_link_dealloc(struct bpf_link *link)
@@ -390,15 +403,35 @@ bool bpf_link_is_iter(struct bpf_link *link)
int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
+ union bpf_iter_link_info __user *ulinfo;
struct bpf_link_primer link_primer;
struct bpf_iter_target_info *tinfo;
- struct bpf_iter_aux_info aux = {};
+ union bpf_iter_link_info linfo;
struct bpf_iter_link *link;
- u32 prog_btf_id, target_fd;
+ u32 prog_btf_id, linfo_len;
bool existed = false;
- struct bpf_map *map;
int err;
+ if (attr->link_create.target_fd || attr->link_create.flags)
+ return -EINVAL;
+
+ memset(&linfo, 0, sizeof(union bpf_iter_link_info));
+
+ ulinfo = u64_to_user_ptr(attr->link_create.iter_info);
+ linfo_len = attr->link_create.iter_info_len;
+ if (!ulinfo ^ !linfo_len)
+ return -EINVAL;
+
+ if (ulinfo) {
+ err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo),
+ linfo_len);
+ if (err)
+ return err;
+ linfo_len = min_t(u32, linfo_len, sizeof(linfo));
+ if (copy_from_user(&linfo, ulinfo, linfo_len))
+ return -EFAULT;
+ }
+
prog_btf_id = prog->aux->attach_btf_id;
mutex_lock(&targets_mutex);
list_for_each_entry(tinfo, &targets, list) {
@@ -411,13 +444,6 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
if (!existed)
return -ENOENT;
- /* Make sure user supplied flags are target expected. */
- target_fd = attr->link_create.target_fd;
- if (attr->link_create.flags != tinfo->reg_info->req_linfo)
- return -EINVAL;
- if (!attr->link_create.flags && target_fd)
- return -EINVAL;
-
link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
if (!link)
return -ENOMEM;
@@ -431,28 +457,15 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
return err;
}
- if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) {
- map = bpf_map_get_with_uref(target_fd);
- if (IS_ERR(map)) {
- err = PTR_ERR(map);
- goto cleanup_link;
- }
-
- aux.map = map;
- err = tinfo->reg_info->check_target(prog, &aux);
+ if (tinfo->reg_info->attach_target) {
+ err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux);
if (err) {
- bpf_map_put_with_uref(map);
- goto cleanup_link;
+ bpf_link_cleanup(&link_primer);
+ return err;
}
-
- link->aux.map = map;
}
return bpf_link_settle(&link_primer);
-
-cleanup_link:
- bpf_link_cleanup(&link_primer);
- return err;
}
static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 83ff127ef7ae..e21de4f1754c 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1794,7 +1794,7 @@ static bool cg_sockopt_is_valid_access(int off, int size,
return prog->expected_attach_type ==
BPF_CGROUP_GETSOCKOPT;
case offsetof(struct bpf_sockopt, optname):
- /* fallthrough */
+ fallthrough;
case offsetof(struct bpf_sockopt, level):
if (size != size_default)
return false;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index bde93344164d..ed0b3578867c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1966,7 +1966,7 @@ void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
* @index: the index of the program to replace
*
* Skips over dummy programs, by not counting them, when calculating
- * the the position of the program to replace.
+ * the position of the program to replace.
*
* Return:
* * 0 - Success
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index f1c46529929b..6386b7bb98f2 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -279,7 +279,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
break;
default:
bpf_warn_invalid_xdp_action(act);
- /* fallthrough */
+ fallthrough;
case XDP_DROP:
xdp_return_frame(xdpf);
stats->drop++;
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index fbe1f557cb88..af86048e5afd 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -98,12 +98,21 @@ static struct bpf_iter_reg bpf_map_reg_info = {
.seq_info = &bpf_map_seq_info,
};
-static int bpf_iter_check_map(struct bpf_prog *prog,
- struct bpf_iter_aux_info *aux)
+static int bpf_iter_attach_map(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
{
u32 key_acc_size, value_acc_size, key_size, value_size;
- struct bpf_map *map = aux->map;
+ struct bpf_map *map;
bool is_percpu = false;
+ int err = -EINVAL;
+
+ if (!linfo->map.map_fd)
+ return -EBADF;
+
+ map = bpf_map_get_with_uref(linfo->map.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@@ -112,7 +121,7 @@ static int bpf_iter_check_map(struct bpf_prog *prog,
else if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY)
- return -EINVAL;
+ goto put_map;
key_acc_size = prog->aux->max_rdonly_access;
value_acc_size = prog->aux->max_rdwr_access;
@@ -122,10 +131,22 @@ static int bpf_iter_check_map(struct bpf_prog *prog,
else
value_size = round_up(map->value_size, 8) * num_possible_cpus();
- if (key_acc_size > key_size || value_acc_size > value_size)
- return -EACCES;
+ if (key_acc_size > key_size || value_acc_size > value_size) {
+ err = -EACCES;
+ goto put_map;
+ }
+ aux->map = map;
return 0;
+
+put_map:
+ bpf_map_put_with_uref(map);
+ return err;
+}
+
+static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux)
+{
+ bpf_map_put_with_uref(aux->map);
}
DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta,
@@ -133,8 +154,8 @@ DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta,
static const struct bpf_iter_reg bpf_map_elem_reg_info = {
.target = "bpf_map_elem",
- .check_target = bpf_iter_check_map,
- .req_linfo = BPF_ITER_LINK_MAP_FD,
+ .attach_target = bpf_iter_attach_map,
+ .detach_target = bpf_iter_detach_map,
.ctx_arg_info_size = 2,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_map_elem, key),
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 4fd830a62be2..cfed0ac44d38 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -213,11 +213,13 @@ static int stack_map_get_build_id_32(void *page_addr,
phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
- for (i = 0; i < ehdr->e_phnum; ++i)
- if (phdr[i].p_type == PT_NOTE)
- return stack_map_parse_build_id(page_addr, build_id,
- page_addr + phdr[i].p_offset,
- phdr[i].p_filesz);
+ for (i = 0; i < ehdr->e_phnum; ++i) {
+ if (phdr[i].p_type == PT_NOTE &&
+ !stack_map_parse_build_id(page_addr, build_id,
+ page_addr + phdr[i].p_offset,
+ phdr[i].p_filesz))
+ return 0;
+ }
return -EINVAL;
}
@@ -236,11 +238,13 @@ static int stack_map_get_build_id_64(void *page_addr,
phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
- for (i = 0; i < ehdr->e_phnum; ++i)
- if (phdr[i].p_type == PT_NOTE)
- return stack_map_parse_build_id(page_addr, build_id,
- page_addr + phdr[i].p_offset,
- phdr[i].p_filesz);
+ for (i = 0; i < ehdr->e_phnum; ++i) {
+ if (phdr[i].p_type == PT_NOTE &&
+ !stack_map_parse_build_id(page_addr, build_id,
+ page_addr + phdr[i].p_offset,
+ phdr[i].p_filesz))
+ return 0;
+ }
return -EINVAL;
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2f343ce15747..1bf960aa615c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2029,7 +2029,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
case BPF_PROG_TYPE_EXT:
if (expected_attach_type)
return -EINVAL;
- /* fallthrough */
+ fallthrough;
default:
return 0;
}
@@ -3883,7 +3883,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *
return -EINVAL;
}
-#define BPF_LINK_CREATE_LAST_FIELD link_create.flags
+#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
static int link_create(union bpf_attr *attr)
{
enum bpf_prog_type ptype;
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 232df29793e9..99af4cea1102 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -29,8 +29,9 @@ static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
rcu_read_lock();
retry:
- pid = idr_get_next(&ns->idr, tid);
+ pid = find_ge_pid(*tid, ns);
if (pid) {
+ *tid = pid_nr_ns(pid, ns);
task = get_pid_task(pid, PIDTYPE_PID);
if (!task) {
++*tid;
@@ -178,10 +179,11 @@ again:
f = fcheck_files(curr_files, curr_fd);
if (!f)
continue;
+ if (!get_file_rcu(f))
+ continue;
/* set info->fd */
info->fd = curr_fd;
- get_file(f);
rcu_read_unlock();
return f;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b6ccfce3bf4c..47e74f09fa37 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5236,7 +5236,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
off_reg == dst_reg ? dst : src);
return -EACCES;
}
- /* fall-through */
+ fallthrough;
default:
break;
}
@@ -8294,7 +8294,7 @@ static bool stacksafe(struct bpf_func_state *old,
if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
cur->stack[spi].slot_type[i % BPF_REG_SIZE])
/* Ex: old explored (safe) state has STACK_SPILL in
- * this stack slot, but current has has STACK_MISC ->
+ * this stack slot, but current has STACK_MISC ->
* this verifier states are not equivalent,
* return false to continue verification of this path
*/
@@ -10988,7 +10988,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
default:
if (!prog_extension)
return -EINVAL;
- /* fallthrough */
+ fallthrough;
case BPF_MODIFY_RETURN:
case BPF_LSM_MAC:
case BPF_TRACE_FENTRY:
diff --git a/kernel/capability.c b/kernel/capability.c
index 1444f3954d75..7c59b096c98a 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -93,7 +93,7 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
break;
case _LINUX_CAPABILITY_VERSION_2:
warn_deprecated_v2();
- /* fall through - v3 is otherwise equivalent to v2. */
+ fallthrough; /* v3 is otherwise equivalent to v2 */
case _LINUX_CAPABILITY_VERSION_3:
*tocopy = _LINUX_CAPABILITY_U32S_3;
break;
diff --git a/kernel/compat.c b/kernel/compat.c
index b8d2800bb4b7..05adfd6fa8bf 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -255,11 +255,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
return -EFAULT;
switch (_NSIG_WORDS) {
case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 );
- /* fall through */
+ fallthrough;
case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 );
- /* fall through */
+ fallthrough;
case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 );
- /* fall through */
+ fallthrough;
case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 );
}
#else
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 18175687133a..106e4500fd53 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -11,6 +11,8 @@
#include <asm/page.h>
#include <asm/sections.h>
+#include <crypto/sha.h>
+
/* vmcoreinfo stuff */
unsigned char *vmcoreinfo_data;
size_t vmcoreinfo_size;
@@ -376,6 +378,53 @@ phys_addr_t __weak paddr_vmcoreinfo_note(void)
}
EXPORT_SYMBOL(paddr_vmcoreinfo_note);
+#define NOTES_SIZE (&__stop_notes - &__start_notes)
+#define BUILD_ID_MAX SHA1_DIGEST_SIZE
+#define NT_GNU_BUILD_ID 3
+
+struct elf_note_section {
+ struct elf_note n_hdr;
+ u8 n_data[];
+};
+
+/*
+ * Add build ID from .notes section as generated by the GNU ld(1)
+ * or LLVM lld(1) --build-id option.
+ */
+static void add_build_id_vmcoreinfo(void)
+{
+ char build_id[BUILD_ID_MAX * 2 + 1];
+ int n_remain = NOTES_SIZE;
+
+ while (n_remain >= sizeof(struct elf_note)) {
+ const struct elf_note_section *note_sec =
+ &__start_notes + NOTES_SIZE - n_remain;
+ const u32 n_namesz = note_sec->n_hdr.n_namesz;
+
+ if (note_sec->n_hdr.n_type == NT_GNU_BUILD_ID &&
+ n_namesz != 0 &&
+ !strcmp((char *)&note_sec->n_data[0], "GNU")) {
+ if (note_sec->n_hdr.n_descsz <= BUILD_ID_MAX) {
+ const u32 n_descsz = note_sec->n_hdr.n_descsz;
+ const u8 *s = &note_sec->n_data[n_namesz];
+
+ s = PTR_ALIGN(s, 4);
+ bin2hex(build_id, s, n_descsz);
+ build_id[2 * n_descsz] = '\0';
+ VMCOREINFO_BUILD_ID(build_id);
+ return;
+ }
+ pr_warn("Build ID is too large to include in vmcoreinfo: %u > %u\n",
+ note_sec->n_hdr.n_descsz,
+ BUILD_ID_MAX);
+ return;
+ }
+ n_remain -= sizeof(struct elf_note) +
+ ALIGN(note_sec->n_hdr.n_namesz, 4) +
+ ALIGN(note_sec->n_hdr.n_descsz, 4);
+ }
+}
+
static int __init crash_save_vmcoreinfo_init(void)
{
vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
@@ -394,6 +443,7 @@ static int __init crash_save_vmcoreinfo_init(void)
}
VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ add_build_id_vmcoreinfo();
VMCOREINFO_PAGESIZE(PAGE_SIZE);
VMCOREINFO_SYMBOL(init_uts_ns);
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index a790026e42d0..cc3c43dfec44 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1046,14 +1046,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
return DBG_PASS_EVENT;
}
#endif
- /* Fall through */
+ fallthrough;
case 'C': /* Exception passing */
tmp = gdb_cmd_exception_pass(ks);
if (tmp > 0)
goto default_handle;
if (tmp == 0)
break;
- /* Fall through - on tmp < 0 */
+ fallthrough; /* on tmp < 0 */
case 'c': /* Continue packet */
case 's': /* Single step packet */
if (kgdb_contthread && kgdb_contthread != current) {
@@ -1062,7 +1062,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
break;
}
dbg_activate_sw_breakpoints();
- /* Fall through - to default processing */
+ fallthrough; /* to default processing */
default:
default_handle:
error = kgdb_arch_handle_exception(ks->ex_vector,
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 750497b0003a..f877a0a0d7cf 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -173,11 +173,11 @@ int kdb_get_kbd_char(void)
case KT_LATIN:
if (isprint(keychar))
break; /* printable characters */
- /* fall through */
+ fallthrough;
case KT_SPEC:
if (keychar == K_ENTER)
break;
- /* fall through */
+ fallthrough;
default:
return -1; /* ignore unprintables */
}
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 004c5b6c87f8..6226502ce049 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -432,7 +432,7 @@ int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
*word = w8;
break;
}
- /* fall through */
+ fallthrough;
default:
diag = KDB_BADWIDTH;
kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
@@ -481,7 +481,7 @@ int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
*word = w8;
break;
}
- /* fall through */
+ fallthrough;
default:
diag = KDB_BADWIDTH;
kdb_printf("kdb_getword: bad width %ld\n", (long) size);
@@ -525,7 +525,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
diag = kdb_putarea(addr, w8);
break;
}
- /* fall through */
+ fallthrough;
default:
diag = KDB_BADWIDTH;
kdb_printf("kdb_putword: bad width %ld\n", (long) size);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index f4770fcfa62b..847a9d1fa634 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -1,5 +1,8 @@
# SPDX-License-Identifier: GPL-2.0-only
+config NO_DMA
+ bool
+
config HAS_DMA
bool
depends on !NO_DMA
@@ -186,11 +189,6 @@ config DMA_API_DEBUG
drivers like double-freeing of DMA mappings or freeing mappings that
were never allocated.
- This also attempts to catch cases where a page owned by DMA is
- accessed by the cpu in a way that could cause data corruption. For
- example, this enables cow_user_page() to check that the source page is
- not undergoing DMA.
-
This option causes a performance degradation. Use only if you want to
debug device drivers and dma interactions.
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index f7f807fb6ebb..8e9f7b301c6d 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -448,9 +448,6 @@ void debug_dma_dump_mappings(struct device *dev)
* dma_active_cacheline entry to track per event. dma_map_sg(), on the
* other hand, consumes a single dma_debug_entry, but inserts 'nents'
* entries into the tree.
- *
- * At any time debug_dma_assert_idle() can be called to trigger a
- * warning if any cachelines in the given page are in the active set.
*/
static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT);
static DEFINE_SPINLOCK(radix_lock);
@@ -497,10 +494,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln)
overlap = active_cacheline_set_overlap(cln, ++overlap);
/* If we overflowed the overlap counter then we're potentially
- * leaking dma-mappings. Otherwise, if maps and unmaps are
- * balanced then this overflow may cause false negatives in
- * debug_dma_assert_idle() as the cacheline may be marked idle
- * prematurely.
+ * leaking dma-mappings.
*/
WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"),
@@ -555,53 +549,6 @@ static void active_cacheline_remove(struct dma_debug_entry *entry)
spin_unlock_irqrestore(&radix_lock, flags);
}
-/**
- * debug_dma_assert_idle() - assert that a page is not undergoing dma
- * @page: page to lookup in the dma_active_cacheline tree
- *
- * Place a call to this routine in cases where the cpu touching the page
- * before the dma completes (page is dma_unmapped) will lead to data
- * corruption.
- */
-void debug_dma_assert_idle(struct page *page)
-{
- static struct dma_debug_entry *ents[CACHELINES_PER_PAGE];
- struct dma_debug_entry *entry = NULL;
- void **results = (void **) &ents;
- unsigned int nents, i;
- unsigned long flags;
- phys_addr_t cln;
-
- if (dma_debug_disabled())
- return;
-
- if (!page)
- return;
-
- cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT;
- spin_lock_irqsave(&radix_lock, flags);
- nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln,
- CACHELINES_PER_PAGE);
- for (i = 0; i < nents; i++) {
- phys_addr_t ent_cln = to_cacheline_number(ents[i]);
-
- if (ent_cln == cln) {
- entry = ents[i];
- break;
- } else if (ent_cln >= cln + CACHELINES_PER_PAGE)
- break;
- }
- spin_unlock_irqrestore(&radix_lock, flags);
-
- if (!entry)
- return;
-
- cln = to_cacheline_number(entry);
- err_printk(entry->dev, entry,
- "cpu touching an active dma mapped cacheline [cln=%pa]\n",
- &cln);
-}
-
/*
* Wrapper function for adding an entry to the hash.
* This function takes care of locking itself.
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index bb0041e99659..db6ef07aec3b 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -43,7 +43,7 @@ u64 dma_direct_get_required_mask(struct device *dev)
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
}
-gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
+static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
u64 *phys_limit)
{
u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit);
@@ -68,7 +68,7 @@ gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
return 0;
}
-bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
+static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
{
return phys_to_dma_direct(dev, phys) + size - 1 <=
min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
@@ -161,8 +161,13 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
size = PAGE_ALIGN(size);
if (dma_should_alloc_from_pool(dev, gfp, attrs)) {
- ret = dma_alloc_from_pool(dev, size, &page, gfp);
- if (!ret)
+ u64 phys_mask;
+
+ gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
+ &phys_mask);
+ page = dma_alloc_from_pool(dev, size, &ret, gfp,
+ dma_coherent_ok);
+ if (!page)
return NULL;
goto done;
}
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 6bc74a2d5127..1281c0f0442b 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -3,7 +3,9 @@
* Copyright (C) 2012 ARM Ltd.
* Copyright (C) 2020 Google LLC
*/
+#include <linux/cma.h>
#include <linux/debugfs.h>
+#include <linux/dma-contiguous.h>
#include <linux/dma-direct.h>
#include <linux/dma-noncoherent.h>
#include <linux/init.h>
@@ -55,11 +57,34 @@ static void dma_atomic_pool_size_add(gfp_t gfp, size_t size)
pool_size_kernel += size;
}
+static bool cma_in_zone(gfp_t gfp)
+{
+ unsigned long size;
+ phys_addr_t end;
+ struct cma *cma;
+
+ cma = dev_get_cma_area(NULL);
+ if (!cma)
+ return false;
+
+ size = cma_get_size(cma);
+ if (!size)
+ return false;
+
+ /* CMA can't cross zone boundaries, see cma_activate_area() */
+ end = cma_get_base(cma) + size - 1;
+ if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
+ return end <= DMA_BIT_MASK(zone_dma_bits);
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
+ return end <= DMA_BIT_MASK(32);
+ return true;
+}
+
static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
gfp_t gfp)
{
unsigned int order;
- struct page *page;
+ struct page *page = NULL;
void *addr;
int ret = -ENOMEM;
@@ -68,7 +93,11 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
do {
pool_size = 1 << (PAGE_SHIFT + order);
- page = alloc_pages(gfp, order);
+ if (cma_in_zone(gfp))
+ page = dma_alloc_from_contiguous(NULL, 1 << order,
+ order, false);
+ if (!page)
+ page = alloc_pages(gfp, order);
} while (!page && order-- > 0);
if (!page)
goto out;
@@ -196,93 +225,75 @@ static int __init dma_atomic_pool_init(void)
}
postcore_initcall(dma_atomic_pool_init);
-static inline struct gen_pool *dma_guess_pool_from_device(struct device *dev)
+static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
{
- u64 phys_mask;
- gfp_t gfp;
-
- gfp = dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
- &phys_mask);
- if (IS_ENABLED(CONFIG_ZONE_DMA) && gfp == GFP_DMA)
+ if (prev == NULL) {
+ if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
+ return atomic_pool_dma32;
+ if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
+ return atomic_pool_dma;
+ return atomic_pool_kernel;
+ }
+ if (prev == atomic_pool_kernel)
+ return atomic_pool_dma32 ? atomic_pool_dma32 : atomic_pool_dma;
+ if (prev == atomic_pool_dma32)
return atomic_pool_dma;
- if (IS_ENABLED(CONFIG_ZONE_DMA32) && gfp == GFP_DMA32)
- return atomic_pool_dma32;
- return atomic_pool_kernel;
+ return NULL;
}
-static inline struct gen_pool *dma_get_safer_pool(struct gen_pool *bad_pool)
+static struct page *__dma_alloc_from_pool(struct device *dev, size_t size,
+ struct gen_pool *pool, void **cpu_addr,
+ bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t))
{
- if (bad_pool == atomic_pool_kernel)
- return atomic_pool_dma32 ? : atomic_pool_dma;
+ unsigned long addr;
+ phys_addr_t phys;
- if (bad_pool == atomic_pool_dma32)
- return atomic_pool_dma;
+ addr = gen_pool_alloc(pool, size);
+ if (!addr)
+ return NULL;
- return NULL;
-}
+ phys = gen_pool_virt_to_phys(pool, addr);
+ if (phys_addr_ok && !phys_addr_ok(dev, phys, size)) {
+ gen_pool_free(pool, addr, size);
+ return NULL;
+ }
-static inline struct gen_pool *dma_guess_pool(struct device *dev,
- struct gen_pool *bad_pool)
-{
- if (bad_pool)
- return dma_get_safer_pool(bad_pool);
+ if (gen_pool_avail(pool) < atomic_pool_size)
+ schedule_work(&atomic_pool_work);
- return dma_guess_pool_from_device(dev);
+ *cpu_addr = (void *)addr;
+ memset(*cpu_addr, 0, size);
+ return pfn_to_page(__phys_to_pfn(phys));
}
-void *dma_alloc_from_pool(struct device *dev, size_t size,
- struct page **ret_page, gfp_t flags)
+struct page *dma_alloc_from_pool(struct device *dev, size_t size,
+ void **cpu_addr, gfp_t gfp,
+ bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t))
{
struct gen_pool *pool = NULL;
- unsigned long val = 0;
- void *ptr = NULL;
- phys_addr_t phys;
-
- while (1) {
- pool = dma_guess_pool(dev, pool);
- if (!pool) {
- WARN(1, "Failed to get suitable pool for %s\n",
- dev_name(dev));
- break;
- }
-
- val = gen_pool_alloc(pool, size);
- if (!val)
- continue;
-
- phys = gen_pool_virt_to_phys(pool, val);
- if (dma_coherent_ok(dev, phys, size))
- break;
-
- gen_pool_free(pool, val, size);
- val = 0;
- }
-
-
- if (val) {
- *ret_page = pfn_to_page(__phys_to_pfn(phys));
- ptr = (void *)val;
- memset(ptr, 0, size);
+ struct page *page;
- if (gen_pool_avail(pool) < atomic_pool_size)
- schedule_work(&atomic_pool_work);
+ while ((pool = dma_guess_pool(pool, gfp))) {
+ page = __dma_alloc_from_pool(dev, size, pool, cpu_addr,
+ phys_addr_ok);
+ if (page)
+ return page;
}
- return ptr;
+ WARN(1, "Failed to get suitable pool for %s\n", dev_name(dev));
+ return NULL;
}
bool dma_free_from_pool(struct device *dev, void *start, size_t size)
{
struct gen_pool *pool = NULL;
- while (1) {
- pool = dma_guess_pool(dev, pool);
- if (!pool)
- return false;
-
- if (gen_pool_has_addr(pool, (unsigned long)start, size)) {
- gen_pool_free(pool, (unsigned long)start, size);
- return true;
- }
+ while ((pool = dma_guess_pool(pool, 0))) {
+ if (!gen_pool_has_addr(pool, (unsigned long)start, size))
+ continue;
+ gen_pool_free(pool, (unsigned long)start, size);
+ return true;
}
+
+ return false;
}
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 9852e0d62d95..fcae019158ca 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -65,7 +65,8 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
syscall_enter_audit(regs, syscall);
- return ret ? : syscall;
+ /* The above might have changed the syscall number */
+ return ret ? : syscall_get_nr(current, regs);
}
noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c6ce894e4ce9..58cbe357fb2b 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -217,10 +217,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
- fs = get_fs();
- set_fs(USER_DS);
+ fs = force_uaccess_begin();
perf_callchain_user(&ctx, regs);
- set_fs(fs);
+ force_uaccess_end(fs);
}
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d1f0a7e5b182..7ed5248f0445 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6453,10 +6453,9 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
/* Data. */
sp = perf_user_stack_pointer(regs);
- fs = get_fs();
- set_fs(USER_DS);
+ fs = force_uaccess_begin();
rem = __output_copy_user(handle, (void *) sp, dump_size);
- set_fs(fs);
+ force_uaccess_end(fs);
dyn_size = dump_size - rem;
perf_output_skip(handle, rem);
@@ -10035,7 +10034,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
case IF_SRC_KERNELADDR:
case IF_SRC_KERNEL:
kernel = 1;
- /* fall through */
+ fallthrough;
case IF_SRC_FILEADDR:
case IF_SRC_FILE:
@@ -11707,7 +11706,7 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_task;
/*
- * Reuse ptrace permission checks for now.
+ * Preserve ptrace permission check for backwards compatibility.
*
* We must hold exec_update_mutex across this and any potential
* perf_install_in_context() call for this new event to
@@ -11715,7 +11714,7 @@ SYSCALL_DEFINE5(perf_event_open,
* perf_event_exit_task() that could imply).
*/
err = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
goto err_cred;
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 25de10c904e6..0e18aaf23a7b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
if (new_page) {
get_page(new_page);
page_add_new_anon_rmap(new_page, vma, addr, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ lru_cache_add_inactive_or_unevictable(new_page, vma);
} else
/* no new page, just dec_mm_counter for old_page */
dec_mm_counter(mm, MM_ANONPAGES);
@@ -205,7 +205,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
try_to_free_swap(old_page);
page_vma_mapped_walk_done(&pvmw);
- if (vma->vm_flags & VM_LOCKED)
+ if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
munlock_vma_page(old_page);
put_page(old_page);
@@ -376,7 +376,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
if (!vaddr || !d)
return -EINVAL;
- ret = get_user_pages_remote(NULL, mm, vaddr, 1,
+ ret = get_user_pages_remote(mm, vaddr, 1,
FOLL_WRITE, &page, &vma, NULL);
if (unlikely(ret <= 0)) {
/*
@@ -477,7 +477,7 @@ retry:
if (is_register)
gup_flags |= FOLL_SPLIT_PMD;
/* Read the page with vaddr into memory */
- ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags,
+ ret = get_user_pages_remote(mm, vaddr, 1, gup_flags,
&old_page, &vma, NULL);
if (ret <= 0)
return ret;
@@ -2029,7 +2029,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
* but we treat this as a 'remote' access since it is
* essentially a kernel access to the memory.
*/
- result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
+ result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page,
NULL, NULL);
if (result < 0)
return result;
diff --git a/kernel/exit.c b/kernel/exit.c
index e731c414e024..733e80f334e7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -732,7 +732,7 @@ void __noreturn do_exit(long code)
* mm_release()->clear_child_tid() from writing to a user-controlled
* kernel address.
*/
- set_fs(USER_DS);
+ force_uaccess_begin();
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -1626,6 +1626,22 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
return ret;
}
+int kernel_wait(pid_t pid, int *stat)
+{
+ struct wait_opts wo = {
+ .wo_type = PIDTYPE_PID,
+ .wo_pid = find_get_pid(pid),
+ .wo_flags = WEXITED,
+ };
+ int ret;
+
+ ret = do_wait(&wo);
+ if (ret > 0 && wo.wo_stat)
+ *stat = wo.wo_stat;
+ put_pid(wo.wo_pid);
+ return ret;
+}
+
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
int, options, struct rusage __user *, ru)
{
diff --git a/kernel/fork.c b/kernel/fork.c
index 35e9894d394c..4d32190861bd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2011,7 +2011,7 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
- seqcount_init(&p->mems_allowed_seq);
+ seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
memset(&p->irqtrace, 0, sizeof(p->irqtrace));
diff --git a/kernel/futex.c b/kernel/futex.c
index 83404124b77b..a5876694a60e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -678,7 +678,7 @@ static int fault_in_user_writeable(u32 __user *uaddr)
int ret;
mmap_read_lock(mm);
- ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
+ ret = fixup_user_fault(mm, (unsigned long)uaddr,
FAULT_FLAG_WRITE, NULL);
mmap_read_unlock(mm);
@@ -3744,12 +3744,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
- /* fall through */
+ fallthrough;
case FUTEX_WAIT_BITSET:
return futex_wait(uaddr, flags, val, timeout, val3);
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
- /* fall through */
+ fallthrough;
case FUTEX_WAKE_BITSET:
return futex_wake(uaddr, flags, val, val3);
case FUTEX_REQUEUE:
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a8e14c80b405..762a928e18f9 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -173,7 +173,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
__irq_wake_thread(desc, action);
- /* Fall through - to add to randomness */
+ fallthrough; /* to add to randomness */
case IRQ_HANDLED:
*flags |= action->flags;
break;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d55ba625d426..5df903fccb60 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -271,7 +271,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
cpumask_copy(desc->irq_common_data.affinity, mask);
- /* fall through */
+ fallthrough;
case IRQ_SET_MASK_OK_NOCOPY:
irq_validate_effective_affinity(data);
irq_set_thread_affinity(desc);
@@ -868,7 +868,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
case IRQ_SET_MASK_OK_DONE:
irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
irqd_set(&desc->irq_data, flags);
- /* fall through */
+ fallthrough;
case IRQ_SET_MASK_OK_NOCOPY:
flags = irqd_get_trigger_type(&desc->irq_data);
@@ -2731,8 +2731,10 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
do {
chip = irq_data_get_irq_chip(data);
- if (WARN_ON_ONCE(!chip))
- return -ENODEV;
+ if (WARN_ON_ONCE(!chip)) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
if (chip->irq_set_irqchip_state)
break;
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
@@ -2745,6 +2747,7 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
if (data)
err = chip->irq_set_irqchip_state(data, which, val);
+out_unlock:
irq_put_desc_busunlock(desc, flags);
return err;
}
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 30cc217b8631..651a4ad6d711 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -380,6 +380,13 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
unsigned int cpu, bit;
struct cpumap *cm;
+ /*
+ * Not required in theory, but matrix_find_best_cpu() uses
+ * for_each_cpu() which ignores the cpumask on UP .
+ */
+ if (cpumask_empty(msk))
+ return -EINVAL;
+
cpu = matrix_find_best_cpu(m, msk);
if (cpu == UINT_MAX)
return -ENOSPC;
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 8f557fa1f4fe..c6c7e187ae74 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -185,14 +185,18 @@ void rearm_wake_irq(unsigned int irq)
unsigned long flags;
struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
- if (!desc || !(desc->istate & IRQS_SUSPENDED) ||
- !irqd_is_wakeup_set(&desc->irq_data))
+ if (!desc)
return;
+ if (!(desc->istate & IRQS_SUSPENDED) ||
+ !irqd_is_wakeup_set(&desc->irq_data))
+ goto unlock;
+
desc->istate &= ~IRQS_SUSPENDED;
irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
__enable_irq(desc);
+unlock:
irq_put_desc_busunlock(desc, flags);
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 95cb74f73292..4fb15fa96734 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -684,12 +684,12 @@ bool kallsyms_show_value(const struct cred *cred)
case 0:
if (kallsyms_for_perf())
return true;
- /* fallthrough */
+ fallthrough;
case 1:
if (security_capable(cred, &init_user_ns, CAP_SYSLOG,
CAP_OPT_NOAUDIT) == 0)
return true;
- /* fallthrough */
+ fallthrough;
default:
return false;
}
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 6afae0bcbac4..6b8368be89c8 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -96,7 +96,7 @@ struct kcov_percpu_data {
int saved_sequence;
};
-DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
+static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
/* Must be called with kcov_remote_lock locked. */
static struct kcov_remote *kcov_remote_find(u64 handle)
@@ -775,7 +775,7 @@ static inline bool kcov_mode_enabled(unsigned int mode)
return (mode & ~KCOV_IN_CTXSW) != KCOV_MODE_DISABLED;
}
-void kcov_remote_softirq_start(struct task_struct *t)
+static void kcov_remote_softirq_start(struct task_struct *t)
{
struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
unsigned int mode;
@@ -792,7 +792,7 @@ void kcov_remote_softirq_start(struct task_struct *t)
}
}
-void kcov_remote_softirq_stop(struct task_struct *t)
+static void kcov_remote_softirq_stop(struct task_struct *t)
{
struct kcov_percpu_data *data = this_cpu_ptr(&kcov_percpu_data);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 78c0837bfd7b..ca40bef75a61 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1169,24 +1169,26 @@ int crash_exclude_mem_range(struct crash_mem *mem,
unsigned long long mstart, unsigned long long mend)
{
int i, j;
- unsigned long long start, end;
+ unsigned long long start, end, p_start, p_end;
struct crash_mem_range temp_range = {0, 0};
for (i = 0; i < mem->nr_ranges; i++) {
start = mem->ranges[i].start;
end = mem->ranges[i].end;
+ p_start = mstart;
+ p_end = mend;
if (mstart > end || mend < start)
continue;
/* Truncate any area outside of range */
if (mstart < start)
- mstart = start;
+ p_start = start;
if (mend > end)
- mend = end;
+ p_end = end;
/* Found completely overlapping range */
- if (mstart == start && mend == end) {
+ if (p_start == start && p_end == end) {
mem->ranges[i].start = 0;
mem->ranges[i].end = 0;
if (i < mem->nr_ranges - 1) {
@@ -1197,20 +1199,29 @@ int crash_exclude_mem_range(struct crash_mem *mem,
mem->ranges[j].end =
mem->ranges[j+1].end;
}
+
+ /*
+ * Continue to check if there are another overlapping ranges
+ * from the current position because of shifting the above
+ * mem ranges.
+ */
+ i--;
+ mem->nr_ranges--;
+ continue;
}
mem->nr_ranges--;
return 0;
}
- if (mstart > start && mend < end) {
+ if (p_start > start && p_end < end) {
/* Split original range */
- mem->ranges[i].end = mstart - 1;
- temp_range.start = mend + 1;
+ mem->ranges[i].end = p_start - 1;
+ temp_range.start = p_end + 1;
temp_range.end = end;
- } else if (mstart != start)
- mem->ranges[i].end = mstart - 1;
+ } else if (p_start != start)
+ mem->ranges[i].end = p_start - 1;
else
- mem->ranges[i].start = mend + 1;
+ mem->ranges[i].start = p_end + 1;
break;
}
@@ -1247,7 +1258,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
unsigned long long notes_addr;
unsigned long mstart, mend;
- /* extra phdr for vmcoreinfo elf note */
+ /* extra phdr for vmcoreinfo ELF note */
nr_phdr = nr_cpus + 1;
nr_phdr += mem->nr_ranges;
@@ -1255,7 +1266,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
* kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
* area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
* I think this is required by tools like gdb. So same physical
- * memory will be mapped in two elf headers. One will contain kernel
+ * memory will be mapped in two ELF headers. One will contain kernel
* text virtual addresses and other will have __va(physical) addresses.
*/
@@ -1282,7 +1293,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
ehdr->e_ehsize = sizeof(Elf64_Ehdr);
ehdr->e_phentsize = sizeof(Elf64_Phdr);
- /* Prepare one phdr of type PT_NOTE for each present cpu */
+ /* Prepare one phdr of type PT_NOTE for each present CPU */
for_each_present_cpu(cpu) {
phdr->p_type = PT_NOTE;
notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
@@ -1324,10 +1335,10 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
phdr->p_align = 0;
ehdr->e_phnum++;
- phdr++;
- pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
ehdr->e_phnum, phdr->p_offset);
+ phdr++;
}
*addr = buf;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 37c3c4b97b8e..3cd075ce2a1e 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,9 +36,8 @@
*
* If you need less than 50 threads would mean we're dealing with systems
* smaller than 3200 pages. This assumes you are capable of having ~13M memory,
- * and this would only be an be an upper limit, after which the OOM killer
- * would take effect. Systems like these are very unlikely if modules are
- * enabled.
+ * and this would only be an upper limit, after which the OOM killer would take
+ * effect. Systems like these are very unlikely if modules are enabled.
*/
#define MAX_KMOD_CONCURRENT 50
static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b2807e7be772..3edaa380dc7b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1258,8 +1258,7 @@ void kthread_use_mm(struct mm_struct *mm)
if (active_mm != mm)
mmdrop(active_mm);
- to_kthread(tsk)->oldfs = get_fs();
- set_fs(USER_DS);
+ to_kthread(tsk)->oldfs = force_uaccess_begin();
}
EXPORT_SYMBOL_GPL(kthread_use_mm);
@@ -1274,7 +1273,7 @@ void kthread_unuse_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(!tsk->mm);
- set_fs(to_kthread(tsk)->oldfs);
+ force_uaccess_end(to_kthread(tsk)->oldfs);
task_lock(tsk);
sync_mm_rss(mm);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 2fad21d345b0..54b74fabf40c 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3756,7 +3756,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
skip_checks:
/* we'll do an OFF -> ON transition: */
- this_cpu_write(hardirqs_enabled, 1);
+ __this_cpu_write(hardirqs_enabled, 1);
trace->hardirq_enable_ip = ip;
trace->hardirq_enable_event = ++trace->irq_events;
debug_atomic_inc(hardirqs_on_events);
@@ -3795,7 +3795,7 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
/*
* We have done an ON -> OFF transition:
*/
- this_cpu_write(hardirqs_enabled, 0);
+ __this_cpu_write(hardirqs_enabled, 0);
trace->hardirq_disable_ip = ip;
trace->hardirq_disable_event = ++trace->irq_events;
debug_atomic_inc(hardirqs_off_events);
@@ -4977,6 +4977,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
{
unsigned long flags;
+ trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
+
if (unlikely(current->lockdep_recursion)) {
/* XXX allow trylock from NMI ?!? */
if (lockdep_nmi() && !trylock) {
@@ -5001,7 +5003,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
check_flags(flags);
current->lockdep_recursion++;
- trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
__lock_acquire(lock, subclass, trylock, read, check,
irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
lockdep_recursion_finish();
@@ -5013,13 +5014,15 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
+ trace_lock_release(lock, ip);
+
if (unlikely(current->lockdep_recursion))
return;
raw_local_irq_save(flags);
check_flags(flags);
+
current->lockdep_recursion++;
- trace_lock_release(lock, ip);
if (__lock_release(lock, ip))
check_chain_key(current);
lockdep_recursion_finish();
@@ -5205,8 +5208,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
hlock->holdtime_stamp = now;
}
- trace_lock_acquired(lock, ip);
-
stats = get_lock_stats(hlock_class(hlock));
if (waittime) {
if (hlock->read)
@@ -5225,6 +5226,8 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
+ trace_lock_acquired(lock, ip);
+
if (unlikely(!lock_stat || !debug_locks))
return;
@@ -5234,7 +5237,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
raw_local_irq_save(flags);
check_flags(flags);
current->lockdep_recursion++;
- trace_lock_contended(lock, ip);
__lock_contended(lock, ip);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5245,6 +5247,8 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
+ trace_lock_contended(lock, ip);
+
if (unlikely(!lock_stat || !debug_locks))
return;
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 5525cd3ba0c8..02ef87f50df2 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -423,7 +423,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
seq_time(m, lt->min);
seq_time(m, lt->max);
seq_time(m, lt->total);
- seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);
+ seq_time(m, lt->nr ? div64_u64(lt->total, lt->nr) : 0);
}
static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
diff --git a/kernel/module.c b/kernel/module.c
index 8fa2600bde6a..1c5cff34d9f2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -422,7 +422,7 @@ static bool each_symbol_in_section(const struct symsearch *arr,
}
/* Returns true as soon as fn returns true, otherwise false. */
-bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
+static bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
struct module *owner,
void *data),
void *data)
@@ -484,7 +484,6 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
}
return false;
}
-EXPORT_SYMBOL_GPL(each_symbol_section);
struct find_symbol_arg {
/* Input */
@@ -496,6 +495,7 @@ struct find_symbol_arg {
struct module *owner;
const s32 *crc;
const struct kernel_symbol *sym;
+ enum mod_license license;
};
static bool check_exported_symbol(const struct symsearch *syms,
@@ -505,9 +505,9 @@ static bool check_exported_symbol(const struct symsearch *syms,
struct find_symbol_arg *fsa = data;
if (!fsa->gplok) {
- if (syms->licence == GPL_ONLY)
+ if (syms->license == GPL_ONLY)
return false;
- if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
+ if (syms->license == WILL_BE_GPL_ONLY && fsa->warn) {
pr_warn("Symbol %s is being used by a non-GPL module, "
"which will not be allowed in the future\n",
fsa->name);
@@ -529,6 +529,7 @@ static bool check_exported_symbol(const struct symsearch *syms,
fsa->owner = owner;
fsa->crc = symversion(syms->crcs, symnum);
fsa->sym = &syms->start[symnum];
+ fsa->license = syms->license;
return true;
}
@@ -585,9 +586,10 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms,
/* Find an exported symbol and return it, along with, (optional) crc and
* (optional) module which owns it. Needs preempt disabled or module_mutex. */
-const struct kernel_symbol *find_symbol(const char *name,
+static const struct kernel_symbol *find_symbol(const char *name,
struct module **owner,
const s32 **crc,
+ enum mod_license *license,
bool gplok,
bool warn)
{
@@ -602,13 +604,14 @@ const struct kernel_symbol *find_symbol(const char *name,
*owner = fsa.owner;
if (crc)
*crc = fsa.crc;
+ if (license)
+ *license = fsa.license;
return fsa.sym;
}
pr_debug("Failed to find symbol %s\n", name);
return NULL;
}
-EXPORT_SYMBOL_GPL(find_symbol);
/*
* Search for module by name: must hold module_mutex (or preempt disabled
@@ -869,7 +872,7 @@ static int add_module_usage(struct module *a, struct module *b)
}
/* Module a uses b: caller needs module_mutex() */
-int ref_module(struct module *a, struct module *b)
+static int ref_module(struct module *a, struct module *b)
{
int err;
@@ -888,7 +891,6 @@ int ref_module(struct module *a, struct module *b)
}
return 0;
}
-EXPORT_SYMBOL_GPL(ref_module);
/* Clear the unload stuff of the module. */
static void module_unload_free(struct module *mod)
@@ -1077,7 +1079,7 @@ void __symbol_put(const char *symbol)
struct module *owner;
preempt_disable();
- if (!find_symbol(symbol, &owner, NULL, true, false))
+ if (!find_symbol(symbol, &owner, NULL, NULL, true, false))
BUG();
module_put(owner);
preempt_enable();
@@ -1169,11 +1171,10 @@ static inline void module_unload_free(struct module *mod)
{
}
-int ref_module(struct module *a, struct module *b)
+static int ref_module(struct module *a, struct module *b)
{
return strong_try_module_get(b);
}
-EXPORT_SYMBOL_GPL(ref_module);
static inline int module_unload_init(struct module *mod)
{
@@ -1356,7 +1357,7 @@ static inline int check_modstruct_version(const struct load_info *info,
* locking is necessary -- use preempt_disable() to placate lockdep.
*/
preempt_disable();
- if (!find_symbol("module_layout", NULL, &crc, true, false)) {
+ if (!find_symbol("module_layout", NULL, &crc, NULL, true, false)) {
preempt_enable();
BUG();
}
@@ -1430,6 +1431,24 @@ static int verify_namespace_is_imported(const struct load_info *info,
return 0;
}
+static bool inherit_taint(struct module *mod, struct module *owner)
+{
+ if (!owner || !test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints))
+ return true;
+
+ if (mod->using_gplonly_symbols) {
+ pr_err("%s: module using GPL-only symbols uses symbols from proprietary module %s.\n",
+ mod->name, owner->name);
+ return false;
+ }
+
+ if (!test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints)) {
+ pr_warn("%s: module uses symbols from proprietary module %s, inheriting taint.\n",
+ mod->name, owner->name);
+ set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
+ }
+ return true;
+}
/* Resolve a symbol for this module. I.e. if we find one, record usage. */
static const struct kernel_symbol *resolve_symbol(struct module *mod,
@@ -1440,6 +1459,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
struct module *owner;
const struct kernel_symbol *sym;
const s32 *crc;
+ enum mod_license license;
int err;
/*
@@ -1449,11 +1469,19 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
*/
sched_annotate_sleep();
mutex_lock(&module_mutex);
- sym = find_symbol(name, &owner, &crc,
+ sym = find_symbol(name, &owner, &crc, &license,
!(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
if (!sym)
goto unlock;
+ if (license == GPL_ONLY)
+ mod->using_gplonly_symbols = true;
+
+ if (!inherit_taint(mod, owner)) {
+ sym = NULL;
+ goto getname;
+ }
+
if (!check_version(info, name, mod, crc)) {
sym = ERR_PTR(-EINVAL);
goto getname;
@@ -2236,7 +2264,7 @@ void *__symbol_get(const char *symbol)
const struct kernel_symbol *sym;
preempt_disable();
- sym = find_symbol(symbol, &owner, NULL, true, true);
+ sym = find_symbol(symbol, &owner, NULL, NULL, true, true);
if (sym && strong_try_module_get(owner))
sym = NULL;
preempt_enable();
@@ -2272,7 +2300,7 @@ static int verify_exported_symbols(struct module *mod)
for (i = 0; i < ARRAY_SIZE(arr); i++) {
for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
if (find_symbol(kernel_symbol_name(s), &owner, NULL,
- true, false)) {
+ NULL, true, false)) {
pr_err("%s: exports duplicate symbol %s"
" (owned by %s)\n",
mod->name, kernel_symbol_name(s),
@@ -4489,7 +4517,6 @@ struct module *__module_address(unsigned long addr)
}
return mod;
}
-EXPORT_SYMBOL_GPL(__module_address);
/*
* is_module_text_address - is this address inside module code?
@@ -4528,7 +4555,6 @@ struct module *__module_text_address(unsigned long addr)
}
return mod;
}
-EXPORT_SYMBOL_GPL(__module_text_address);
/* Don't grab lock, we're oopsing. */
void print_modules(void)
diff --git a/kernel/panic.c b/kernel/panic.c
index e2157ca387c8..aef8872ba843 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -505,7 +505,7 @@ static void do_oops_enter_exit(void)
* Return true if the calling CPU is allowed to print oops-related info.
* This is a bit racy..
*/
-int oops_may_print(void)
+bool oops_may_print(void)
{
return pause_on_oops_flag == 0;
}
@@ -551,7 +551,7 @@ static int init_oops_id(void)
}
late_initcall(init_oops_id);
-void print_oops_end_marker(void)
+static void print_oops_end_marker(void)
{
init_oops_id();
pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 5714f51ba9f8..e7aa57fb2fdc 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -659,7 +659,7 @@ static void power_down(void)
break;
case HIBERNATION_PLATFORM:
hibernation_platform_enter();
- /* Fall through */
+ fallthrough;
case HIBERNATION_SHUTDOWN:
if (pm_power_off)
kernel_power_off();
@@ -795,6 +795,103 @@ int hibernate(void)
return error;
}
+/**
+ * hibernate_quiet_exec - Execute a function with all devices frozen.
+ * @func: Function to execute.
+ * @data: Data pointer to pass to @func.
+ *
+ * Return the @func return value or an error code if it cannot be executed.
+ */
+int hibernate_quiet_exec(int (*func)(void *data), void *data)
+{
+ int error, nr_calls = 0;
+
+ lock_system_sleep();
+
+ if (!hibernate_acquire()) {
+ error = -EBUSY;
+ goto unlock;
+ }
+
+ pm_prepare_console();
+
+ error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
+ goto exit;
+ }
+
+ error = freeze_processes();
+ if (error)
+ goto exit;
+
+ lock_device_hotplug();
+
+ pm_suspend_clear_flags();
+
+ error = platform_begin(true);
+ if (error)
+ goto thaw;
+
+ error = freeze_kernel_threads();
+ if (error)
+ goto thaw;
+
+ error = dpm_prepare(PMSG_FREEZE);
+ if (error)
+ goto dpm_complete;
+
+ suspend_console();
+
+ error = dpm_suspend(PMSG_FREEZE);
+ if (error)
+ goto dpm_resume;
+
+ error = dpm_suspend_end(PMSG_FREEZE);
+ if (error)
+ goto dpm_resume;
+
+ error = platform_pre_snapshot(true);
+ if (error)
+ goto skip;
+
+ error = func(data);
+
+skip:
+ platform_finish(true);
+
+ dpm_resume_start(PMSG_THAW);
+
+dpm_resume:
+ dpm_resume(PMSG_THAW);
+
+ resume_console();
+
+dpm_complete:
+ dpm_complete(PMSG_THAW);
+
+ thaw_kernel_threads();
+
+thaw:
+ platform_end(true);
+
+ unlock_device_hotplug();
+
+ thaw_processes();
+
+exit:
+ __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
+
+ pm_restore_console();
+
+ hibernate_release();
+
+unlock:
+ unlock_system_sleep();
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(hibernate_quiet_exec);
/**
* software_resume - Resume from a saved hibernation image.
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index db0bed2cae26..ec7e1e85923e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -119,7 +119,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
* and add, then see if the aggregate has changed.
*/
plist_del(node, &c->list);
- /* fall through */
+ fallthrough;
case PM_QOS_ADD_REQ:
plist_node_init(node, new_value);
plist_add(node, &c->list);
@@ -188,7 +188,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,
break;
case PM_QOS_UPDATE_REQ:
pm_qos_flags_remove_req(pqf, req);
- /* fall through */
+ fallthrough;
case PM_QOS_ADD_REQ:
req->flags = val;
INIT_LIST_HEAD(&req->node);
diff --git a/kernel/relay.c b/kernel/relay.c
index 72fe443ea78f..fb4e0c530c08 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -197,6 +197,7 @@ free_buf:
static void relay_destroy_channel(struct kref *kref)
{
struct rchan *chan = container_of(kref, struct rchan, kref);
+ free_percpu(chan->buf);
kfree(chan);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 84758f34cdb0..2d95dc3f4644 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2320,7 +2320,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
state = possible;
break;
}
- /* Fall-through */
+ fallthrough;
case possible:
do_set_cpus_allowed(p, cpu_possible_mask);
state = fail;
@@ -6431,10 +6431,10 @@ void sched_show_task(struct task_struct *p)
if (!try_get_task_stack(p))
return;
- printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
if (p->state == TASK_RUNNING)
- printk(KERN_CONT " running task ");
+ pr_cont(" running task ");
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
@@ -6443,8 +6443,8 @@ void sched_show_task(struct task_struct *p)
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), ppid,
+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
+ free, task_pid_nr(p), ppid,
(unsigned long)task_thread_info(p)->flags);
print_worker_info(KERN_INFO, p);
@@ -6479,13 +6479,6 @@ void show_state_filter(unsigned long state_filter)
{
struct task_struct *g, *p;
-#if BITS_PER_LONG == 32
- printk(KERN_INFO
- " task PC stack pid father\n");
-#else
- printk(KERN_INFO
- " task PC stack pid father\n");
-#endif
rcu_read_lock();
for_each_process_thread(g, p) {
/*
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 6bf34986f45c..f324dc36fc43 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -54,17 +54,18 @@ __setup("hlt", cpu_idle_nopoll_setup);
static noinline int __cpuidle cpu_idle_poll(void)
{
+ trace_cpu_idle(0, smp_processor_id());
+ stop_critical_timings();
rcu_idle_enter();
- trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
- stop_critical_timings();
while (!tif_need_resched() &&
- (cpu_idle_force_poll || tick_check_broadcast_expired()))
+ (cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
- start_critical_timings();
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+
rcu_idle_exit();
+ start_critical_timings();
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
return 1;
}
@@ -90,9 +91,14 @@ void __cpuidle default_idle_call(void)
if (current_clr_polling_and_test()) {
local_irq_enable();
} else {
+
+ trace_cpu_idle(1, smp_processor_id());
stop_critical_timings();
+ rcu_idle_enter();
arch_cpu_idle();
+ rcu_idle_exit();
start_critical_timings();
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
}
@@ -158,7 +164,6 @@ static void cpuidle_idle_call(void)
if (cpuidle_not_available(drv, dev)) {
tick_nohz_idle_stop_tick();
- rcu_idle_enter();
default_idle_call();
goto exit_idle;
@@ -178,21 +183,17 @@ static void cpuidle_idle_call(void)
u64 max_latency_ns;
if (idle_should_enter_s2idle()) {
- rcu_idle_enter();
entered_state = call_cpuidle_s2idle(drv, dev);
if (entered_state > 0)
goto exit_idle;
- rcu_idle_exit();
-
max_latency_ns = U64_MAX;
} else {
max_latency_ns = dev->forced_idle_latency_limit_ns;
}
tick_nohz_idle_stop_tick();
- rcu_idle_enter();
next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
call_cpuidle(drv, dev, next_state);
@@ -209,8 +210,6 @@ static void cpuidle_idle_call(void)
else
tick_nohz_idle_retain_tick();
- rcu_idle_enter();
-
entered_state = call_cpuidle(drv, dev, next_state);
/*
* Give the governor an opportunity to reflect on the outcome
@@ -226,8 +225,6 @@ exit_idle:
*/
if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
-
- rcu_idle_exit();
}
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3fd283892761..28709f6b0975 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1999,7 +1999,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
if (trace_sched_update_nr_running_tp_enabled()) {
- call_trace_sched_update_nr_running(rq, count);
+ call_trace_sched_update_nr_running(rq, -count);
}
/* Check if we still need preemption */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 007b0a6b0152..1bd7e3af904f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1219,13 +1219,13 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
case sa_rootdomain:
if (!atomic_read(&d->rd->refcount))
free_rootdomain(&d->rd->rcu);
- /* Fall through */
+ fallthrough;
case sa_sd:
free_percpu(d->sd);
- /* Fall through */
+ fallthrough;
case sa_sd_storage:
__sdt_free(cpu_map);
- /* Fall through */
+ fallthrough;
case sa_none:
break;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 6f16f7c5d375..a38b3edc6851 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -851,7 +851,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,
*/
if (!sid || sid == task_session(current))
break;
- /* fall through */
+ fallthrough;
default:
return -EPERM;
}
@@ -2541,7 +2541,21 @@ bool get_signal(struct ksignal *ksig)
relock:
spin_lock_irq(&sighand->siglock);
- current->jobctl &= ~JOBCTL_TASK_WORK;
+ /*
+ * Make sure we can safely read ->jobctl() in task_work add. As Oleg
+ * states:
+ *
+ * It pairs with mb (implied by cmpxchg) before READ_ONCE. So we
+ * roughly have
+ *
+ * task_work_add: get_signal:
+ * STORE(task->task_works, new_work); STORE(task->jobctl);
+ * mb(); mb();
+ * LOAD(task->jobctl); LOAD(task->task_works);
+ *
+ * and we can rely on STORE-MB-LOAD [ in task_work_add].
+ */
+ smp_store_mb(current->jobctl, current->jobctl & ~JOBCTL_TASK_WORK);
if (unlikely(current->task_works)) {
spin_unlock_irq(&sighand->siglock);
task_work_run();
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 2af66e449aa6..946f44a9e86a 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -233,10 +233,9 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
if (current->flags & PF_KTHREAD)
return 0;
- fs = get_fs();
- set_fs(USER_DS);
+ fs = force_uaccess_begin();
arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
- set_fs(fs);
+ force_uaccess_end(fs);
return c.len;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index ca11af9d815d..ab6c409b1159 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1753,7 +1753,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
if (who == RUSAGE_CHILDREN)
break;
- /* fall through */
+ fallthrough;
case RUSAGE_SELF:
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 3b69a560a7ac..4d59775ea79c 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -364,7 +364,6 @@ COND_SYSCALL(socketcall);
COND_SYSCALL_COMPAT(socketcall);
/* compat syscalls for arm64, x86, ... */
-COND_SYSCALL_COMPAT(sysctl);
COND_SYSCALL_COMPAT(fanotify_mark);
/* x86 */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f785de3caac0..287862f91717 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2852,6 +2852,15 @@ static struct ctl_table vm_table[] = {
.proc_handler = sysctl_compaction_handler,
},
{
+ .procname = "compaction_proactiveness",
+ .data = &sysctl_compaction_proactiveness,
+ .maxlen = sizeof(sysctl_compaction_proactiveness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &one_hundred,
+ },
+ {
.procname = "extfrag_threshold",
.data = &sysctl_extfrag_threshold,
.maxlen = sizeof(int),
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
deleted file mode 100644
index 7d550cc76a3b..000000000000
--- a/kernel/sysctl_binary.c
+++ /dev/null
@@ -1,171 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/stat.h>
-#include <linux/sysctl.h>
-#include "../fs/xfs/xfs_sysctl.h"
-#include <linux/sunrpc/debug.h>
-#include <linux/string.h>
-#include <linux/syscalls.h>
-#include <linux/namei.h>
-#include <linux/mount.h>
-#include <linux/fs.h>
-#include <linux/nsproxy.h>
-#include <linux/pid_namespace.h>
-#include <linux/file.h>
-#include <linux/ctype.h>
-#include <linux/netdevice.h>
-#include <linux/kernel.h>
-#include <linux/uuid.h>
-#include <linux/slab.h>
-#include <linux/compat.h>
-
-static ssize_t binary_sysctl(const int *name, int nlen,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- return -ENOSYS;
-}
-
-static void deprecated_sysctl_warning(const int *name, int nlen)
-{
- int i;
-
- /*
- * CTL_KERN/KERN_VERSION is used by older glibc and cannot
- * ever go away.
- */
- if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION)
- return;
-
- if (printk_ratelimit()) {
- printk(KERN_INFO
- "warning: process `%s' used the deprecated sysctl "
- "system call with ", current->comm);
- for (i = 0; i < nlen; i++)
- printk(KERN_CONT "%d.", name[i]);
- printk(KERN_CONT "\n");
- }
- return;
-}
-
-#define WARN_ONCE_HASH_BITS 8
-#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS)
-
-static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE);
-
-#define FNV32_OFFSET 2166136261U
-#define FNV32_PRIME 0x01000193
-
-/*
- * Print each legacy sysctl (approximately) only once.
- * To avoid making the tables non-const use a external
- * hash-table instead.
- * Worst case hash collision: 6, but very rarely.
- * NOTE! We don't use the SMP-safe bit tests. We simply
- * don't care enough.
- */
-static void warn_on_bintable(const int *name, int nlen)
-{
- int i;
- u32 hash = FNV32_OFFSET;
-
- for (i = 0; i < nlen; i++)
- hash = (hash ^ name[i]) * FNV32_PRIME;
- hash %= WARN_ONCE_HASH_SIZE;
- if (__test_and_set_bit(hash, warn_once_bitmap))
- return;
- deprecated_sysctl_warning(name, nlen);
-}
-
-static ssize_t do_sysctl(int __user *args_name, int nlen,
- void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
- int name[CTL_MAXNAME];
- int i;
-
- /* Check args->nlen. */
- if (nlen < 0 || nlen > CTL_MAXNAME)
- return -ENOTDIR;
- /* Read in the sysctl name for simplicity */
- for (i = 0; i < nlen; i++)
- if (get_user(name[i], args_name + i))
- return -EFAULT;
-
- warn_on_bintable(name, nlen);
-
- return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
-}
-
-SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
-{
- struct __sysctl_args tmp;
- size_t oldlen = 0;
- ssize_t result;
-
- if (copy_from_user(&tmp, args, sizeof(tmp)))
- return -EFAULT;
-
- if (tmp.oldval && !tmp.oldlenp)
- return -EFAULT;
-
- if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp))
- return -EFAULT;
-
- result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen,
- tmp.newval, tmp.newlen);
-
- if (result >= 0) {
- oldlen = result;
- result = 0;
- }
-
- if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp))
- return -EFAULT;
-
- return result;
-}
-
-
-#ifdef CONFIG_COMPAT
-
-struct compat_sysctl_args {
- compat_uptr_t name;
- int nlen;
- compat_uptr_t oldval;
- compat_uptr_t oldlenp;
- compat_uptr_t newval;
- compat_size_t newlen;
- compat_ulong_t __unused[4];
-};
-
-COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args)
-{
- struct compat_sysctl_args tmp;
- compat_size_t __user *compat_oldlenp;
- size_t oldlen = 0;
- ssize_t result;
-
- if (copy_from_user(&tmp, args, sizeof(tmp)))
- return -EFAULT;
-
- if (tmp.oldval && !tmp.oldlenp)
- return -EFAULT;
-
- compat_oldlenp = compat_ptr(tmp.oldlenp);
- if (compat_oldlenp && get_user(oldlen, compat_oldlenp))
- return -EFAULT;
-
- result = do_sysctl(compat_ptr(tmp.name), tmp.nlen,
- compat_ptr(tmp.oldval), oldlen,
- compat_ptr(tmp.newval), tmp.newlen);
-
- if (result >= 0) {
- oldlen = result;
- result = 0;
- }
-
- if (compat_oldlenp && put_user(oldlen, compat_oldlenp))
- return -EFAULT;
-
- return result;
-}
-
-#endif /* CONFIG_COMPAT */
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 5c0848ca1287..613b2d634af8 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -42,7 +42,13 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify)
set_notify_resume(task);
break;
case TWA_SIGNAL:
- if (lock_task_sighand(task, &flags)) {
+ /*
+ * Only grab the sighand lock if we don't already have some
+ * task_work pending. This pairs with the smp_store_mb()
+ * in get_signal(), see comment there.
+ */
+ if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) &&
+ lock_task_sighand(task, &flags)) {
task->jobctl |= JOBCTL_TASK_WORK;
signal_wake_up(task, 0);
unlock_task_sighand(task, &flags);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index fcc42353f125..a09b1d61df6a 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -52,6 +52,15 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST
config GENERIC_CMOS_UPDATE
bool
+# Select to handle posix CPU timers from task_work
+# and not from the timer interrupt context
+config HAVE_POSIX_CPU_TIMERS_TASK_WORK
+ bool
+
+config POSIX_CPU_TIMERS_TASK_WORK
+ bool
+ default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK
+
if GENERIC_CLOCKEVENTS
menu "Timers subsystem"
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2ffb466af77e..ca223a89530a 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -192,7 +192,7 @@ static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
* When a alarm timer fires, this runs through the timerqueue to
* see which alarms expired, and runs those. If there are more alarm
* timers queued for the future, we set the hrtimer to fire when
- * when the next future alarm timer expires.
+ * the next future alarm timer expires.
*/
static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
{
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d89da1c7e005..95b6a708b040 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -135,7 +135,11 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
* timer->base->cpu_base
*/
static struct hrtimer_cpu_base migration_cpu_base = {
- .clock_base = { { .cpu_base = &migration_cpu_base, }, },
+ .clock_base = { {
+ .cpu_base = &migration_cpu_base,
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
+ &migration_cpu_base.lock),
+ }, },
};
#define migration_base migration_cpu_base.clock_base[0]
@@ -373,7 +377,7 @@ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
- /* fall through */
+ fallthrough;
default:
return false;
}
@@ -1998,8 +2002,11 @@ int hrtimers_prepare_cpu(unsigned int cpu)
int i;
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- cpu_base->clock_base[i].cpu_base = cpu_base;
- timerqueue_init_head(&cpu_base->clock_base[i].active);
+ struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];
+
+ clock_b->cpu_base = cpu_base;
+ seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
+ timerqueue_init_head(&clock_b->active);
}
cpu_base->cpu = cpu;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 165117996ea0..a71758e34e45 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -377,6 +377,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
*/
static int posix_cpu_timer_create(struct k_itimer *new_timer)
{
+ static struct lock_class_key posix_cpu_timers_key;
struct pid *pid;
rcu_read_lock();
@@ -386,6 +387,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
return -EINVAL;
}
+ /*
+ * If posix timer expiry is handled in task work context then
+ * timer::it_lock can be taken without disabling interrupts as all
+ * other locking happens in task context. This requires a seperate
+ * lock class key otherwise regular posix timer expiry would record
+ * the lock class being taken in interrupt context and generate a
+ * false positive warning.
+ */
+ if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK))
+ lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key);
+
new_timer->kclock = &clock_posix_cpu;
timerqueue_init(&new_timer->it.cpu.node);
new_timer->it.cpu.pid = get_pid(pid);
@@ -1080,43 +1092,163 @@ static inline bool fastpath_timer_check(struct task_struct *tsk)
return false;
}
+static void handle_posix_cpu_timers(struct task_struct *tsk);
+
+#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+static void posix_cpu_timers_work(struct callback_head *work)
+{
+ handle_posix_cpu_timers(current);
+}
+
/*
- * This is called from the timer interrupt handler. The irq handler has
- * already updated our counts. We need to check if any timers fire now.
- * Interrupts are disabled.
+ * Initialize posix CPU timers task work in init task. Out of line to
+ * keep the callback static and to avoid header recursion hell.
*/
-void run_posix_cpu_timers(void)
+void __init posix_cputimers_init_work(void)
{
- struct task_struct *tsk = current;
- struct k_itimer *timer, *next;
- unsigned long flags;
- LIST_HEAD(firing);
+ init_task_work(&current->posix_cputimers_work.work,
+ posix_cpu_timers_work);
+}
- lockdep_assert_irqs_disabled();
+/*
+ * Note: All operations on tsk->posix_cputimer_work.scheduled happen either
+ * in hard interrupt context or in task context with interrupts
+ * disabled. Aside of that the writer/reader interaction is always in the
+ * context of the current task, which means they are strict per CPU.
+ */
+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
+{
+ return tsk->posix_cputimers_work.scheduled;
+}
- /*
- * The fast path checks that there are no expired thread or thread
- * group timers. If that's so, just return.
- */
- if (!fastpath_timer_check(tsk))
+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
+{
+ if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled))
return;
- lockdep_posixtimer_enter();
- if (!lock_task_sighand(tsk, &flags)) {
- lockdep_posixtimer_exit();
- return;
+ /* Schedule task work to actually expire the timers */
+ tsk->posix_cputimers_work.scheduled = true;
+ task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME);
+}
+
+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
+ unsigned long start)
+{
+ bool ret = true;
+
+ /*
+ * On !RT kernels interrupts are disabled while collecting expired
+ * timers, so no tick can happen and the fast path check can be
+ * reenabled without further checks.
+ */
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ tsk->posix_cputimers_work.scheduled = false;
+ return true;
}
+
/*
- * Here we take off tsk->signal->cpu_timers[N] and
- * tsk->cpu_timers[N] all the timers that are firing, and
- * put them on the firing list.
+ * On RT enabled kernels ticks can happen while the expired timers
+ * are collected under sighand lock. But any tick which observes
+ * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath
+ * checks. So reenabling the tick work has do be done carefully:
+ *
+ * Disable interrupts and run the fast path check if jiffies have
+ * advanced since the collecting of expired timers started. If
+ * jiffies have not advanced or the fast path check did not find
+ * newly expired timers, reenable the fast path check in the timer
+ * interrupt. If there are newly expired timers, return false and
+ * let the collection loop repeat.
*/
- check_thread_timers(tsk, &firing);
+ local_irq_disable();
+ if (start != jiffies && fastpath_timer_check(tsk))
+ ret = false;
+ else
+ tsk->posix_cputimers_work.scheduled = false;
+ local_irq_enable();
+
+ return ret;
+}
+#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
+{
+ lockdep_posixtimer_enter();
+ handle_posix_cpu_timers(tsk);
+ lockdep_posixtimer_exit();
+}
+
+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
+{
+ return false;
+}
+
+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
+ unsigned long start)
+{
+ return true;
+}
+#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
+
+static void handle_posix_cpu_timers(struct task_struct *tsk)
+{
+ struct k_itimer *timer, *next;
+ unsigned long flags, start;
+ LIST_HEAD(firing);
+
+ if (!lock_task_sighand(tsk, &flags))
+ return;
- check_process_timers(tsk, &firing);
+ do {
+ /*
+ * On RT locking sighand lock does not disable interrupts,
+ * so this needs to be careful vs. ticks. Store the current
+ * jiffies value.
+ */
+ start = READ_ONCE(jiffies);
+ barrier();
+
+ /*
+ * Here we take off tsk->signal->cpu_timers[N] and
+ * tsk->cpu_timers[N] all the timers that are firing, and
+ * put them on the firing list.
+ */
+ check_thread_timers(tsk, &firing);
+
+ check_process_timers(tsk, &firing);
+
+ /*
+ * The above timer checks have updated the exipry cache and
+ * because nothing can have queued or modified timers after
+ * sighand lock was taken above it is guaranteed to be
+ * consistent. So the next timer interrupt fastpath check
+ * will find valid data.
+ *
+ * If timer expiry runs in the timer interrupt context then
+ * the loop is not relevant as timers will be directly
+ * expired in interrupt context. The stub function below
+ * returns always true which allows the compiler to
+ * optimize the loop out.
+ *
+ * If timer expiry is deferred to task work context then
+ * the following rules apply:
+ *
+ * - On !RT kernels no tick can have happened on this CPU
+ * after sighand lock was acquired because interrupts are
+ * disabled. So reenabling task work before dropping
+ * sighand lock and reenabling interrupts is race free.
+ *
+ * - On RT kernels ticks might have happened but the tick
+ * work ignored posix CPU timer handling because the
+ * CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work
+ * must be done very carefully including a check whether
+ * ticks have happened since the start of the timer
+ * expiry checks. posix_cpu_timers_enable_work() takes
+ * care of that and eventually lets the expiry checks
+ * run again.
+ */
+ } while (!posix_cpu_timers_enable_work(tsk, start));
/*
- * We must release these locks before taking any timer's lock.
+ * We must release sighand lock before taking any timer's lock.
* There is a potential race with timer deletion here, as the
* siglock now protects our private firing list. We have set
* the firing flag in each timer, so that a deletion attempt
@@ -1134,6 +1266,13 @@ void run_posix_cpu_timers(void)
list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
int cpu_firing;
+ /*
+ * spin_lock() is sufficient here even independent of the
+ * expiry context. If expiry happens in hard interrupt
+ * context it's obvious. For task work context it's safe
+ * because all other operations on timer::it_lock happen in
+ * task context (syscall or exit).
+ */
spin_lock(&timer->it_lock);
list_del_init(&timer->it.cpu.elist);
cpu_firing = timer->it.cpu.firing;
@@ -1147,7 +1286,34 @@ void run_posix_cpu_timers(void)
cpu_timer_fire(timer);
spin_unlock(&timer->it_lock);
}
- lockdep_posixtimer_exit();
+}
+
+/*
+ * This is called from the timer interrupt handler. The irq handler has
+ * already updated our counts. We need to check if any timers fire now.
+ * Interrupts are disabled.
+ */
+void run_posix_cpu_timers(void)
+{
+ struct task_struct *tsk = current;
+
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * If the actual expiry is deferred to task work context and the
+ * work is already scheduled there is no point to do anything here.
+ */
+ if (posix_cpu_timers_work_scheduled(tsk))
+ return;
+
+ /*
+ * The fast path checks that there are no expired thread or thread
+ * group timers. If that's so, just return.
+ */
+ if (!fastpath_timer_check(tsk))
+ return;
+
+ __run_posix_cpu_timers(tsk);
}
/*
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 07709ac30439..bf540f5a4115 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -439,12 +439,12 @@ static struct pid *good_sigevent(sigevent_t * event)
rtn = pid_task(pid, PIDTYPE_PID);
if (!rtn || !same_thread_group(rtn, current))
return NULL;
- /* FALLTHRU */
+ fallthrough;
case SIGEV_SIGNAL:
case SIGEV_THREAD:
if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
return NULL;
- /* FALLTHRU */
+ fallthrough;
case SIGEV_NONE:
return pid;
default:
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0deaf4b79fb4..1c03eec6ca9b 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -229,7 +229,7 @@ void __init generic_sched_clock_init(void)
{
/*
* If no sched_clock() function has been provided at that point,
- * make it the final one one.
+ * make it the final one.
*/
if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index e51778c312f1..36d7464c8962 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -381,7 +381,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
switch (mode) {
case TICK_BROADCAST_FORCE:
tick_broadcast_forced = 1;
- /* fall through */
+ fallthrough;
case TICK_BROADCAST_ON:
cpumask_set_cpu(cpu, tick_broadcast_on);
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 63a632f9896c..4c47f388a83f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -39,18 +39,19 @@ enum timekeeping_adv_mode {
TK_ADV_FREQ
};
+DEFINE_RAW_SPINLOCK(timekeeper_lock);
+
/*
* The most important data for readout fits into a single 64 byte
* cache line.
*/
static struct {
- seqcount_t seq;
+ seqcount_raw_spinlock_t seq;
struct timekeeper timekeeper;
} tk_core ____cacheline_aligned = {
- .seq = SEQCNT_ZERO(tk_core.seq),
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock),
};
-static DEFINE_RAW_SPINLOCK(timekeeper_lock);
static struct timekeeper shadow_timekeeper;
/**
@@ -63,7 +64,7 @@ static struct timekeeper shadow_timekeeper;
* See @update_fast_timekeeper() below.
*/
struct tk_fast {
- seqcount_t seq;
+ seqcount_raw_spinlock_t seq;
struct tk_read_base base[2];
};
@@ -80,11 +81,13 @@ static struct clocksource dummy_clock = {
};
static struct tk_fast tk_fast_mono ____cacheline_aligned = {
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock),
.base[0] = { .clock = &dummy_clock, },
.base[1] = { .clock = &dummy_clock, },
};
static struct tk_fast tk_fast_raw ____cacheline_aligned = {
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock),
.base[0] = { .clock = &dummy_clock, },
.base[1] = { .clock = &dummy_clock, },
};
@@ -157,7 +160,7 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
* tk_clock_read - atomic clocksource read() helper
*
* This helper is necessary to use in the read paths because, while the
- * seqlock ensures we don't return a bad value while structures are updated,
+ * seqcount ensures we don't return a bad value while structures are updated,
* it doesn't protect from potential crashes. There is the possibility that
* the tkr's clocksource may change between the read reference, and the
* clock reference passed to the read function. This can cause crashes if
@@ -222,10 +225,10 @@ static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr)
unsigned int seq;
/*
- * Since we're called holding a seqlock, the data may shift
+ * Since we're called holding a seqcount, the data may shift
* under us while we're doing the calculation. This can cause
* false positives, since we'd note a problem but throw the
- * results away. So nest another seqlock here to atomically
+ * results away. So nest another seqcount here to atomically
* grab the points we are checking with.
*/
do {
@@ -486,7 +489,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
*
* To keep it NMI safe since we're accessing from tracing, we're not using a
* separate timekeeper with updates to monotonic clock and boot offset
- * protected with seqlocks. This has the following minor side effects:
+ * protected with seqcounts. This has the following minor side effects:
*
* (1) Its possible that a timestamp be taken after the boot offset is updated
* but before the timekeeper is updated. If this happens, the new boot offset
@@ -2001,7 +2004,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
* logarithmic_accumulation - shifted accumulation of cycles
*
* This functions accumulates a shifted interval of cycles into
- * into a shifted interval nanoseconds. Allows for O(log) accumulation
+ * a shifted interval nanoseconds. Allows for O(log) accumulation
* loop.
*
* Returns the unconsumed cycles.
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index bcbb52db2256..4ca2787d1642 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -1,12 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _TIMEKEEPING_INTERNAL_H
#define _TIMEKEEPING_INTERNAL_H
-/*
- * timekeeping debug functions
- */
+
#include <linux/clocksource.h>
+#include <linux/spinlock.h>
#include <linux/time.h>
+/*
+ * timekeeping debug functions
+ */
#ifdef CONFIG_DEBUG_FS
extern void tk_debug_account_sleep_time(const struct timespec64 *t);
#else
@@ -31,4 +33,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
}
#endif
+/* Semi public for serialization of non timekeeper VDSO updates. */
+extern raw_spinlock_t timekeeper_lock;
+
#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ae5029f984a8..a50364df1054 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -666,7 +666,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
- /* fall through */
+ fallthrough;
default:
return false;
}
@@ -2017,6 +2017,7 @@ static void __init init_timer_cpus(void)
void __init init_timers(void)
{
init_timer_cpus();
+ posix_cputimers_init_work();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 54ce6eb2ca36..88e6b8ed6ca5 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -13,6 +13,8 @@
#include <vdso/helpers.h>
#include <vdso/vsyscall.h>
+#include "timekeeping_internal.h"
+
static inline void update_vdso_data(struct vdso_data *vdata,
struct timekeeper *tk)
{
@@ -127,3 +129,42 @@ void update_vsyscall_tz(void)
__arch_sync_vdso_data(vdata);
}
+
+/**
+ * vdso_update_begin - Start of a VDSO update section
+ *
+ * Allows architecture code to safely update the architecture specific VDSO
+ * data. Disables interrupts, acquires timekeeper lock to serialize against
+ * concurrent updates from timekeeping and invalidates the VDSO data
+ * sequence counter to prevent concurrent readers from accessing
+ * inconsistent data.
+ *
+ * Returns: Saved interrupt flags which need to be handed in to
+ * vdso_update_end().
+ */
+unsigned long vdso_update_begin(void)
+{
+ struct vdso_data *vdata = __arch_get_k_vdso_data();
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ vdso_write_begin(vdata);
+ return flags;
+}
+
+/**
+ * vdso_update_end - End of a VDSO update section
+ * @flags: Interrupt flags as returned from vdso_update_begin()
+ *
+ * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data
+ * synchronization if the architecture requires it, drops timekeeper lock
+ * and restores interrupt flags.
+ */
+void vdso_update_end(unsigned long flags)
+{
+ struct vdso_data *vdata = __arch_get_k_vdso_data();
+
+ vdso_write_end(vdata);
+ __arch_sync_vdso_data(vdata);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+}
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7ba62d68885a..4b3a42fc3b24 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -745,7 +745,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
#endif
case BLKTRACESTART:
start = 1;
- /* fall through */
+ fallthrough;
case BLKTRACESTOP:
ret = __blk_trace_startstop(q, start);
break;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index cb91ef902cc4..a8d4f253ed77 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -383,7 +383,7 @@ static DEFINE_RAW_SPINLOCK(trace_printk_lock);
#define BPF_TRACE_PRINTK_SIZE 1024
-static inline __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...)
+static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...)
{
static char buf[BPF_TRACE_PRINTK_SIZE];
unsigned long flags;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index bf44f6bbd0c3..78a678eeb140 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -499,7 +499,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
ptr++;
break;
}
- /* fall through */
+ fallthrough;
default:
parse_error(pe, FILT_ERR_TOO_MANY_PREDS,
next - str);
@@ -1273,7 +1273,7 @@ static int parse_pred(const char *str, void *data,
switch (op) {
case OP_NE:
pred->not = 1;
- /* Fall through */
+ fallthrough;
case OP_GLOB:
case OP_EQ:
break;
diff --git a/kernel/umh.c b/kernel/umh.c
index a25433f9cd9a..fcf3ee803630 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -119,37 +119,16 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
{
pid_t pid;
- /* If SIGCLD is ignored kernel_wait4 won't populate the status. */
+ /* If SIGCLD is ignored do_wait won't populate the status. */
kernel_sigaction(SIGCHLD, SIG_DFL);
pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
- if (pid < 0) {
+ if (pid < 0)
sub_info->retval = pid;
- } else {
- int ret = -ECHILD;
- /*
- * Normally it is bogus to call wait4() from in-kernel because
- * wait4() wants to write the exit code to a userspace address.
- * But call_usermodehelper_exec_sync() always runs as kernel
- * thread (workqueue) and put_user() to a kernel address works
- * OK for kernel threads, due to their having an mm_segment_t
- * which spans the entire address space.
- *
- * Thus the __user pointer cast is valid here.
- */
- kernel_wait4(pid, (int __user *)&ret, 0, NULL);
-
- /*
- * If ret is 0, either call_usermodehelper_exec_async failed and
- * the real error code is already in sub_info->retval or
- * sub_info->retval is 0 anyway, so don't mess with it then.
- */
- if (ret)
- sub_info->retval = ret;
- }
+ else
+ kernel_wait(pid, &sub_info->retval);
/* Restore default kernel sig handler */
kernel_sigaction(SIGCHLD, SIG_IGN);
-
umh_complete(sub_info);
}
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index f74020f6bd9d..0ef8f65bd2d7 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -393,6 +393,7 @@ static void free_watch(struct rcu_head *rcu)
struct watch *watch = container_of(rcu, struct watch, rcu);
put_watch_queue(rcu_access_pointer(watch->queue));
+ atomic_dec(&watch->cred->user->nr_watches);
put_cred(watch->cred);
}
@@ -452,6 +453,13 @@ int add_watch_to_object(struct watch *watch, struct watch_list *wlist)
watch->cred = get_current_cred();
rcu_assign_pointer(watch->watch_list, wlist);
+ if (atomic_inc_return(&watch->cred->user->nr_watches) >
+ task_rlimit(current, RLIMIT_NOFILE)) {
+ atomic_dec(&watch->cred->user->nr_watches);
+ put_cred(watch->cred);
+ return -EAGAIN;
+ }
+
spin_lock_bh(&wqueue->lock);
kref_get(&wqueue->usage);
kref_get(&watch->usage);