summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/offload.c27
-rw-r--r--kernel/bpf/syscall.c40
-rw-r--r--kernel/bpf/verifier.c31
-rw-r--r--kernel/compat.c77
-rw-r--r--kernel/crash_core.c3
-rw-r--r--kernel/events/core.c14
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/irq/manage.c13
-rw-r--r--kernel/irq/matrix.c2
-rw-r--r--kernel/irq/spurious.c4
-rw-r--r--kernel/jump_label.c2
-rw-r--r--kernel/kallsyms.c51
-rw-r--r--kernel/kcov.c216
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/module.c8
-rw-r--r--kernel/padata.c6
-rw-r--r--kernel/panic.c47
-rw-r--r--kernel/pid.c247
-rw-r--r--kernel/pid_namespace.c59
-rw-r--r--kernel/printk/printk.c2
-rw-r--r--kernel/printk/printk_safe.c17
-rw-r--r--kernel/reboot.c27
-rw-r--r--kernel/sched/core.c36
-rw-r--r--kernel/signal.c95
-rw-r--r--kernel/sysctl.c65
-rw-r--r--kernel/time/Kconfig4
-rw-r--r--kernel/time/clocksource.c5
-rw-r--r--kernel/time/timekeeping.c45
-rw-r--r--kernel/time/timer.c40
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/trace/Kconfig11
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/blktrace.c30
-rw-r--r--kernel/trace/bpf_trace.c12
-rw-r--r--kernel/trace/ftrace.c354
-rw-r--r--kernel/trace/ring_buffer.c64
-rw-r--r--kernel/trace/trace.c91
-rw-r--r--kernel/trace/trace.h9
-rw-r--r--kernel/trace/trace_event_perf.c82
-rw-r--r--kernel/trace/trace_events.c31
-rw-r--r--kernel/trace/trace_events_hist.c128
-rw-r--r--kernel/trace/trace_irqsoff.c133
-rw-r--r--kernel/trace/trace_kprobe.c22
-rw-r--r--kernel/trace/trace_probe.c86
-rw-r--r--kernel/trace/trace_probe.h7
-rw-r--r--kernel/trace/trace_selftest.c34
-rw-r--r--kernel/trace/trace_syscalls.c4
-rw-r--r--kernel/trace/trace_uprobe.c4
-rw-r--r--kernel/trace/tracing_map.c3
-rw-r--r--kernel/trace/tracing_map.h2
-rw-r--r--kernel/umh.c4
-rw-r--r--kernel/workqueue.c2
52 files changed, 1361 insertions, 942 deletions
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 2816feb38be1..68ec884440b7 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -14,8 +14,9 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
struct net *net = current->nsproxy->net_ns;
struct bpf_dev_offload *offload;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
+ attr->prog_type != BPF_PROG_TYPE_XDP)
+ return -EINVAL;
if (attr->prog_flags)
return -EINVAL;
@@ -28,7 +29,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
init_waitqueue_head(&offload->verifier_done);
rtnl_lock();
- offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex);
+ offload->netdev = __dev_get_by_index(net, attr->prog_ifindex);
if (!offload->netdev) {
rtnl_unlock();
kfree(offload);
@@ -85,6 +86,10 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
struct bpf_dev_offload *offload = prog->aux->offload;
struct netdev_bpf data = {};
+ /* Caution - if netdev is destroyed before the program, this function
+ * will be called twice.
+ */
+
data.offload.prog = prog;
if (offload->verifier_running)
@@ -144,18 +149,6 @@ int bpf_prog_offload_compile(struct bpf_prog *prog)
return bpf_prog_offload_translate(prog);
}
-u32 bpf_prog_offload_ifindex(struct bpf_prog *prog)
-{
- struct bpf_dev_offload *offload = prog->aux->offload;
- u32 ifindex;
-
- rtnl_lock();
- ifindex = offload->netdev ? offload->netdev->ifindex : 0;
- rtnl_unlock();
-
- return ifindex;
-}
-
const struct bpf_prog_ops bpf_offload_prog_ops = {
};
@@ -169,6 +162,10 @@ static int bpf_offload_notification(struct notifier_block *notifier,
switch (event) {
case NETDEV_UNREGISTER:
+ /* ignore namespace changes */
+ if (netdev->reg_state != NETREG_UNREGISTERING)
+ break;
+
list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
offloads) {
if (offload->netdev == netdev)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 09badc37e864..2c4cfeaa8d5e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1057,22 +1057,23 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
}
EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
-static bool bpf_prog_can_attach(struct bpf_prog *prog,
- enum bpf_prog_type *attach_type,
- struct net_device *netdev)
+static bool bpf_prog_get_ok(struct bpf_prog *prog,
+ enum bpf_prog_type *attach_type, bool attach_drv)
{
- struct bpf_dev_offload *offload = prog->aux->offload;
+ /* not an attachment, just a refcount inc, always allow */
+ if (!attach_type)
+ return true;
if (prog->type != *attach_type)
return false;
- if (offload && offload->netdev != netdev)
+ if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
return false;
return true;
}
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
- struct net_device *netdev)
+ bool attach_drv)
{
struct fd f = fdget(ufd);
struct bpf_prog *prog;
@@ -1080,7 +1081,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
prog = ____bpf_prog_get(f);
if (IS_ERR(prog))
return prog;
- if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) {
+ if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
prog = ERR_PTR(-EINVAL);
goto out;
}
@@ -1093,23 +1094,13 @@ out:
struct bpf_prog *bpf_prog_get(u32 ufd)
{
- return __bpf_prog_get(ufd, NULL, NULL);
+ return __bpf_prog_get(ufd, NULL, false);
}
-struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
-{
- struct bpf_prog *prog = __bpf_prog_get(ufd, &type, NULL);
-
- if (!IS_ERR(prog))
- trace_bpf_prog_get_type(prog);
- return prog;
-}
-EXPORT_SYMBOL_GPL(bpf_prog_get_type);
-
struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
- struct net_device *netdev)
+ bool attach_drv)
{
- struct bpf_prog *prog = __bpf_prog_get(ufd, &type, netdev);
+ struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv);
if (!IS_ERR(prog))
trace_bpf_prog_get_type(prog);
@@ -1118,7 +1109,7 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex
+#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex
static int bpf_prog_load(union bpf_attr *attr)
{
@@ -1181,7 +1172,7 @@ static int bpf_prog_load(union bpf_attr *attr)
atomic_set(&prog->aux->refcnt, 1);
prog->gpl_compatible = is_gpl ? 1 : 0;
- if (attr->prog_target_ifindex) {
+ if (attr->prog_ifindex) {
err = bpf_prog_offload_init(prog, attr);
if (err)
goto free_prog;
@@ -1625,11 +1616,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
return -EFAULT;
}
- if (bpf_prog_is_dev_bound(prog->aux)) {
- info.status |= BPF_PROG_STATUS_DEV_BOUND;
- info.ifindex = bpf_prog_offload_ifindex(prog);
- }
-
done:
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index dd54d20ace2f..d4593571c404 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1384,13 +1384,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (type != expected_type)
goto err_type;
} else if (arg_type == ARG_PTR_TO_MEM ||
+ arg_type == ARG_PTR_TO_MEM_OR_NULL ||
arg_type == ARG_PTR_TO_UNINIT_MEM) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
* passed in as argument, it's a SCALAR_VALUE type. Final test
* happens during stack boundary checking.
*/
- if (register_is_null(*reg))
+ if (register_is_null(*reg) &&
+ arg_type == ARG_PTR_TO_MEM_OR_NULL)
/* final test in check_stack_boundary() */;
else if (!type_is_pkt_pointer(type) &&
type != PTR_TO_MAP_VALUE &&
@@ -3825,6 +3827,7 @@ static int do_check(struct bpf_verifier_env *env)
return err;
regs = cur_regs(env);
+ env->insn_aux_data[insn_idx].seen = true;
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
if (err)
@@ -4020,6 +4023,7 @@ process_bpf_exit:
return err;
insn_idx++;
+ env->insn_aux_data[insn_idx].seen = true;
} else {
verbose(env, "invalid BPF_LD mode\n");
return -EINVAL;
@@ -4202,6 +4206,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
u32 off, u32 cnt)
{
struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+ int i;
if (cnt == 1)
return 0;
@@ -4211,6 +4216,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ for (i = off; i < off + cnt - 1; i++)
+ new_data[i].seen = true;
env->insn_aux_data = new_data;
vfree(old_data);
return 0;
@@ -4229,6 +4236,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
return new_prog;
}
+/* The verifier does more data flow analysis than llvm and will not explore
+ * branches that are dead at run time. Malicious programs can have dead code
+ * too. Therefore replace all dead at-run-time code with nops.
+ */
+static void sanitize_dead_code(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (aux_data[i].seen)
+ continue;
+ memcpy(insn + i, &nop, sizeof(nop));
+ }
+}
+
/* convert load instructions that access fields of 'struct __sk_buff'
* into sequence of instructions that access fields of 'struct sk_buff'
*/
@@ -4556,6 +4582,9 @@ skip_full_check:
free_states(env);
if (ret == 0)
+ sanitize_dead_code(env);
+
+ if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
diff --git a/kernel/compat.c b/kernel/compat.c
index 772e038d04d9..d1cee656a7ed 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -367,24 +367,6 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
return ret;
}
-int get_compat_itimerspec(struct itimerspec *dst,
- const struct compat_itimerspec __user *src)
-{
- if (__compat_get_timespec(&dst->it_interval, &src->it_interval) ||
- __compat_get_timespec(&dst->it_value, &src->it_value))
- return -EFAULT;
- return 0;
-}
-
-int put_compat_itimerspec(struct compat_itimerspec __user *dst,
- const struct itimerspec *src)
-{
- if (__compat_put_timespec(&src->it_interval, &dst->it_interval) ||
- __compat_put_timespec(&src->it_value, &dst->it_value))
- return -EFAULT;
- return 0;
-}
-
int get_compat_itimerspec64(struct itimerspec64 *its,
const struct compat_itimerspec __user *uits)
{
@@ -485,27 +467,44 @@ Efault:
return -EFAULT;
}
-void
-sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
+int
+get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
{
+#ifdef __BIG_ENDIAN
+ compat_sigset_t v;
+ if (copy_from_user(&v, compat, sizeof(compat_sigset_t)))
+ return -EFAULT;
switch (_NSIG_WORDS) {
- case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
- case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
- case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
- case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
+ case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 );
+ case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 );
+ case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 );
+ case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 );
}
+#else
+ if (copy_from_user(set, compat, sizeof(compat_sigset_t)))
+ return -EFAULT;
+#endif
+ return 0;
}
-EXPORT_SYMBOL_GPL(sigset_from_compat);
+EXPORT_SYMBOL_GPL(get_compat_sigset);
-void
-sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+int
+put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
+ unsigned int size)
{
+ /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */
+#ifdef __BIG_ENDIAN
+ compat_sigset_t v;
switch (_NSIG_WORDS) {
- case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
- case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
- case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
- case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+ case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3];
+ case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2];
+ case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
+ case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0];
}
+ return copy_to_user(compat, &v, size) ? -EFAULT : 0;
+#else
+ return copy_to_user(compat, set, size) ? -EFAULT : 0;
+#endif
}
#ifdef CONFIG_NUMA
@@ -563,22 +562,6 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
}
#endif
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
- compat_pid_t, pid,
- struct compat_timespec __user *, interval)
-{
- struct timespec t;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
- set_fs(old_fs);
- if (compat_put_timespec(&t, interval))
- return -EFAULT;
- return ret;
-}
-
/*
* Allocate user-space memory for the duration of a single system call,
* in order to marshall parameters inside a compat thunk.
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 6db80fc0810b..b3663896278e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -108,7 +108,8 @@ static int __init parse_crashkernel_mem(char *cmdline,
return -EINVAL;
}
}
- }
+ } else
+ pr_info("crashkernel size resulted in zero bytes\n");
return 0;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3939a4674e0a..16beab4767e1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6676,6 +6676,7 @@ static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
ns_inode = ns_path.dentry->d_inode;
ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
ns_link_info->ino = ns_inode->i_ino;
+ path_put(&ns_path);
}
}
@@ -7874,15 +7875,16 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
}
}
perf_tp_event(call->event.type, count, raw_data, size, regs, head,
- rctx, task, NULL);
+ rctx, task);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
- struct task_struct *task, struct perf_event *event)
+ struct task_struct *task)
{
struct perf_sample_data data;
+ struct perf_event *event;
struct perf_raw_record raw = {
.frag = {
@@ -7896,15 +7898,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
perf_trace_buf_update(record, event_type);
- /* Use the given event instead of the hlist */
- if (event) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
- } else {
- hlist_for_each_entry_rcu(event, head, hlist_entry) {
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 4e55eedba8d6..432eadf6b58c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1871,7 +1871,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = -ERESTARTNOINTR;
goto bad_fork_cancel_cgroup;
}
- if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
goto bad_fork_cancel_cgroup;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2ff1c0c82fc9..0f922729bab9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1246,7 +1246,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* set the trigger type must match. Also all must
* agree on ONESHOT.
*/
- unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data);
+ unsigned int oldtype;
+
+ /*
+ * If nobody did set the configuration before, inherit
+ * the one provided by the requester.
+ */
+ if (irqd_trigger_type_was_set(&desc->irq_data)) {
+ oldtype = irqd_get_trigger_type(&desc->irq_data);
+ } else {
+ oldtype = new->flags & IRQF_TRIGGER_MASK;
+ irqd_set_trigger_type(&desc->irq_data, oldtype);
+ }
if (!((old->flags & new->flags) & IRQF_SHARED) ||
(oldtype != (new->flags & IRQF_TRIGGER_MASK)) ||
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index a3cbbc8191c5..7df2480005f8 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -384,7 +384,7 @@ unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown)
{
struct cpumap *cm = this_cpu_ptr(m->maps);
- return m->global_available - cpudown ? cm->available : 0;
+ return (m->global_available - cpudown) ? cm->available : 0;
}
/**
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 1215229d1c12..ef2a47e0eab6 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -20,7 +20,7 @@
static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
-static void poll_spurious_irqs(unsigned long dummy);
+static void poll_spurious_irqs(struct timer_list *unused);
static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
static int irq_poll_cpu;
static atomic_t irq_poll_active;
@@ -143,7 +143,7 @@ out:
return ok;
}
-static void poll_spurious_irqs(unsigned long dummy)
+static void poll_spurious_irqs(struct timer_list *unused)
{
struct irq_desc *desc;
int i;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 8ff4ca4665ff..8594d24e4adc 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -769,7 +769,7 @@ static __init int jump_label_test(void)
return 0;
}
-late_initcall(jump_label_test);
+early_initcall(jump_label_test);
#endif /* STATIC_KEYS_SELFTEST */
#endif /* HAVE_JUMP_LABEL */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 1e6ae66c6244..d5fa4116688a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -24,6 +24,7 @@
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/filter.h>
+#include <linux/ftrace.h>
#include <linux/compiler.h>
#include <asm/sections.h>
@@ -337,6 +338,10 @@ const char *kallsyms_lookup(unsigned long addr,
if (!ret)
ret = bpf_address_lookup(addr, symbolsize,
offset, modname, namebuf);
+
+ if (!ret)
+ ret = ftrace_mod_address_lookup(addr, symbolsize,
+ offset, modname, namebuf);
return ret;
}
@@ -474,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol);
struct kallsym_iter {
loff_t pos;
loff_t pos_mod_end;
+ loff_t pos_ftrace_mod_end;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols. */
char type;
@@ -497,11 +503,25 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
return 1;
}
+static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
+{
+ int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end,
+ &iter->value, &iter->type,
+ iter->name, iter->module_name,
+ &iter->exported);
+ if (ret < 0) {
+ iter->pos_ftrace_mod_end = iter->pos;
+ return 0;
+ }
+
+ return 1;
+}
+
static int get_ksymbol_bpf(struct kallsym_iter *iter)
{
iter->module_name[0] = '\0';
iter->exported = 0;
- return bpf_get_kallsym(iter->pos - iter->pos_mod_end,
+ return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
&iter->value, &iter->type,
iter->name) < 0 ? 0 : 1;
}
@@ -526,20 +546,31 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
iter->name[0] = '\0';
iter->nameoff = get_symbol_offset(new_pos);
iter->pos = new_pos;
- if (new_pos == 0)
+ if (new_pos == 0) {
iter->pos_mod_end = 0;
+ iter->pos_ftrace_mod_end = 0;
+ }
}
static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
{
iter->pos = pos;
- if (iter->pos_mod_end > 0 &&
- iter->pos_mod_end < iter->pos)
+ if (iter->pos_ftrace_mod_end > 0 &&
+ iter->pos_ftrace_mod_end < iter->pos)
return get_ksymbol_bpf(iter);
- if (!get_ksymbol_mod(iter))
- return get_ksymbol_bpf(iter);
+ if (iter->pos_mod_end > 0 &&
+ iter->pos_mod_end < iter->pos) {
+ if (!get_ksymbol_ftrace_mod(iter))
+ return get_ksymbol_bpf(iter);
+ return 1;
+ }
+
+ if (!get_ksymbol_mod(iter)) {
+ if (!get_ksymbol_ftrace_mod(iter))
+ return get_ksymbol_bpf(iter);
+ }
return 1;
}
@@ -583,14 +614,14 @@ static void s_stop(struct seq_file *m, void *p)
static int s_show(struct seq_file *m, void *p)
{
- unsigned long value;
+ void *value;
struct kallsym_iter *iter = m->private;
/* Some debugging symbols have no name. Ignore them. */
if (!iter->name[0])
return 0;
- value = iter->show_value ? iter->value : 0;
+ value = iter->show_value ? (void *)iter->value : NULL;
if (iter->module_name[0]) {
char type;
@@ -601,10 +632,10 @@ static int s_show(struct seq_file *m, void *p)
*/
type = iter->exported ? toupper(iter->type) :
tolower(iter->type);
- seq_printf(m, KALLSYM_FMT " %c %s\t[%s]\n", value,
+ seq_printf(m, "%px %c %s\t[%s]\n", value,
type, iter->name, iter->module_name);
} else
- seq_printf(m, KALLSYM_FMT " %c %s\n", value,
+ seq_printf(m, "%px %c %s\n", value,
iter->type, iter->name);
return 0;
}
diff --git a/kernel/kcov.c b/kernel/kcov.c
index fc6af9e1308b..15f33faf4013 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -22,13 +22,21 @@
#include <linux/kcov.h>
#include <asm/setup.h>
+/* Number of 64-bit words written per one comparison: */
+#define KCOV_WORDS_PER_CMP 4
+
/*
* kcov descriptor (one per opened debugfs file).
* State transitions of the descriptor:
* - initial state after open()
* - then there must be a single ioctl(KCOV_INIT_TRACE) call
* - then, mmap() call (several calls are allowed but not useful)
- * - then, repeated enable/disable for a task (only one task a time allowed)
+ * - then, ioctl(KCOV_ENABLE, arg), where arg is
+ * KCOV_TRACE_PC - to trace only the PCs
+ * or
+ * KCOV_TRACE_CMP - to trace only the comparison operands
+ * - then, ioctl(KCOV_DISABLE) to disable the task.
+ * Enabling/disabling ioctls can be repeated (only one task a time allowed).
*/
struct kcov {
/*
@@ -48,51 +56,176 @@ struct kcov {
struct task_struct *t;
};
-/*
- * Entry point from instrumented code.
- * This is called once per basic-block/edge.
- */
-void notrace __sanitizer_cov_trace_pc(void)
+static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
{
- struct task_struct *t;
enum kcov_mode mode;
- t = current;
/*
* We are interested in code coverage as a function of a syscall inputs,
* so we ignore code executed in interrupts.
*/
- if (!t || !in_task())
- return;
+ if (!in_task())
+ return false;
mode = READ_ONCE(t->kcov_mode);
- if (mode == KCOV_MODE_TRACE) {
- unsigned long *area;
- unsigned long pos;
- unsigned long ip = _RET_IP_;
+ /*
+ * There is some code that runs in interrupts but for which
+ * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+ * READ_ONCE()/barrier() effectively provides load-acquire wrt
+ * interrupts, there are paired barrier()/WRITE_ONCE() in
+ * kcov_ioctl_locked().
+ */
+ barrier();
+ return mode == needed_mode;
+}
+static unsigned long canonicalize_ip(unsigned long ip)
+{
#ifdef CONFIG_RANDOMIZE_BASE
- ip -= kaslr_offset();
+ ip -= kaslr_offset();
#endif
+ return ip;
+}
- /*
- * There is some code that runs in interrupts but for which
- * in_interrupt() returns false (e.g. preempt_schedule_irq()).
- * READ_ONCE()/barrier() effectively provides load-acquire wrt
- * interrupts, there are paired barrier()/WRITE_ONCE() in
- * kcov_ioctl_locked().
- */
- barrier();
- area = t->kcov_area;
- /* The first word is number of subsequent PCs. */
- pos = READ_ONCE(area[0]) + 1;
- if (likely(pos < t->kcov_size)) {
- area[pos] = ip;
- WRITE_ONCE(area[0], pos);
- }
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void notrace __sanitizer_cov_trace_pc(void)
+{
+ struct task_struct *t;
+ unsigned long *area;
+ unsigned long ip = canonicalize_ip(_RET_IP_);
+ unsigned long pos;
+
+ t = current;
+ if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t))
+ return;
+
+ area = t->kcov_area;
+ /* The first 64-bit word is the number of subsequent PCs. */
+ pos = READ_ONCE(area[0]) + 1;
+ if (likely(pos < t->kcov_size)) {
+ area[pos] = ip;
+ WRITE_ONCE(area[0], pos);
}
}
EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
+{
+ struct task_struct *t;
+ u64 *area;
+ u64 count, start_index, end_pos, max_pos;
+
+ t = current;
+ if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t))
+ return;
+
+ ip = canonicalize_ip(ip);
+
+ /*
+ * We write all comparison arguments and types as u64.
+ * The buffer was allocated for t->kcov_size unsigned longs.
+ */
+ area = (u64 *)t->kcov_area;
+ max_pos = t->kcov_size * sizeof(unsigned long);
+
+ count = READ_ONCE(area[0]);
+
+ /* Every record is KCOV_WORDS_PER_CMP 64-bit words. */
+ start_index = 1 + count * KCOV_WORDS_PER_CMP;
+ end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
+ if (likely(end_pos <= max_pos)) {
+ area[start_index] = type;
+ area[start_index + 1] = arg1;
+ area[start_index + 2] = arg2;
+ area[start_index + 3] = ip;
+ WRITE_ONCE(area[0], count + 1);
+ }
+}
+
+void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1);
+
+void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
+
+void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4);
+
+void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8);
+
+void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1);
+
+void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
+
+void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4);
+
+void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8);
+
+void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
+{
+ u64 i;
+ u64 count = cases[0];
+ u64 size = cases[1];
+ u64 type = KCOV_CMP_CONST;
+
+ switch (size) {
+ case 8:
+ type |= KCOV_CMP_SIZE(0);
+ break;
+ case 16:
+ type |= KCOV_CMP_SIZE(1);
+ break;
+ case 32:
+ type |= KCOV_CMP_SIZE(2);
+ break;
+ case 64:
+ type |= KCOV_CMP_SIZE(3);
+ break;
+ default:
+ return;
+ }
+ for (i = 0; i < count; i++)
+ write_comp_data(type, cases[i + 2], val, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
+#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */
+
static void kcov_get(struct kcov *kcov)
{
atomic_inc(&kcov->refcount);
@@ -129,6 +262,7 @@ void kcov_task_exit(struct task_struct *t)
/* Just to not leave dangling references behind. */
kcov_task_init(t);
kcov->t = NULL;
+ kcov->mode = KCOV_MODE_INIT;
spin_unlock(&kcov->lock);
kcov_put(kcov);
}
@@ -147,7 +281,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
spin_lock(&kcov->lock);
size = kcov->size * sizeof(unsigned long);
- if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
+ if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
vma->vm_end - vma->vm_start != size) {
res = -EINVAL;
goto exit;
@@ -176,6 +310,7 @@ static int kcov_open(struct inode *inode, struct file *filep)
kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
if (!kcov)
return -ENOMEM;
+ kcov->mode = KCOV_MODE_DISABLED;
atomic_set(&kcov->refcount, 1);
spin_lock_init(&kcov->lock);
filep->private_data = kcov;
@@ -211,7 +346,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
if (size < 2 || size > INT_MAX / sizeof(unsigned long))
return -EINVAL;
kcov->size = size;
- kcov->mode = KCOV_MODE_TRACE;
+ kcov->mode = KCOV_MODE_INIT;
return 0;
case KCOV_ENABLE:
/*
@@ -221,17 +356,25 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
* at task exit or voluntary by KCOV_DISABLE. After that it can
* be enabled for another task.
*/
- unused = arg;
- if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
- kcov->area == NULL)
+ if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
return -EINVAL;
if (kcov->t != NULL)
return -EBUSY;
+ if (arg == KCOV_TRACE_PC)
+ kcov->mode = KCOV_MODE_TRACE_PC;
+ else if (arg == KCOV_TRACE_CMP)
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+ kcov->mode = KCOV_MODE_TRACE_CMP;
+#else
+ return -ENOTSUPP;
+#endif
+ else
+ return -EINVAL;
t = current;
/* Cache in task struct for performance. */
t->kcov_size = kcov->size;
t->kcov_area = kcov->area;
- /* See comment in __sanitizer_cov_trace_pc(). */
+ /* See comment in check_kcov_mode(). */
barrier();
WRITE_ONCE(t->kcov_mode, kcov->mode);
t->kcov = kcov;
@@ -249,6 +392,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
return -EINVAL;
kcov_task_init(t);
kcov->t = NULL;
+ kcov->mode = KCOV_MODE_INIT;
kcov_put(kcov);
return 0;
default:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8af313081b0d..cd50e99202b0 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -843,7 +843,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker,
struct timer_list *timer = &dwork->timer;
struct kthread_work *work = &dwork->work;
- WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn);
+ WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);
/*
* If @delay is 0, queue @dwork->work immediately. This is for
diff --git a/kernel/module.c b/kernel/module.c
index 222aba4aa960..dea01ac9cb74 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3481,6 +3481,8 @@ static noinline int do_init_module(struct module *mod)
if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
async_synchronize_full();
+ ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
+ mod->init_layout.size);
mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod);
@@ -4155,7 +4157,7 @@ static int m_show(struct seq_file *m, void *p)
{
struct module *mod = list_entry(p, struct module, list);
char buf[MODULE_FLAGS_BUF_SIZE];
- unsigned long value;
+ void *value;
/* We always ignore unformed modules. */
if (mod->state == MODULE_STATE_UNFORMED)
@@ -4171,8 +4173,8 @@ static int m_show(struct seq_file *m, void *p)
mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
- value = m->private ? 0 : (unsigned long)mod->core_layout.base;
- seq_printf(m, " 0x" KALLSYM_FMT, value);
+ value = m->private ? NULL : mod->core_layout.base;
+ seq_printf(m, " 0x%px", value);
/* Taints info */
if (mod->taints)
diff --git a/kernel/padata.c b/kernel/padata.c
index f262c9a4e70a..57c0074d50cc 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -288,9 +288,9 @@ static void invoke_padata_reorder(struct work_struct *work)
local_bh_enable();
}
-static void padata_reorder_timer(unsigned long arg)
+static void padata_reorder_timer(struct timer_list *t)
{
- struct parallel_data *pd = (struct parallel_data *)arg;
+ struct parallel_data *pd = from_timer(pd, t, timer);
unsigned int weight;
int target_cpu, cpu;
@@ -485,7 +485,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
padata_init_pqueues(pd);
padata_init_squeues(pd);
- setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
+ timer_setup(&pd->timer, padata_reorder_timer, 0);
atomic_set(&pd->seq_nr, -1);
atomic_set(&pd->reorder_objects, 0);
atomic_set(&pd->refcnt, 0);
diff --git a/kernel/panic.c b/kernel/panic.c
index bdd18afa19a4..2cfef408fec9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,6 +27,8 @@
#include <linux/console.h>
#include <linux/bug.h>
#include <linux/ratelimit.h>
+#include <linux/debugfs.h>
+#include <asm/sections.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -322,6 +324,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
{ 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */
{ 'L', ' ', false }, /* TAINT_SOFTLOCKUP */
{ 'K', ' ', true }, /* TAINT_LIVEPATCH */
+ { 'X', ' ', true }, /* TAINT_AUX */
};
/**
@@ -343,6 +346,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
* 'E' - Unsigned module has been loaded.
* 'L' - A soft lockup has previously occurred.
* 'K' - Kernel has been live patched.
+ * 'X' - Auxiliary taint, for distros' use.
*
* The string is overwritten by the next call to print_tainted().
*/
@@ -518,7 +522,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
{
disable_trace_on_warning();
- pr_warn("------------[ cut here ]------------\n");
+ if (args)
+ pr_warn(CUT_HERE);
if (file)
pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
@@ -582,9 +587,49 @@ EXPORT_SYMBOL(warn_slowpath_fmt_taint);
void warn_slowpath_null(const char *file, int line)
{
+ pr_warn(CUT_HERE);
__warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
}
EXPORT_SYMBOL(warn_slowpath_null);
+#else
+void __warn_printk(const char *fmt, ...)
+{
+ va_list args;
+
+ pr_warn(CUT_HERE);
+
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL(__warn_printk);
+#endif
+
+#ifdef CONFIG_BUG
+
+/* Support resetting WARN*_ONCE state */
+
+static int clear_warn_once_set(void *data, u64 val)
+{
+ generic_bug_clear_once();
+ memset(__start_once, 0, __end_once - __start_once);
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops,
+ NULL,
+ clear_warn_once_set,
+ "%lld\n");
+
+static __init int register_warn_debugfs(void)
+{
+ /* Don't care about failure */
+ debugfs_create_file("clear_warn_once", 0200, NULL,
+ NULL, &clear_warn_once_fops);
+ return 0;
+}
+
+device_initcall(register_warn_debugfs);
#endif
#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/pid.c b/kernel/pid.c
index 020dedbdf066..b13b624e2c49 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -39,11 +39,8 @@
#include <linux/proc_ns.h>
#include <linux/proc_fs.h>
#include <linux/sched/task.h>
+#include <linux/idr.h>
-#define pid_hashfn(nr, ns) \
- hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
-static struct hlist_head *pid_hash;
-static unsigned int pidhash_shift = 4;
struct pid init_struct_pid = INIT_STRUCT_PID;
int pid_max = PID_MAX_DEFAULT;
@@ -53,15 +50,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
-static inline int mk_pid(struct pid_namespace *pid_ns,
- struct pidmap *map, int off)
-{
- return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
-}
-
-#define find_next_offset(map, off) \
- find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
-
/*
* PID-map pages start out as NULL, they get allocated upon
* first use and are never deallocated. This way a low pid_max
@@ -70,11 +58,8 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
*/
struct pid_namespace init_pid_ns = {
.kref = KREF_INIT(2),
- .pidmap = {
- [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
- },
- .last_pid = 0,
- .nr_hashed = PIDNS_HASH_ADDING,
+ .idr = IDR_INIT,
+ .pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
@@ -101,138 +86,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-static void free_pidmap(struct upid *upid)
-{
- int nr = upid->nr;
- struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
- int offset = nr & BITS_PER_PAGE_MASK;
-
- clear_bit(offset, map->page);
- atomic_inc(&map->nr_free);
-}
-
-/*
- * If we started walking pids at 'base', is 'a' seen before 'b'?
- */
-static int pid_before(int base, int a, int b)
-{
- /*
- * This is the same as saying
- *
- * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
- * and that mapping orders 'a' and 'b' with respect to 'base'.
- */
- return (unsigned)(a - base) < (unsigned)(b - base);
-}
-
-/*
- * We might be racing with someone else trying to set pid_ns->last_pid
- * at the pid allocation time (there's also a sysctl for this, but racing
- * with this one is OK, see comment in kernel/pid_namespace.c about it).
- * We want the winner to have the "later" value, because if the
- * "earlier" value prevails, then a pid may get reused immediately.
- *
- * Since pids rollover, it is not sufficient to just pick the bigger
- * value. We have to consider where we started counting from.
- *
- * 'base' is the value of pid_ns->last_pid that we observed when
- * we started looking for a pid.
- *
- * 'pid' is the pid that we eventually found.
- */
-static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
-{
- int prev;
- int last_write = base;
- do {
- prev = last_write;
- last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
- } while ((prev != last_write) && (pid_before(base, last_write, pid)));
-}
-
-static int alloc_pidmap(struct pid_namespace *pid_ns)
-{
- int i, offset, max_scan, pid, last = pid_ns->last_pid;
- struct pidmap *map;
-
- pid = last + 1;
- if (pid >= pid_max)
- pid = RESERVED_PIDS;
- offset = pid & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
- /*
- * If last_pid points into the middle of the map->page we
- * want to scan this bitmap block twice, the second time
- * we start with offset == 0 (or RESERVED_PIDS).
- */
- max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
- for (i = 0; i <= max_scan; ++i) {
- if (unlikely(!map->page)) {
- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /*
- * Free the page if someone raced with us
- * installing it:
- */
- spin_lock_irq(&pidmap_lock);
- if (!map->page) {
- map->page = page;
- page = NULL;
- }
- spin_unlock_irq(&pidmap_lock);
- kfree(page);
- if (unlikely(!map->page))
- return -ENOMEM;
- }
- if (likely(atomic_read(&map->nr_free))) {
- for ( ; ; ) {
- if (!test_and_set_bit(offset, map->page)) {
- atomic_dec(&map->nr_free);
- set_last_pid(pid_ns, last, pid);
- return pid;
- }
- offset = find_next_offset(map, offset);
- if (offset >= BITS_PER_PAGE)
- break;
- pid = mk_pid(pid_ns, map, offset);
- if (pid >= pid_max)
- break;
- }
- }
- if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
- ++map;
- offset = 0;
- } else {
- map = &pid_ns->pidmap[0];
- offset = RESERVED_PIDS;
- if (unlikely(last == offset))
- break;
- }
- pid = mk_pid(pid_ns, map, offset);
- }
- return -EAGAIN;
-}
-
-int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
-{
- int offset;
- struct pidmap *map, *end;
-
- if (last >= PID_MAX_LIMIT)
- return -1;
-
- offset = (last + 1) & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
- end = &pid_ns->pidmap[PIDMAP_ENTRIES];
- for (; map < end; map++, offset = 0) {
- if (unlikely(!map->page))
- continue;
- offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
- if (offset < BITS_PER_PAGE)
- return mk_pid(pid_ns, map, offset);
- }
- return -1;
-}
-
void put_pid(struct pid *pid)
{
struct pid_namespace *ns;
@@ -265,8 +118,7 @@ void free_pid(struct pid *pid)
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;
- hlist_del_rcu(&upid->pid_chain);
- switch(--ns->nr_hashed) {
+ switch (--ns->pid_allocated) {
case 2:
case 1:
/* When all that is left in the pid namespace
@@ -275,21 +127,20 @@ void free_pid(struct pid *pid)
*/
wake_up_process(ns->child_reaper);
break;
- case PIDNS_HASH_ADDING:
+ case PIDNS_ADDING:
/* Handle a fork failure of the first process */
WARN_ON(ns->child_reaper);
- ns->nr_hashed = 0;
+ ns->pid_allocated = 0;
/* fall through */
case 0:
schedule_work(&ns->proc_work);
break;
}
+
+ idr_remove(&ns->idr, upid->nr);
}
spin_unlock_irqrestore(&pidmap_lock, flags);
- for (i = 0; i <= pid->level; i++)
- free_pidmap(pid->numbers + i);
-
call_rcu(&pid->rcu, delayed_put_pid);
}
@@ -308,8 +159,29 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = ns;
pid->level = ns->level;
+
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
+ int pid_min = 1;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_irq(&pidmap_lock);
+
+ /*
+ * init really needs pid 1, but after reaching the maximum
+ * wrap back to RESERVED_PIDS
+ */
+ if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+ pid_min = RESERVED_PIDS;
+
+ /*
+ * Store a null pointer so find_pid_ns does not find
+ * a partially initialized PID (see below).
+ */
+ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+ pid_max, GFP_ATOMIC);
+ spin_unlock_irq(&pidmap_lock);
+ idr_preload_end();
+
if (nr < 0) {
retval = nr;
goto out_free;
@@ -334,12 +206,12 @@ struct pid *alloc_pid(struct pid_namespace *ns)
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
- if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
+ if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
for ( ; upid >= pid->numbers; --upid) {
- hlist_add_head_rcu(&upid->pid_chain,
- &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
- upid->ns->nr_hashed++;
+ /* Make the PID visible to find_pid_ns. */
+ idr_replace(&upid->ns->idr, pid, upid->nr);
+ upid->ns->pid_allocated++;
}
spin_unlock_irq(&pidmap_lock);
@@ -350,8 +222,11 @@ out_unlock:
put_pid_ns(ns);
out_free:
+ spin_lock_irq(&pidmap_lock);
while (++i <= ns->level)
- free_pidmap(pid->numbers + i);
+ idr_remove(&ns->idr, (pid->numbers + i)->nr);
+
+ spin_unlock_irq(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
@@ -360,21 +235,13 @@ out_free:
void disable_pid_allocation(struct pid_namespace *ns)
{
spin_lock_irq(&pidmap_lock);
- ns->nr_hashed &= ~PIDNS_HASH_ADDING;
+ ns->pid_allocated &= ~PIDNS_ADDING;
spin_unlock_irq(&pidmap_lock);
}
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
- struct upid *pnr;
-
- hlist_for_each_entry_rcu(pnr,
- &pid_hash[pid_hashfn(nr, ns)], pid_chain)
- if (pnr->nr == nr && pnr->ns == ns)
- return container_of(pnr, struct pid,
- numbers[ns->level]);
-
- return NULL;
+ return idr_find(&ns->idr, nr);
}
EXPORT_SYMBOL_GPL(find_pid_ns);
@@ -530,6 +397,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
if (type != PIDTYPE_PID) {
if (type == __PIDTYPE_TGID)
type = PIDTYPE_PID;
+
task = task->group_leader;
}
nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
@@ -553,35 +421,13 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
*/
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
- struct pid *pid;
-
- do {
- pid = find_pid_ns(nr, ns);
- if (pid)
- break;
- nr = next_pidmap(ns, nr);
- } while (nr > 0);
-
- return pid;
-}
-
-/*
- * The pid hash table is scaled according to the amount of memory in the
- * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
- * more.
- */
-void __init pidhash_init(void)
-{
- pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
- HASH_EARLY | HASH_SMALL | HASH_ZERO,
- &pidhash_shift, NULL,
- 0, 4096);
+ return idr_get_next(&ns->idr, &nr);
}
-void __init pidmap_init(void)
+void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
- BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
+ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
pid_max = min(pid_max_max, max_t(int, pid_max,
@@ -590,10 +436,7 @@ void __init pidmap_init(void)
PIDS_PER_CPU_MIN * num_possible_cpus());
pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
- init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /* Reserve PID 0. We never call free_pidmap(0) */
- set_bit(0, init_pid_ns.pidmap[0].page);
- atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+ idr_init(&init_pid_ns.idr);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 4918314893bc..0b53eef7d34b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -21,6 +21,7 @@
#include <linux/export.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
+#include <linux/idr.h>
struct pid_cache {
int nr_ids;
@@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
struct ucounts *ucounts;
- int i;
int err;
err = -EINVAL;
@@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns == NULL)
goto out_dec;
- ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- if (!ns->pidmap[0].page)
- goto out_free;
+ idr_init(&ns->idr);
ns->pid_cachep = create_pid_cachep(level + 1);
if (ns->pid_cachep == NULL)
- goto out_free_map;
+ goto out_free_idr;
err = ns_alloc_inum(&ns->ns);
if (err)
- goto out_free_map;
+ goto out_free_idr;
ns->ns.ops = &pidns_operations;
kref_init(&ns->kref);
@@ -135,20 +133,13 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
- ns->nr_hashed = PIDNS_HASH_ADDING;
+ ns->pid_allocated = PIDNS_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);
- set_bit(0, ns->pidmap[0].page);
- atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-
- for (i = 1; i < PIDMAP_ENTRIES; i++)
- atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
-
return ns;
-out_free_map:
- kfree(ns->pidmap[0].page);
-out_free:
+out_free_idr:
+ idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
out_dec:
dec_pid_namespaces(ucounts);
@@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
- int i;
-
ns_free_inum(&ns->ns);
- for (i = 0; i < PIDMAP_ENTRIES; i++)
- kfree(ns->pidmap[i].page);
+
+ idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
}
@@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
int rc;
struct task_struct *task, *me = current;
int init_pids = thread_group_leader(me) ? 1 : 2;
+ struct pid *pid;
/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
@@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* maintain a tasklist for each pid namespace.
*
*/
+ rcu_read_lock();
read_lock(&tasklist_lock);
- nr = next_pidmap(pid_ns, 1);
- while (nr > 0) {
- rcu_read_lock();
-
- task = pid_task(find_vpid(nr), PIDTYPE_PID);
+ nr = 2;
+ idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
+ task = pid_task(pid, PIDTYPE_PID);
if (task && !__fatal_signal_pending(task))
send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
-
- rcu_read_unlock();
-
- nr = next_pidmap(pid_ns, nr);
}
read_unlock(&tasklist_lock);
+ rcu_read_unlock();
/*
* Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
@@ -268,7 +254,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* sys_wait4() above can't reap the EXIT_DEAD children but we do not
* really care, we could reparent them to the global init. We could
* exit and reap ->child_reaper even if it is not the last thread in
- * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
+ * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
* pid_ns can not go away until proc_kill_sb() drops the reference.
*
* But this ns can also have other tasks injected by setns()+fork().
@@ -282,7 +268,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*/
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- if (pid_ns->nr_hashed == init_pids)
+ if (pid_ns->pid_allocated == init_pids)
break;
schedule();
}
@@ -301,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
{
struct pid_namespace *pid_ns = task_active_pid_ns(current);
struct ctl_table tmp = *table;
+ int ret, next;
if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
@@ -311,8 +298,14 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
* it should synchronize its usage with external means.
*/
- tmp.data = &pid_ns->last_pid;
- return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ next = idr_get_cursor(&pid_ns->idr) - 1;
+
+ tmp.data = &next;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (!ret && write)
+ idr_set_cursor(&pid_ns->idr, next + 1);
+
+ return ret;
}
extern int pid_max;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 512f7c2baedd..5d81206a572d 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2190,7 +2190,7 @@ again:
}
if (console_seq < log_first_seq) {
- len = sprintf(text, "** %u printk messages dropped ** ",
+ len = sprintf(text, "** %u printk messages dropped **\n",
(unsigned)(log_first_seq - console_seq));
/* messages are gone, move to first one */
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 3cdaeaef9ce1..3e3c2004bb23 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -39,7 +39,7 @@
* There are situations when we want to make sure that all buffers
* were handled or when IRQs are blocked.
*/
-static int printk_safe_irq_ready;
+static int printk_safe_irq_ready __read_mostly;
#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
sizeof(atomic_t) - \
@@ -63,11 +63,8 @@ static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
/* Get flushed in a more safe context. */
static void queue_flush_work(struct printk_safe_seq_buf *s)
{
- if (printk_safe_irq_ready) {
- /* Make sure that IRQ work is really initialized. */
- smp_rmb();
+ if (printk_safe_irq_ready)
irq_work_queue(&s->work);
- }
}
/*
@@ -75,7 +72,7 @@ static void queue_flush_work(struct printk_safe_seq_buf *s)
* have dedicated buffers, because otherwise printk-safe preempted by
* NMI-printk would have overwritten the NMI messages.
*
- * The messages are fushed from irq work (or from panic()), possibly,
+ * The messages are flushed from irq work (or from panic()), possibly,
* from other CPU, concurrently with printk_safe_log_store(). Should this
* happen, printk_safe_log_store() will notice the buffer->len mismatch
* and repeat the write.
@@ -398,8 +395,12 @@ void __init printk_safe_init(void)
#endif
}
- /* Make sure that IRQ works are initialized before enabling. */
- smp_wmb();
+ /*
+ * In the highly unlikely event that a NMI were to trigger at
+ * this moment. Make sure IRQ work is set up before this
+ * variable is set.
+ */
+ barrier();
printk_safe_irq_ready = 1;
/* Flush pending messages that did not have scheduled IRQ works. */
diff --git a/kernel/reboot.c b/kernel/reboot.c
index bd30a973fe94..e4ced883d8de 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_reboot_notifier);
+static void devm_unregister_reboot_notifier(struct device *dev, void *res)
+{
+ WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res));
+}
+
+int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb)
+{
+ struct notifier_block **rcnb;
+ int ret;
+
+ rcnb = devres_alloc(devm_unregister_reboot_notifier,
+ sizeof(*rcnb), GFP_KERNEL);
+ if (!rcnb)
+ return -ENOMEM;
+
+ ret = register_reboot_notifier(nb);
+ if (!ret) {
+ *rcnb = nb;
+ devres_add(dev, rcnb);
+ } else {
+ devres_free(rcnb);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(devm_register_reboot_notifier);
+
/*
* Notifier list for kernel code which wants to be called
* to restart the system.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a092f350f3a2..75554f366fd3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -16,6 +16,7 @@
#include <linux/init_task.h>
#include <linux/context_tracking.h>
#include <linux/rcupdate_wait.h>
+#include <linux/compat.h>
#include <linux/blkdev.h>
#include <linux/kprobes.h>
@@ -5107,13 +5108,11 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
* Return: On success, 0 and the timeslice is in @interval. Otherwise,
* an error code.
*/
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
- struct timespec __user *, interval)
+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
struct task_struct *p;
unsigned int time_slice;
struct rq_flags rf;
- struct timespec t;
struct rq *rq;
int retval;
@@ -5137,15 +5136,40 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
task_rq_unlock(rq, p, &rf);
rcu_read_unlock();
- jiffies_to_timespec(time_slice, &t);
- retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
- return retval;
+ jiffies_to_timespec64(time_slice, t);
+ return 0;
out_unlock:
rcu_read_unlock();
return retval;
}
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+ struct timespec __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = put_timespec64(&t, interval);
+
+ return retval;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+ compat_pid_t, pid,
+ struct compat_timespec __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = compat_put_timespec64(&t, interval);
+ return retval;
+}
+#endif
+
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
diff --git a/kernel/signal.c b/kernel/signal.c
index aa1fb9f905db..9558664bd9ec 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -78,7 +78,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force)
handler = sig_handler(t, sig);
if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
- handler == SIG_DFL && !force)
+ handler == SIG_DFL && !(force && sig_kernel_only(sig)))
return 1;
return sig_handler_ignored(handler, sig);
@@ -94,13 +94,15 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
return 0;
- if (!sig_task_ignored(t, sig, force))
- return 0;
-
/*
- * Tracers may want to know about even ignored signals.
+ * Tracers may want to know about even ignored signal unless it
+ * is SIGKILL which can't be reported anyway but can be ignored
+ * by SIGNAL_UNKILLABLE task.
*/
- return !t->ptrace;
+ if (t->ptrace && sig != SIGKILL)
+ return 0;
+
+ return sig_task_ignored(t, sig, force);
}
/*
@@ -929,9 +931,9 @@ static void complete_signal(int sig, struct task_struct *p, int group)
* then start taking the whole group down immediately.
*/
if (sig_fatal(p, sig) &&
- !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
+ !(signal->flags & SIGNAL_GROUP_EXIT) &&
!sigismember(&t->real_blocked, sig) &&
- (sig == SIGKILL || !t->ptrace)) {
+ (sig == SIGKILL || !p->ptrace)) {
/*
* This signal will be fatal to the whole group.
*/
@@ -2599,7 +2601,6 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
{
-#ifdef __BIG_ENDIAN
sigset_t old_set = current->blocked;
/* XXX: Don't preclude handling different sized sigset_t's. */
@@ -2607,38 +2608,22 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
return -EINVAL;
if (nset) {
- compat_sigset_t new32;
sigset_t new_set;
int error;
- if (copy_from_user(&new32, nset, sizeof(compat_sigset_t)))
+ if (get_compat_sigset(&new_set, nset))
return -EFAULT;
-
- sigset_from_compat(&new_set, &new32);
sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
error = sigprocmask(how, &new_set, NULL);
if (error)
return error;
}
- if (oset) {
- compat_sigset_t old32;
- sigset_to_compat(&old32, &old_set);
- if (copy_to_user(oset, &old32, sizeof(compat_sigset_t)))
- return -EFAULT;
- }
- return 0;
-#else
- return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
- (sigset_t __user *)oset, sigsetsize);
-#endif
+ return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0;
}
#endif
-static int do_sigpending(void *set, unsigned long sigsetsize)
+static int do_sigpending(sigset_t *set)
{
- if (sigsetsize > sizeof(sigset_t))
- return -EINVAL;
-
spin_lock_irq(&current->sighand->siglock);
sigorsets(set, &current->pending.signal,
&current->signal->shared_pending.signal);
@@ -2658,7 +2643,12 @@ static int do_sigpending(void *set, unsigned long sigsetsize)
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
{
sigset_t set;
- int err = do_sigpending(&set, sigsetsize);
+ int err;
+
+ if (sigsetsize > sizeof(*uset))
+ return -EINVAL;
+
+ err = do_sigpending(&set);
if (!err && copy_to_user(uset, &set, sigsetsize))
err = -EFAULT;
return err;
@@ -2668,20 +2658,16 @@ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
compat_size_t, sigsetsize)
{
-#ifdef __BIG_ENDIAN
sigset_t set;
- int err = do_sigpending(&set, sigsetsize);
- if (!err) {
- compat_sigset_t set32;
- sigset_to_compat(&set32, &set);
- /* we can get here only if sigsetsize <= sizeof(set) */
- if (copy_to_user(uset, &set32, sigsetsize))
- err = -EFAULT;
- }
+ int err;
+
+ if (sigsetsize > sizeof(*uset))
+ return -EINVAL;
+
+ err = do_sigpending(&set);
+ if (!err)
+ err = put_compat_sigset(uset, &set, sigsetsize);
return err;
-#else
- return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
-#endif
}
#endif
@@ -2915,7 +2901,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
struct compat_siginfo __user *, uinfo,
struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
{
- compat_sigset_t s32;
sigset_t s;
struct timespec t;
siginfo_t info;
@@ -2924,9 +2909,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
- if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+ if (get_compat_sigset(&s, uthese))
return -EFAULT;
- sigset_from_compat(&s, &s32);
if (uts) {
if (compat_get_timespec(&t, uts))
@@ -3344,15 +3328,11 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
{
-#ifdef __BIG_ENDIAN
sigset_t set;
- int err = do_sigpending(&set, sizeof(set.sig[0]));
+ int err = do_sigpending(&set);
if (!err)
err = put_user(set.sig[0], set32);
return err;
-#else
- return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32));
-#endif
}
#endif
@@ -3450,7 +3430,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
compat_size_t, sigsetsize)
{
struct k_sigaction new_ka, old_ka;
- compat_sigset_t mask;
#ifdef __ARCH_HAS_SA_RESTORER
compat_uptr_t restorer;
#endif
@@ -3468,19 +3447,18 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
ret |= get_user(restorer, &act->sa_restorer);
new_ka.sa.sa_restorer = compat_ptr(restorer);
#endif
- ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
+ ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask);
ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
if (ret)
return -EFAULT;
- sigset_from_compat(&new_ka.sa.sa_mask, &mask);
}
ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
if (!ret && oact) {
- sigset_to_compat(&mask, &old_ka.sa.sa_mask);
ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
&oact->sa_handler);
- ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
+ ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask,
+ sizeof(oact->sa_mask));
ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
#ifdef __ARCH_HAS_SA_RESTORER
ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
@@ -3660,22 +3638,15 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
{
-#ifdef __BIG_ENDIAN
sigset_t newset;
- compat_sigset_t newset32;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
- if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+ if (get_compat_sigset(&newset, unewset))
return -EFAULT;
- sigset_from_compat(&newset, &newset32);
return sigsuspend(&newset);
-#else
- /* on little-endian bitmaps don't care about granularity */
- return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize);
-#endif
}
#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4a13a389e99b..557d46728577 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -66,6 +66,7 @@
#include <linux/kexec.h>
#include <linux/bpf.h>
#include <linux/mount.h>
+#include <linux/pipe_fs_i.h>
#include <linux/uaccess.h>
#include <asm/processor.h>
@@ -1816,7 +1817,7 @@ static struct ctl_table fs_table[] = {
{
.procname = "pipe-max-size",
.data = &pipe_max_size,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(pipe_max_size),
.mode = 0644,
.proc_handler = &pipe_proc_fn,
.extra1 = &pipe_min_size,
@@ -2575,12 +2576,13 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
if (write) {
unsigned int val = *lvalp;
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
+
if ((param->min && *param->min > val) ||
(param->max && *param->max < val))
return -ERANGE;
- if (*lvalp > UINT_MAX)
- return -EINVAL;
*valp = val;
} else {
unsigned int val = *valp;
@@ -2620,6 +2622,48 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
do_proc_douintvec_minmax_conv, &param);
}
+struct do_proc_dopipe_max_size_conv_param {
+ unsigned int *min;
+};
+
+static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ struct do_proc_dopipe_max_size_conv_param *param = data;
+
+ if (write) {
+ unsigned int val;
+
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
+
+ val = round_pipe_size(*lvalp);
+ if (val == 0)
+ return -EINVAL;
+
+ if (param->min && *param->min > val)
+ return -ERANGE;
+
+ *valp = val;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long) val;
+ }
+
+ return 0;
+}
+
+int proc_dopipe_max_size(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_dopipe_max_size_conv_param param = {
+ .min = (unsigned int *) table->extra1,
+ };
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_dopipe_max_size_conv, &param);
+}
+
static void validate_coredump_safety(void)
{
#ifdef CONFIG_COREDUMP
@@ -3083,14 +3127,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
else
bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
}
- kfree(tmp_bitmap);
*lenp -= left;
*ppos += *lenp;
- return 0;
- } else {
- kfree(tmp_bitmap);
- return err;
}
+
+ kfree(tmp_bitmap);
+ return err;
}
#else /* CONFIG_PROC_SYSCTL */
@@ -3125,6 +3167,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_dopipe_max_size(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_jiffies(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -3168,6 +3216,7 @@ EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
+EXPORT_SYMBOL_GPL(proc_dopipe_max_size);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index d689a9557e17..e776fc8cc1df 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -21,10 +21,6 @@ config CLOCKSOURCE_VALIDATE_LAST_CYCLE
config GENERIC_TIME_VSYSCALL
bool
-# Timekeeping vsyscall support
-config GENERIC_TIME_VSYSCALL_OLD
- bool
-
# Old style timekeeping
config ARCH_USES_GETTIMEOFFSET
bool
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 03918a19cf2d..65f9e3f24dde 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -171,7 +171,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_unlock_irqrestore(&watchdog_lock, flags);
}
-static void clocksource_watchdog(unsigned long data)
+static void clocksource_watchdog(struct timer_list *unused)
{
struct clocksource *cs;
u64 csnow, wdnow, cslast, wdlast, delta;
@@ -290,8 +290,7 @@ static inline void clocksource_start_watchdog(void)
{
if (watchdog_running || !watchdog || list_empty(&watchdog_list))
return;
- init_timer(&watchdog_timer);
- watchdog_timer.function = clocksource_watchdog;
+ timer_setup(&watchdog_timer, clocksource_watchdog, 0);
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
watchdog_running = 1;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 198afa78bf69..cd03317e7b57 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -557,45 +557,6 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}
-#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
-#warning Please contact your maintainers, as GENERIC_TIME_VSYSCALL_OLD compatibity will disappear soon.
-
-static inline void update_vsyscall(struct timekeeper *tk)
-{
- struct timespec xt, wm;
-
- xt = timespec64_to_timespec(tk_xtime(tk));
- wm = timespec64_to_timespec(tk->wall_to_monotonic);
- update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult,
- tk->tkr_mono.cycle_last);
-}
-
-static inline void old_vsyscall_fixup(struct timekeeper *tk)
-{
- s64 remainder;
-
- /*
- * Store only full nanoseconds into xtime_nsec after rounding
- * it up and add the remainder to the error difference.
- * XXX - This is necessary to avoid small 1ns inconsistnecies caused
- * by truncating the remainder in vsyscalls. However, it causes
- * additional work to be done in timekeeping_adjust(). Once
- * the vsyscall implementations are converted to use xtime_nsec
- * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
- * users are removed, this can be killed.
- */
- remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
- if (remainder != 0) {
- tk->tkr_mono.xtime_nsec -= remainder;
- tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
- tk->ntp_error += remainder << tk->ntp_error_shift;
- tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
- }
-}
-#else
-#define old_vsyscall_fixup(tk)
-#endif
-
static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
@@ -2164,12 +2125,6 @@ void update_wall_time(void)
timekeeping_adjust(tk, offset);
/*
- * XXX This can be killed once everyone converts
- * to the new update_vsyscall.
- */
- old_vsyscall_fixup(tk);
-
- /*
* Finally, make sure that after the rounding
* xtime_nsec isn't larger than NSEC_PER_SEC
*/
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index af0b8bae4502..ffebcf878fba 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -707,14 +707,18 @@ static inline void debug_timer_assert_init(struct timer_list *timer)
debug_object_assert_init(timer, &timer_debug_descr);
}
-static void do_init_timer(struct timer_list *timer, unsigned int flags,
+static void do_init_timer(struct timer_list *timer,
+ void (*func)(struct timer_list *),
+ unsigned int flags,
const char *name, struct lock_class_key *key);
-void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags,
+void init_timer_on_stack_key(struct timer_list *timer,
+ void (*func)(struct timer_list *),
+ unsigned int flags,
const char *name, struct lock_class_key *key)
{
debug_object_init_on_stack(timer, &timer_debug_descr);
- do_init_timer(timer, flags, name, key);
+ do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
@@ -755,10 +759,13 @@ static inline void debug_assert_init(struct timer_list *timer)
debug_timer_assert_init(timer);
}
-static void do_init_timer(struct timer_list *timer, unsigned int flags,
+static void do_init_timer(struct timer_list *timer,
+ void (*func)(struct timer_list *),
+ unsigned int flags,
const char *name, struct lock_class_key *key)
{
timer->entry.pprev = NULL;
+ timer->function = func;
timer->flags = flags | raw_smp_processor_id();
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
@@ -766,6 +773,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
/**
* init_timer_key - initialize a timer
* @timer: the timer to be initialized
+ * @func: timer callback function
* @flags: timer flags
* @name: name of the timer
* @key: lockdep class key of the fake lock used for tracking timer
@@ -774,11 +782,12 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
* init_timer_key() must be done to a timer prior calling *any* of the
* other timer functions.
*/
-void init_timer_key(struct timer_list *timer, unsigned int flags,
+void init_timer_key(struct timer_list *timer,
+ void (*func)(struct timer_list *), unsigned int flags,
const char *name, struct lock_class_key *key)
{
debug_init(timer);
- do_init_timer(timer, flags, name, key);
+ do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL(init_timer_key);
@@ -1107,12 +1116,12 @@ EXPORT_SYMBOL(timer_reduce);
* add_timer - start a timer
* @timer: the timer to be added
*
- * The kernel will do a ->function(->data) callback from the
+ * The kernel will do a ->function(@timer) callback from the
* timer interrupt at the ->expires point in the future. The
* current time is 'jiffies'.
*
- * The timer's ->expires, ->function (and if the handler uses it, ->data)
- * fields must be set prior calling this function.
+ * The timer's ->expires, ->function fields must be set prior calling this
+ * function.
*
* Timers with an ->expires field in the past will be executed in the next
* timer tick.
@@ -1284,8 +1293,7 @@ int del_timer_sync(struct timer_list *timer)
EXPORT_SYMBOL(del_timer_sync);
#endif
-static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
- unsigned long data)
+static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *))
{
int count = preempt_count();
@@ -1309,7 +1317,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
lock_map_acquire(&lockdep_map);
trace_timer_expire_entry(timer);
- fn(data);
+ fn(timer);
trace_timer_expire_exit(timer);
lock_map_release(&lockdep_map);
@@ -1331,8 +1339,7 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
while (!hlist_empty(head)) {
struct timer_list *timer;
- void (*fn)(unsigned long);
- unsigned long data;
+ void (*fn)(struct timer_list *);
timer = hlist_entry(head->first, struct timer_list, entry);
@@ -1340,15 +1347,14 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
detach_timer(timer, true);
fn = timer->function;
- data = timer->data;
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
- call_timer_fn(timer, fn, data);
+ call_timer_fn(timer, fn);
raw_spin_lock(&base->lock);
} else {
raw_spin_unlock_irq(&base->lock);
- call_timer_fn(timer, fn, data);
+ call_timer_fn(timer, fn);
raw_spin_lock_irq(&base->lock);
}
}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 0e7f5428a148..0ed768b56c60 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -389,7 +389,7 @@ static int __init init_timer_list_procfs(void)
{
struct proc_dir_entry *pe;
- pe = proc_create("timer_list", 0444, NULL, &timer_list_fops);
+ pe = proc_create("timer_list", 0400, NULL, &timer_list_fops);
if (!pe)
return -ENOMEM;
return 0;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f54b7b6b4a4b..af7dad126c13 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -160,6 +160,17 @@ config FUNCTION_GRAPH_TRACER
address on the current task structure into a stack of calls.
+config PREEMPTIRQ_EVENTS
+ bool "Enable trace events for preempt and irq disable/enable"
+ select TRACE_IRQFLAGS
+ depends on DEBUG_PREEMPT || !PROVE_LOCKING
+ default n
+ help
+ Enable tracing of disable and enable events for preemption and irqs.
+ For tracing preempt disable/enable events, DEBUG_PREEMPT must be
+ enabled. For tracing irq disable/enable events, PROVE_LOCKING must
+ be disabled.
+
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 19a15b2f1190..e2538c7638d4 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_TRACING_MAP) += tracing_map.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
+obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 206e0e2ace53..987d9a9ae283 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -591,7 +591,7 @@ static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
return ret;
if (copy_to_user(arg, &buts, sizeof(buts))) {
- blk_trace_remove(q);
+ __blk_trace_remove(q);
return -EFAULT;
}
return 0;
@@ -637,7 +637,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
return ret;
if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
- blk_trace_remove(q);
+ __blk_trace_remove(q);
return -EFAULT;
}
@@ -872,7 +872,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
*
**/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
- u32 what, int error, union kernfs_node_id *cgid)
+ u32 what, int error)
{
struct blk_trace *bt = q->blk_trace;
@@ -880,22 +880,21 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
return;
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid);
+ bio_op(bio), bio->bi_opf, what, error, 0, NULL,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_bounce(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
}
static void blk_add_trace_bio_complete(void *ignore,
struct request_queue *q, struct bio *bio,
int error)
{
- blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
}
static void blk_add_trace_bio_backmerge(void *ignore,
@@ -903,8 +902,7 @@ static void blk_add_trace_bio_backmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
}
static void blk_add_trace_bio_frontmerge(void *ignore,
@@ -912,15 +910,13 @@ static void blk_add_trace_bio_frontmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
}
static void blk_add_trace_bio_queue(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
}
static void blk_add_trace_getrq(void *ignore,
@@ -928,8 +924,7 @@ static void blk_add_trace_getrq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
else {
struct blk_trace *bt = q->blk_trace;
@@ -945,8 +940,7 @@ static void blk_add_trace_sleeprq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
else {
struct blk_trace *bt = q->blk_trace;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a5580c670866..27d1f4ffa3de 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -78,16 +78,12 @@ EXPORT_SYMBOL_GPL(trace_call_bpf);
BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
{
- int ret = 0;
-
- if (unlikely(size == 0))
- goto out;
+ int ret;
ret = probe_kernel_read(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
- out:
return ret;
}
@@ -407,7 +403,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_MEM,
- .arg5_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
@@ -498,7 +494,7 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
- .arg2_type = ARG_CONST_SIZE,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
@@ -609,7 +605,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_MEM,
- .arg5_type = ARG_CONST_SIZE,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8319e09e15b9..ccdf3664e4a9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -203,30 +203,6 @@ void clear_ftrace_function(void)
ftrace_trace_function = ftrace_stub;
}
-static void per_cpu_ops_disable_all(struct ftrace_ops *ops)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- *per_cpu_ptr(ops->disabled, cpu) = 1;
-}
-
-static int per_cpu_ops_alloc(struct ftrace_ops *ops)
-{
- int __percpu *disabled;
-
- if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
- return -EINVAL;
-
- disabled = alloc_percpu(int);
- if (!disabled)
- return -ENOMEM;
-
- ops->disabled = disabled;
- per_cpu_ops_disable_all(ops);
- return 0;
-}
-
static void ftrace_sync(struct work_struct *work)
{
/*
@@ -262,8 +238,8 @@ static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
* If this is a dynamic, RCU, or per CPU ops, or we force list func,
* then it needs to call the list anyway.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU |
- FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC)
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) ||
+ FTRACE_FORCE_LIST_FUNC)
return ftrace_ops_list_func;
return ftrace_ops_get_func(ops);
@@ -422,11 +398,6 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if (!core_kernel_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
- if (ops->flags & FTRACE_OPS_FL_PER_CPU) {
- if (per_cpu_ops_alloc(ops))
- return -ENOMEM;
- }
-
add_ftrace_ops(&ftrace_ops_list, ops);
/* Always save the function, and reset at unregistering */
@@ -2727,11 +2698,6 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
}
-static void per_cpu_ops_free(struct ftrace_ops *ops)
-{
- free_percpu(ops->disabled);
-}
-
static void ftrace_startup_enable(int command)
{
if (saved_ftrace_func != ftrace_trace_function) {
@@ -2833,7 +2799,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* not currently active, we can just free them
* without synchronizing all CPUs.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU))
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
goto free_ops;
return 0;
@@ -2880,7 +2846,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* The same goes for freeing the per_cpu data of the per_cpu
* ops.
*/
- if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC) {
/*
* We need to do a hard force of sched synchronization.
* This is because we use preempt_disable() to do RCU, but
@@ -2903,9 +2869,6 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
free_ops:
arch_ftrace_trampoline_free(ops);
-
- if (ops->flags & FTRACE_OPS_FL_PER_CPU)
- per_cpu_ops_free(ops);
}
return 0;
@@ -5672,10 +5635,29 @@ static int ftrace_process_locs(struct module *mod,
return ret;
}
+struct ftrace_mod_func {
+ struct list_head list;
+ char *name;
+ unsigned long ip;
+ unsigned int size;
+};
+
+struct ftrace_mod_map {
+ struct rcu_head rcu;
+ struct list_head list;
+ struct module *mod;
+ unsigned long start_addr;
+ unsigned long end_addr;
+ struct list_head funcs;
+ unsigned int num_funcs;
+};
+
#ifdef CONFIG_MODULES
#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
+static LIST_HEAD(ftrace_mod_maps);
+
static int referenced_filters(struct dyn_ftrace *rec)
{
struct ftrace_ops *ops;
@@ -5729,8 +5711,26 @@ static void clear_mod_from_hashes(struct ftrace_page *pg)
mutex_unlock(&trace_types_lock);
}
+static void ftrace_free_mod_map(struct rcu_head *rcu)
+{
+ struct ftrace_mod_map *mod_map = container_of(rcu, struct ftrace_mod_map, rcu);
+ struct ftrace_mod_func *mod_func;
+ struct ftrace_mod_func *n;
+
+ /* All the contents of mod_map are now not visible to readers */
+ list_for_each_entry_safe(mod_func, n, &mod_map->funcs, list) {
+ kfree(mod_func->name);
+ list_del(&mod_func->list);
+ kfree(mod_func);
+ }
+
+ kfree(mod_map);
+}
+
void ftrace_release_mod(struct module *mod)
{
+ struct ftrace_mod_map *mod_map;
+ struct ftrace_mod_map *n;
struct dyn_ftrace *rec;
struct ftrace_page **last_pg;
struct ftrace_page *tmp_page = NULL;
@@ -5742,6 +5742,14 @@ void ftrace_release_mod(struct module *mod)
if (ftrace_disabled)
goto out_unlock;
+ list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) {
+ if (mod_map->mod == mod) {
+ list_del_rcu(&mod_map->list);
+ call_rcu_sched(&mod_map->rcu, ftrace_free_mod_map);
+ break;
+ }
+ }
+
/*
* Each module has its own ftrace_pages, remove
* them from the list.
@@ -5749,7 +5757,8 @@ void ftrace_release_mod(struct module *mod)
last_pg = &ftrace_pages_start;
for (pg = ftrace_pages_start; pg; pg = *last_pg) {
rec = &pg->records[0];
- if (within_module_core(rec->ip, mod)) {
+ if (within_module_core(rec->ip, mod) ||
+ within_module_init(rec->ip, mod)) {
/*
* As core pages are first, the first
* page should never be a module page.
@@ -5818,7 +5827,8 @@ void ftrace_module_enable(struct module *mod)
* not part of this module, then skip this pg,
* which the "break" will do.
*/
- if (!within_module_core(rec->ip, mod))
+ if (!within_module_core(rec->ip, mod) &&
+ !within_module_init(rec->ip, mod))
break;
cnt = 0;
@@ -5863,23 +5873,245 @@ void ftrace_module_init(struct module *mod)
ftrace_process_locs(mod, mod->ftrace_callsites,
mod->ftrace_callsites + mod->num_ftrace_callsites);
}
+
+static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
+ struct dyn_ftrace *rec)
+{
+ struct ftrace_mod_func *mod_func;
+ unsigned long symsize;
+ unsigned long offset;
+ char str[KSYM_SYMBOL_LEN];
+ char *modname;
+ const char *ret;
+
+ ret = kallsyms_lookup(rec->ip, &symsize, &offset, &modname, str);
+ if (!ret)
+ return;
+
+ mod_func = kmalloc(sizeof(*mod_func), GFP_KERNEL);
+ if (!mod_func)
+ return;
+
+ mod_func->name = kstrdup(str, GFP_KERNEL);
+ if (!mod_func->name) {
+ kfree(mod_func);
+ return;
+ }
+
+ mod_func->ip = rec->ip - offset;
+ mod_func->size = symsize;
+
+ mod_map->num_funcs++;
+
+ list_add_rcu(&mod_func->list, &mod_map->funcs);
+}
+
+static struct ftrace_mod_map *
+allocate_ftrace_mod_map(struct module *mod,
+ unsigned long start, unsigned long end)
+{
+ struct ftrace_mod_map *mod_map;
+
+ mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL);
+ if (!mod_map)
+ return NULL;
+
+ mod_map->mod = mod;
+ mod_map->start_addr = start;
+ mod_map->end_addr = end;
+ mod_map->num_funcs = 0;
+
+ INIT_LIST_HEAD_RCU(&mod_map->funcs);
+
+ list_add_rcu(&mod_map->list, &ftrace_mod_maps);
+
+ return mod_map;
+}
+
+static const char *
+ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
+ unsigned long addr, unsigned long *size,
+ unsigned long *off, char *sym)
+{
+ struct ftrace_mod_func *found_func = NULL;
+ struct ftrace_mod_func *mod_func;
+
+ list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) {
+ if (addr >= mod_func->ip &&
+ addr < mod_func->ip + mod_func->size) {
+ found_func = mod_func;
+ break;
+ }
+ }
+
+ if (found_func) {
+ if (size)
+ *size = found_func->size;
+ if (off)
+ *off = addr - found_func->ip;
+ if (sym)
+ strlcpy(sym, found_func->name, KSYM_NAME_LEN);
+
+ return found_func->name;
+ }
+
+ return NULL;
+}
+
+const char *
+ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
+ unsigned long *off, char **modname, char *sym)
+{
+ struct ftrace_mod_map *mod_map;
+ const char *ret = NULL;
+
+ /* mod_map is freed via call_rcu_sched() */
+ preempt_disable();
+ list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
+ ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym);
+ if (ret) {
+ if (modname)
+ *modname = mod_map->mod->name;
+ break;
+ }
+ }
+ preempt_enable();
+
+ return ret;
+}
+
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+ char *type, char *name,
+ char *module_name, int *exported)
+{
+ struct ftrace_mod_map *mod_map;
+ struct ftrace_mod_func *mod_func;
+
+ preempt_disable();
+ list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
+
+ if (symnum >= mod_map->num_funcs) {
+ symnum -= mod_map->num_funcs;
+ continue;
+ }
+
+ list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) {
+ if (symnum > 1) {
+ symnum--;
+ continue;
+ }
+
+ *value = mod_func->ip;
+ *type = 'T';
+ strlcpy(name, mod_func->name, KSYM_NAME_LEN);
+ strlcpy(module_name, mod_map->mod->name, MODULE_NAME_LEN);
+ *exported = 1;
+ preempt_enable();
+ return 0;
+ }
+ WARN_ON(1);
+ break;
+ }
+ preempt_enable();
+ return -ERANGE;
+}
+
+#else
+static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
+ struct dyn_ftrace *rec) { }
+static inline struct ftrace_mod_map *
+allocate_ftrace_mod_map(struct module *mod,
+ unsigned long start, unsigned long end)
+{
+ return NULL;
+}
#endif /* CONFIG_MODULES */
-void __init ftrace_free_init_mem(void)
+struct ftrace_init_func {
+ struct list_head list;
+ unsigned long ip;
+};
+
+/* Clear any init ips from hashes */
+static void
+clear_func_from_hash(struct ftrace_init_func *func, struct ftrace_hash *hash)
+{
+ struct ftrace_func_entry *entry;
+
+ if (ftrace_hash_empty(hash))
+ return;
+
+ entry = __ftrace_lookup_ip(hash, func->ip);
+
+ /*
+ * Do not allow this rec to match again.
+ * Yeah, it may waste some memory, but will be removed
+ * if/when the hash is modified again.
+ */
+ if (entry)
+ entry->ip = 0;
+}
+
+static void
+clear_func_from_hashes(struct ftrace_init_func *func)
+{
+ struct trace_array *tr;
+
+ mutex_lock(&trace_types_lock);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!tr->ops || !tr->ops->func_hash)
+ continue;
+ mutex_lock(&tr->ops->func_hash->regex_lock);
+ clear_func_from_hash(func, tr->ops->func_hash->filter_hash);
+ clear_func_from_hash(func, tr->ops->func_hash->notrace_hash);
+ mutex_unlock(&tr->ops->func_hash->regex_lock);
+ }
+ mutex_unlock(&trace_types_lock);
+}
+
+static void add_to_clear_hash_list(struct list_head *clear_list,
+ struct dyn_ftrace *rec)
+{
+ struct ftrace_init_func *func;
+
+ func = kmalloc(sizeof(*func), GFP_KERNEL);
+ if (!func) {
+ WARN_ONCE(1, "alloc failure, ftrace filter could be stale\n");
+ return;
+ }
+
+ func->ip = rec->ip;
+ list_add(&func->list, clear_list);
+}
+
+void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
{
- unsigned long start = (unsigned long)(&__init_begin);
- unsigned long end = (unsigned long)(&__init_end);
+ unsigned long start = (unsigned long)(start_ptr);
+ unsigned long end = (unsigned long)(end_ptr);
struct ftrace_page **last_pg = &ftrace_pages_start;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
struct dyn_ftrace key;
+ struct ftrace_mod_map *mod_map = NULL;
+ struct ftrace_init_func *func, *func_next;
+ struct list_head clear_hash;
int order;
+ INIT_LIST_HEAD(&clear_hash);
+
key.ip = start;
key.flags = end; /* overload flags, as it is unsigned long */
mutex_lock(&ftrace_lock);
+ /*
+ * If we are freeing module init memory, then check if
+ * any tracer is active. If so, we need to save a mapping of
+ * the module functions being freed with the address.
+ */
+ if (mod && ftrace_ops_list != &ftrace_list_end)
+ mod_map = allocate_ftrace_mod_map(mod, start, end);
+
for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) {
if (end < pg->records[0].ip ||
start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
@@ -5890,6 +6122,13 @@ void __init ftrace_free_init_mem(void)
ftrace_cmp_recs);
if (!rec)
continue;
+
+ /* rec will be cleared from hashes after ftrace_lock unlock */
+ add_to_clear_hash_list(&clear_hash, rec);
+
+ if (mod_map)
+ save_ftrace_mod_rec(mod_map, rec);
+
pg->index--;
ftrace_update_tot_cnt--;
if (!pg->index) {
@@ -5908,6 +6147,19 @@ void __init ftrace_free_init_mem(void)
goto again;
}
mutex_unlock(&ftrace_lock);
+
+ list_for_each_entry_safe(func, func_next, &clear_hash, list) {
+ clear_func_from_hashes(func);
+ kfree(func);
+ }
+}
+
+void __init ftrace_free_init_mem(void)
+{
+ void *start = (void *)(&__init_begin);
+ void *end = (void *)(&__init_end);
+
+ ftrace_free_mem(NULL, start, end);
}
void __init ftrace_init(void)
@@ -6063,10 +6315,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
* If any of the above fails then the op->func() is not executed.
*/
if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
- (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
- !ftrace_function_local_disabled(op)) &&
ftrace_ops_test(op, ip, regs)) {
-
if (FTRACE_WARN_ON(!op->func)) {
pr_warn("op=%p %pS\n", op, op);
goto out;
@@ -6124,10 +6373,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
preempt_disable_notrace();
- if (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
- !ftrace_function_local_disabled(op)) {
- op->func(ip, parent_ip, op, regs);
- }
+ op->func(ip, parent_ip, op, regs);
preempt_enable_notrace();
trace_clear_recursion(bit);
@@ -6151,7 +6397,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
* or does per cpu logic, then we need to call the assist handler.
*/
if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
- ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU))
+ ops->flags & FTRACE_OPS_FL_RCU)
return ftrace_ops_assist_func;
return ops->func;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d57fede84b38..91874a95060d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2536,61 +2536,29 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* The lock and unlock are done within a preempt disable section.
* The current_context per_cpu variable can only be modified
* by the current task between lock and unlock. But it can
- * be modified more than once via an interrupt. To pass this
- * information from the lock to the unlock without having to
- * access the 'in_interrupt()' functions again (which do show
- * a bit of overhead in something as critical as function tracing,
- * we use a bitmask trick.
+ * be modified more than once via an interrupt. There are four
+ * different contexts that we need to consider.
*
- * bit 0 = NMI context
- * bit 1 = IRQ context
- * bit 2 = SoftIRQ context
- * bit 3 = normal context.
+ * Normal context.
+ * SoftIRQ context
+ * IRQ context
+ * NMI context
*
- * This works because this is the order of contexts that can
- * preempt other contexts. A SoftIRQ never preempts an IRQ
- * context.
- *
- * When the context is determined, the corresponding bit is
- * checked and set (if it was set, then a recursion of that context
- * happened).
- *
- * On unlock, we need to clear this bit. To do so, just subtract
- * 1 from the current_context and AND it to itself.
- *
- * (binary)
- * 101 - 1 = 100
- * 101 & 100 = 100 (clearing bit zero)
- *
- * 1010 - 1 = 1001
- * 1010 & 1001 = 1000 (clearing bit 1)
- *
- * The least significant bit can be cleared this way, and it
- * just so happens that it is the same bit corresponding to
- * the current context.
+ * If for some reason the ring buffer starts to recurse, we
+ * only allow that to happen at most 4 times (one for each
+ * context). If it happens 5 times, then we consider this a
+ * recusive loop and do not let it go further.
*/
static __always_inline int
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
- unsigned int val = cpu_buffer->current_context;
- int bit;
-
- if (in_interrupt()) {
- if (in_nmi())
- bit = RB_CTX_NMI;
- else if (in_irq())
- bit = RB_CTX_IRQ;
- else
- bit = RB_CTX_SOFTIRQ;
- } else
- bit = RB_CTX_NORMAL;
-
- if (unlikely(val & (1 << bit)))
+ if (cpu_buffer->current_context >= 4)
return 1;
- val |= (1 << bit);
- cpu_buffer->current_context = val;
+ cpu_buffer->current_context++;
+ /* Interrupts must see this update */
+ barrier();
return 0;
}
@@ -2598,7 +2566,9 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
static __always_inline void
trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
{
- cpu_buffer->current_context &= cpu_buffer->current_context - 1;
+ /* Don't let the dec leak out */
+ barrier();
+ cpu_buffer->current_context--;
}
/**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 752e5daf0896..73e67b68c53b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7687,6 +7687,7 @@ static int instance_mkdir(const char *name)
struct trace_array *tr;
int ret;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = -EEXIST;
@@ -7742,6 +7743,7 @@ static int instance_mkdir(const char *name)
list_add(&tr->list, &ftrace_trace_arrays);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return 0;
@@ -7753,6 +7755,7 @@ static int instance_mkdir(const char *name)
out_unlock:
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
@@ -7765,6 +7768,7 @@ static int instance_rmdir(const char *name)
int ret;
int i;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = -ENODEV;
@@ -7810,6 +7814,7 @@ static int instance_rmdir(const char *name)
out_unlock:
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
}
@@ -8276,6 +8281,92 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
}
EXPORT_SYMBOL_GPL(ftrace_dump);
+int trace_run_command(const char *buf, int (*createfn)(int, char **))
+{
+ char **argv;
+ int argc, ret;
+
+ argc = 0;
+ ret = 0;
+ argv = argv_split(GFP_KERNEL, buf, &argc);
+ if (!argv)
+ return -ENOMEM;
+
+ if (argc)
+ ret = createfn(argc, argv);
+
+ argv_free(argv);
+
+ return ret;
+}
+
+#define WRITE_BUFSIZE 4096
+
+ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos,
+ int (*createfn)(int, char **))
+{
+ char *kbuf, *buf, *tmp;
+ int ret = 0;
+ size_t done = 0;
+ size_t size;
+
+ kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ while (done < count) {
+ size = count - done;
+
+ if (size >= WRITE_BUFSIZE)
+ size = WRITE_BUFSIZE - 1;
+
+ if (copy_from_user(kbuf, buffer + done, size)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ kbuf[size] = '\0';
+ buf = kbuf;
+ do {
+ tmp = strchr(buf, '\n');
+ if (tmp) {
+ *tmp = '\0';
+ size = tmp - buf + 1;
+ } else {
+ size = strlen(buf);
+ if (done + size < count) {
+ if (buf != kbuf)
+ break;
+ /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
+ pr_warn("Line length is too long: Should be less than %d\n",
+ WRITE_BUFSIZE - 2);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ done += size;
+
+ /* Remove comments */
+ tmp = strchr(buf, '#');
+
+ if (tmp)
+ *tmp = '\0';
+
+ ret = trace_run_command(buf, createfn);
+ if (ret)
+ goto out;
+ buf += size;
+
+ } while (done < count);
+ }
+ ret = done;
+
+out:
+ kfree(kbuf);
+
+ return ret;
+}
+
__init static int tracer_alloc_buffers(void)
{
int ring_buf_size;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6b0b343a36a2..2a6d0325a761 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -739,8 +739,6 @@ extern int trace_selftest_startup_wakeup(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_nop(struct tracer *trace,
struct trace_array *tr);
-extern int trace_selftest_startup_sched_switch(struct tracer *trace,
- struct trace_array *tr);
extern int trace_selftest_startup_branch(struct tracer *trace,
struct trace_array *tr);
/*
@@ -1755,6 +1753,13 @@ void trace_printk_start_comm(void);
int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
+#define MAX_EVENT_NAME_LEN 64
+
+extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
+extern ssize_t trace_parse_run_command(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos,
+ int (*createfn)(int, char**));
+
/*
* Normal trace_printk() and friends allocates special buffers
* to do the manipulation, as well as saves the print formats
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 13ba2d3f6a91..55d6dff37daf 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -240,27 +240,41 @@ void perf_trace_destroy(struct perf_event *p_event)
int perf_trace_add(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
- struct hlist_head __percpu *pcpu_list;
- struct hlist_head *list;
-
- pcpu_list = tp_event->perf_events;
- if (WARN_ON_ONCE(!pcpu_list))
- return -EINVAL;
if (!(flags & PERF_EF_START))
p_event->hw.state = PERF_HES_STOPPED;
- list = this_cpu_ptr(pcpu_list);
- hlist_add_head_rcu(&p_event->hlist_entry, list);
+ /*
+ * If TRACE_REG_PERF_ADD returns false; no custom action was performed
+ * and we need to take the default action of enqueueing our event on
+ * the right per-cpu hlist.
+ */
+ if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
+ struct hlist_head __percpu *pcpu_list;
+ struct hlist_head *list;
+
+ pcpu_list = tp_event->perf_events;
+ if (WARN_ON_ONCE(!pcpu_list))
+ return -EINVAL;
+
+ list = this_cpu_ptr(pcpu_list);
+ hlist_add_head_rcu(&p_event->hlist_entry, list);
+ }
- return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
+ return 0;
}
void perf_trace_del(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
- hlist_del_rcu(&p_event->hlist_entry);
- tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
+
+ /*
+ * If TRACE_REG_PERF_DEL returns false; no custom action was performed
+ * and we need to take the default action of dequeueing our event from
+ * the right per-cpu hlist.
+ */
+ if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
+ hlist_del_rcu(&p_event->hlist_entry);
}
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
@@ -306,16 +320,25 @@ static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
- struct perf_event *event;
struct ftrace_entry *entry;
- struct hlist_head *head;
+ struct perf_event *event;
+ struct hlist_head head;
struct pt_regs regs;
int rctx;
- head = this_cpu_ptr(event_function.perf_events);
- if (hlist_empty(head))
+ if ((unsigned long)ops->private != smp_processor_id())
return;
+ event = container_of(ops, struct perf_event, ftrace_ops);
+
+ /*
+ * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
+ * the perf code does is hlist_for_each_entry_rcu(), so we can
+ * get away with simply setting the @head.first pointer in order
+ * to create a singular list.
+ */
+ head.first = &event->hlist_entry;
+
#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
sizeof(u64)) - sizeof(u32))
@@ -330,9 +353,8 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
entry->ip = ip;
entry->parent_ip = parent_ip;
- event = container_of(ops, struct perf_event, ftrace_ops);
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
- 1, &regs, head, NULL, event);
+ 1, &regs, &head, NULL);
#undef ENTRY_SIZE
}
@@ -341,8 +363,10 @@ static int perf_ftrace_function_register(struct perf_event *event)
{
struct ftrace_ops *ops = &event->ftrace_ops;
- ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
- ops->func = perf_ftrace_function_call;
+ ops->flags = FTRACE_OPS_FL_RCU;
+ ops->func = perf_ftrace_function_call;
+ ops->private = (void *)(unsigned long)nr_cpu_ids;
+
return register_ftrace_function(ops);
}
@@ -354,19 +378,11 @@ static int perf_ftrace_function_unregister(struct perf_event *event)
return ret;
}
-static void perf_ftrace_function_enable(struct perf_event *event)
-{
- ftrace_function_local_enable(&event->ftrace_ops);
-}
-
-static void perf_ftrace_function_disable(struct perf_event *event)
-{
- ftrace_function_local_disable(&event->ftrace_ops);
-}
-
int perf_ftrace_event_register(struct trace_event_call *call,
enum trace_reg type, void *data)
{
+ struct perf_event *event = data;
+
switch (type) {
case TRACE_REG_REGISTER:
case TRACE_REG_UNREGISTER:
@@ -379,11 +395,11 @@ int perf_ftrace_event_register(struct trace_event_call *call,
case TRACE_REG_PERF_CLOSE:
return perf_ftrace_function_unregister(data);
case TRACE_REG_PERF_ADD:
- perf_ftrace_function_enable(data);
- return 0;
+ event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
+ return 1;
case TRACE_REG_PERF_DEL:
- perf_ftrace_function_disable(data);
- return 0;
+ event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
+ return 1;
}
return -EINVAL;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 87468398b9ed..ec0f9aa4e151 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1406,8 +1406,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
return -ENODEV;
/* Make sure the system still exists */
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
list_for_each_entry(dir, &tr->systems, list) {
if (dir == inode->i_private) {
@@ -1421,8 +1421,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
}
}
exit_loop:
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
if (!system)
return -ENODEV;
@@ -2294,15 +2294,15 @@ static void __add_event_to_tracers(struct trace_event_call *call);
int trace_add_event_call(struct trace_event_call *call)
{
int ret;
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
ret = __register_event(call, NULL);
if (ret >= 0)
__add_event_to_tracers(call);
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
}
@@ -2356,13 +2356,13 @@ int trace_remove_event_call(struct trace_event_call *call)
{
int ret;
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
down_write(&trace_event_sem);
ret = probe_remove_event_call(call);
up_write(&trace_event_sem);
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
}
@@ -2424,8 +2424,8 @@ static int trace_module_notify(struct notifier_block *self,
{
struct module *mod = data;
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
switch (val) {
case MODULE_STATE_COMING:
trace_module_add_events(mod);
@@ -2434,8 +2434,8 @@ static int trace_module_notify(struct notifier_block *self,
trace_module_remove_events(mod);
break;
}
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return 0;
}
@@ -2950,24 +2950,24 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
* creates the event hierachry in the @parent/events directory.
*
* Returns 0 on success.
+ *
+ * Must be called with event_mutex held.
*/
int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
{
int ret;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
ret = create_event_toplevel_files(parent, tr);
if (ret)
- goto out_unlock;
+ goto out;
down_write(&trace_event_sem);
__trace_add_event_dirs(tr);
up_write(&trace_event_sem);
- out_unlock:
- mutex_unlock(&event_mutex);
-
+ out:
return ret;
}
@@ -2996,9 +2996,10 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
return ret;
}
+/* Must be called with event_mutex held */
int event_trace_del_tracer(struct trace_array *tr)
{
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
/* Disable any event triggers and associated soft-disabled events */
clear_event_triggers(tr);
@@ -3019,8 +3020,6 @@ int event_trace_del_tracer(struct trace_array *tr)
tr->event_dir = NULL;
- mutex_unlock(&event_mutex);
-
return 0;
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1c21d0e2a145..1e1558c99d56 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -28,12 +28,16 @@ struct hist_field;
typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
+#define HIST_FIELD_OPERANDS_MAX 2
+
struct hist_field {
struct ftrace_event_field *field;
unsigned long flags;
hist_field_fn_t fn;
unsigned int size;
unsigned int offset;
+ unsigned int is_signed;
+ struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
};
static u64 hist_field_none(struct hist_field *field, void *event)
@@ -71,7 +75,9 @@ static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
static u64 hist_field_log2(struct hist_field *hist_field, void *event)
{
- u64 val = *(u64 *)(event + hist_field->field->offset);
+ struct hist_field *operand = hist_field->operands[0];
+
+ u64 val = operand->fn(operand, event);
return (u64) ilog2(roundup_pow_of_two(val));
}
@@ -110,16 +116,16 @@ DEFINE_HIST_FIELD_FN(u8);
#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
enum hist_field_flags {
- HIST_FIELD_FL_HITCOUNT = 1,
- HIST_FIELD_FL_KEY = 2,
- HIST_FIELD_FL_STRING = 4,
- HIST_FIELD_FL_HEX = 8,
- HIST_FIELD_FL_SYM = 16,
- HIST_FIELD_FL_SYM_OFFSET = 32,
- HIST_FIELD_FL_EXECNAME = 64,
- HIST_FIELD_FL_SYSCALL = 128,
- HIST_FIELD_FL_STACKTRACE = 256,
- HIST_FIELD_FL_LOG2 = 512,
+ HIST_FIELD_FL_HITCOUNT = 1 << 0,
+ HIST_FIELD_FL_KEY = 1 << 1,
+ HIST_FIELD_FL_STRING = 1 << 2,
+ HIST_FIELD_FL_HEX = 1 << 3,
+ HIST_FIELD_FL_SYM = 1 << 4,
+ HIST_FIELD_FL_SYM_OFFSET = 1 << 5,
+ HIST_FIELD_FL_EXECNAME = 1 << 6,
+ HIST_FIELD_FL_SYSCALL = 1 << 7,
+ HIST_FIELD_FL_STACKTRACE = 1 << 8,
+ HIST_FIELD_FL_LOG2 = 1 << 9,
};
struct hist_trigger_attrs {
@@ -146,6 +152,25 @@ struct hist_trigger_data {
struct tracing_map *map;
};
+static const char *hist_field_name(struct hist_field *field,
+ unsigned int level)
+{
+ const char *field_name = "";
+
+ if (level > 1)
+ return field_name;
+
+ if (field->field)
+ field_name = field->field->name;
+ else if (field->flags & HIST_FIELD_FL_LOG2)
+ field_name = hist_field_name(field->operands[0], ++level);
+
+ if (field_name == NULL)
+ field_name = "";
+
+ return field_name;
+}
+
static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
{
hist_field_fn_t fn = NULL;
@@ -340,8 +365,20 @@ static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
.elt_init = hist_trigger_elt_comm_init,
};
-static void destroy_hist_field(struct hist_field *hist_field)
+static void destroy_hist_field(struct hist_field *hist_field,
+ unsigned int level)
{
+ unsigned int i;
+
+ if (level > 2)
+ return;
+
+ if (!hist_field)
+ return;
+
+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
+ destroy_hist_field(hist_field->operands[i], level + 1);
+
kfree(hist_field);
}
@@ -368,7 +405,10 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
}
if (flags & HIST_FIELD_FL_LOG2) {
+ unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
hist_field->fn = hist_field_log2;
+ hist_field->operands[0] = create_hist_field(field, fl);
+ hist_field->size = hist_field->operands[0]->size;
goto out;
}
@@ -388,7 +428,7 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
hist_field->fn = select_value_fn(field->size,
field->is_signed);
if (!hist_field->fn) {
- destroy_hist_field(hist_field);
+ destroy_hist_field(hist_field, 0);
return NULL;
}
}
@@ -405,7 +445,7 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data)
for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
if (hist_data->fields[i]) {
- destroy_hist_field(hist_data->fields[i]);
+ destroy_hist_field(hist_data->fields[i], 0);
hist_data->fields[i] = NULL;
}
}
@@ -450,7 +490,7 @@ static int create_val_field(struct hist_trigger_data *hist_data,
}
field = trace_find_event_field(file->event_call, field_name);
- if (!field) {
+ if (!field || !field->size) {
ret = -EINVAL;
goto out;
}
@@ -548,7 +588,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
}
field = trace_find_event_field(file->event_call, field_name);
- if (!field) {
+ if (!field || !field->size) {
ret = -EINVAL;
goto out;
}
@@ -653,7 +693,6 @@ static int is_descending(const char *str)
static int create_sort_keys(struct hist_trigger_data *hist_data)
{
char *fields_str = hist_data->attrs->sort_key_str;
- struct ftrace_event_field *field = NULL;
struct tracing_map_sort_key *sort_key;
int descending, ret = 0;
unsigned int i, j;
@@ -670,7 +709,9 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
}
for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
+ struct hist_field *hist_field;
char *field_str, *field_name;
+ const char *test_name;
sort_key = &hist_data->sort_keys[i];
@@ -703,8 +744,10 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
}
for (j = 1; j < hist_data->n_fields; j++) {
- field = hist_data->fields[j]->field;
- if (field && (strcmp(field_name, field->name) == 0)) {
+ hist_field = hist_data->fields[j];
+ test_name = hist_field_name(hist_field, 0);
+
+ if (strcmp(field_name, test_name) == 0) {
sort_key->field_idx = j;
descending = is_descending(field_str);
if (descending < 0) {
@@ -952,6 +995,7 @@ hist_trigger_entry_print(struct seq_file *m,
struct hist_field *key_field;
char str[KSYM_SYMBOL_LEN];
bool multiline = false;
+ const char *field_name;
unsigned int i;
u64 uval;
@@ -963,26 +1007,27 @@ hist_trigger_entry_print(struct seq_file *m,
if (i > hist_data->n_vals)
seq_puts(m, ", ");
+ field_name = hist_field_name(key_field, 0);
+
if (key_field->flags & HIST_FIELD_FL_HEX) {
uval = *(u64 *)(key + key_field->offset);
- seq_printf(m, "%s: %llx",
- key_field->field->name, uval);
+ seq_printf(m, "%s: %llx", field_name, uval);
} else if (key_field->flags & HIST_FIELD_FL_SYM) {
uval = *(u64 *)(key + key_field->offset);
sprint_symbol_no_offset(str, uval);
- seq_printf(m, "%s: [%llx] %-45s",
- key_field->field->name, uval, str);
+ seq_printf(m, "%s: [%llx] %-45s", field_name,
+ uval, str);
} else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
uval = *(u64 *)(key + key_field->offset);
sprint_symbol(str, uval);
- seq_printf(m, "%s: [%llx] %-55s",
- key_field->field->name, uval, str);
+ seq_printf(m, "%s: [%llx] %-55s", field_name,
+ uval, str);
} else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
char *comm = elt->private_data;
uval = *(u64 *)(key + key_field->offset);
- seq_printf(m, "%s: %-16s[%10llu]",
- key_field->field->name, comm, uval);
+ seq_printf(m, "%s: %-16s[%10llu]", field_name,
+ comm, uval);
} else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
const char *syscall_name;
@@ -991,8 +1036,8 @@ hist_trigger_entry_print(struct seq_file *m,
if (!syscall_name)
syscall_name = "unknown_syscall";
- seq_printf(m, "%s: %-30s[%3llu]",
- key_field->field->name, syscall_name, uval);
+ seq_printf(m, "%s: %-30s[%3llu]", field_name,
+ syscall_name, uval);
} else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
seq_puts(m, "stacktrace:\n");
hist_trigger_stacktrace_print(m,
@@ -1000,15 +1045,14 @@ hist_trigger_entry_print(struct seq_file *m,
HIST_STACKTRACE_DEPTH);
multiline = true;
} else if (key_field->flags & HIST_FIELD_FL_LOG2) {
- seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name,
+ seq_printf(m, "%s: ~ 2^%-2llu", field_name,
*(u64 *)(key + key_field->offset));
} else if (key_field->flags & HIST_FIELD_FL_STRING) {
- seq_printf(m, "%s: %-50s", key_field->field->name,
+ seq_printf(m, "%s: %-50s", field_name,
(char *)(key + key_field->offset));
} else {
uval = *(u64 *)(key + key_field->offset);
- seq_printf(m, "%s: %10llu", key_field->field->name,
- uval);
+ seq_printf(m, "%s: %10llu", field_name, uval);
}
}
@@ -1021,13 +1065,13 @@ hist_trigger_entry_print(struct seq_file *m,
tracing_map_read_sum(elt, HITCOUNT_IDX));
for (i = 1; i < hist_data->n_vals; i++) {
+ field_name = hist_field_name(hist_data->fields[i], 0);
+
if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
- seq_printf(m, " %s: %10llx",
- hist_data->fields[i]->field->name,
+ seq_printf(m, " %s: %10llx", field_name,
tracing_map_read_sum(elt, i));
} else {
- seq_printf(m, " %s: %10llu",
- hist_data->fields[i]->field->name,
+ seq_printf(m, " %s: %10llu", field_name,
tracing_map_read_sum(elt, i));
}
}
@@ -1062,7 +1106,7 @@ static void hist_trigger_show(struct seq_file *m,
struct event_trigger_data *data, int n)
{
struct hist_trigger_data *hist_data;
- int n_entries, ret = 0;
+ int n_entries;
if (n > 0)
seq_puts(m, "\n\n");
@@ -1073,10 +1117,8 @@ static void hist_trigger_show(struct seq_file *m,
hist_data = data->private_data;
n_entries = print_entries(m, hist_data);
- if (n_entries < 0) {
- ret = n_entries;
+ if (n_entries < 0)
n_entries = 0;
- }
seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n",
(u64)atomic64_read(&hist_data->map->hits),
@@ -1142,7 +1184,9 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
{
- seq_printf(m, "%s", hist_field->field->name);
+ const char *field_name = hist_field_name(hist_field, 0);
+
+ seq_printf(m, "%s", field_name);
if (hist_field->flags) {
const char *flags_str = get_hist_field_flags(hist_field);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7758bc0617cb..03ecb4465ee4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -16,6 +16,10 @@
#include "trace.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/preemptirq.h>
+
+#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
static struct trace_array *irqsoff_trace __read_mostly;
static int tracer_enabled __read_mostly;
@@ -463,63 +467,43 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
#else /* !CONFIG_PROVE_LOCKING */
/*
- * Stubs:
- */
-
-void trace_softirqs_on(unsigned long ip)
-{
-}
-
-void trace_softirqs_off(unsigned long ip)
-{
-}
-
-inline void print_irqtrace_events(struct task_struct *curr)
-{
-}
-
-/*
* We are only interested in hardirq on/off events:
*/
-void trace_hardirqs_on(void)
+static inline void tracer_hardirqs_on(void)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL(trace_hardirqs_on);
-void trace_hardirqs_off(void)
+static inline void tracer_hardirqs_off(void)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL(trace_hardirqs_off);
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, caller_addr);
}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, caller_addr);
}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_PROVE_LOCKING */
#endif /* CONFIG_IRQSOFF_TRACER */
#ifdef CONFIG_PREEMPT_TRACER
-void trace_preempt_on(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1)
{
if (preempt_trace() && !irq_trace())
stop_critical_timing(a0, a1);
}
-void trace_preempt_off(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1)
{
if (preempt_trace() && !irq_trace())
start_critical_timing(a0, a1);
@@ -781,3 +765,100 @@ __init static int init_irqsoff_tracer(void)
return 0;
}
core_initcall(init_irqsoff_tracer);
+#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */
+
+#ifndef CONFIG_IRQSOFF_TRACER
+static inline void tracer_hardirqs_on(void) { }
+static inline void tracer_hardirqs_off(void) { }
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { }
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { }
+#endif
+
+#ifndef CONFIG_PREEMPT_TRACER
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
+#endif
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
+/* Per-cpu variable to prevent redundant calls when IRQs already off */
+static DEFINE_PER_CPU(int, tracing_irq_cpu);
+
+void trace_hardirqs_on(void)
+{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_on();
+
+ this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_off();
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
+ tracer_hardirqs_on_caller(caller_addr);
+
+ this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
+ tracer_hardirqs_off_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+/*
+ * Stubs:
+ */
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+#if defined(CONFIG_PREEMPT_TRACER) || \
+ (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+ trace_preempt_enable_rcuidle(a0, a1);
+ tracer_preempt_on(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+ trace_preempt_disable_rcuidle(a0, a1);
+ tracer_preempt_off(a0, a1);
+}
+#endif
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index abf92e478cfb..492700c5fb4d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -907,8 +907,8 @@ static int probes_open(struct inode *inode, struct file *file)
static ssize_t probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- return traceprobe_probes_write(file, buffer, count, ppos,
- create_trace_kprobe);
+ return trace_parse_run_command(file, buffer, count, ppos,
+ create_trace_kprobe);
}
static const struct file_operations kprobe_events_ops = {
@@ -1199,7 +1199,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
memset(&entry[1], 0, dsize);
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL, NULL);
+ head, NULL);
}
NOKPROBE_SYMBOL(kprobe_perf_func);
@@ -1234,7 +1234,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry->ret_ip = (unsigned long)ri->ret_addr;
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL, NULL);
+ head, NULL);
}
NOKPROBE_SYMBOL(kretprobe_perf_func);
#endif /* CONFIG_PERF_EVENTS */
@@ -1431,9 +1431,9 @@ static __init int kprobe_trace_self_tests_init(void)
pr_info("Testing kprobe tracing: ");
- ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
- "$stack $stack0 +0($stack)",
- create_trace_kprobe);
+ ret = trace_run_command("p:testprobe kprobe_trace_selftest_target "
+ "$stack $stack0 +0($stack)",
+ create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function entry.\n");
warn++;
@@ -1453,8 +1453,8 @@ static __init int kprobe_trace_self_tests_init(void)
}
}
- ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
- "$retval", create_trace_kprobe);
+ ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target "
+ "$retval", create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function return.\n");
warn++;
@@ -1524,13 +1524,13 @@ static __init int kprobe_trace_self_tests_init(void)
disable_trace_kprobe(tk, file);
}
- ret = traceprobe_command("-:testprobe", create_trace_kprobe);
+ ret = trace_run_command("-:testprobe", create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on deleting a probe.\n");
warn++;
}
- ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
+ ret = trace_run_command("-:testprobe2", create_trace_kprobe);
if (WARN_ON_ONCE(ret)) {
pr_warn("error on deleting a probe.\n");
warn++;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 52478f033f88..d59357308677 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -623,92 +623,6 @@ void traceprobe_free_probe_arg(struct probe_arg *arg)
kfree(arg->comm);
}
-int traceprobe_command(const char *buf, int (*createfn)(int, char **))
-{
- char **argv;
- int argc, ret;
-
- argc = 0;
- ret = 0;
- argv = argv_split(GFP_KERNEL, buf, &argc);
- if (!argv)
- return -ENOMEM;
-
- if (argc)
- ret = createfn(argc, argv);
-
- argv_free(argv);
-
- return ret;
-}
-
-#define WRITE_BUFSIZE 4096
-
-ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
- size_t count, loff_t *ppos,
- int (*createfn)(int, char **))
-{
- char *kbuf, *buf, *tmp;
- int ret = 0;
- size_t done = 0;
- size_t size;
-
- kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
- if (!kbuf)
- return -ENOMEM;
-
- while (done < count) {
- size = count - done;
-
- if (size >= WRITE_BUFSIZE)
- size = WRITE_BUFSIZE - 1;
-
- if (copy_from_user(kbuf, buffer + done, size)) {
- ret = -EFAULT;
- goto out;
- }
- kbuf[size] = '\0';
- buf = kbuf;
- do {
- tmp = strchr(buf, '\n');
- if (tmp) {
- *tmp = '\0';
- size = tmp - buf + 1;
- } else {
- size = strlen(buf);
- if (done + size < count) {
- if (buf != kbuf)
- break;
- /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
- pr_warn("Line length is too long: Should be less than %d\n",
- WRITE_BUFSIZE - 2);
- ret = -EINVAL;
- goto out;
- }
- }
- done += size;
-
- /* Remove comments */
- tmp = strchr(buf, '#');
-
- if (tmp)
- *tmp = '\0';
-
- ret = traceprobe_command(buf, createfn);
- if (ret)
- goto out;
- buf += size;
-
- } while (done < count);
- }
- ret = done;
-
-out:
- kfree(kbuf);
-
- return ret;
-}
-
static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
bool is_return)
{
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 903273c93e61..fb66e3eaa192 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -42,7 +42,6 @@
#define MAX_TRACE_ARGS 128
#define MAX_ARGSTR_LEN 63
-#define MAX_EVENT_NAME_LEN 64
#define MAX_STRING_SIZE PATH_MAX
/* Reserved field names */
@@ -356,12 +355,6 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg);
extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
-extern ssize_t traceprobe_probes_write(struct file *file,
- const char __user *buffer, size_t count, loff_t *ppos,
- int (*createfn)(int, char**));
-
-extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
-
/* Sum up total data length for dynamic arraies (strings) */
static nokprobe_inline int
__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index cd70eb5df38e..11e9daa4a568 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -60,7 +60,7 @@ static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
* Test the trace buffer to see if all the elements
* are still sane.
*/
-static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
+static int __maybe_unused trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
{
unsigned long flags, cnt = 0;
int cpu, ret = 0;
@@ -1151,38 +1151,6 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
}
#endif /* CONFIG_SCHED_TRACER */
-#ifdef CONFIG_CONTEXT_SWITCH_TRACER
-int
-trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
-{
- unsigned long count;
- int ret;
-
- /* start the tracing */
- ret = tracer_init(trace, tr);
- if (ret) {
- warn_failed_init_tracer(trace, ret);
- return ret;
- }
-
- /* Sleep for a 1/10 of a second */
- msleep(100);
- /* stop the tracing. */
- tracing_stop();
- /* check the trace buffer */
- ret = trace_test_buffer(&tr->trace_buffer, &count);
- trace->reset(tr);
- tracing_start();
-
- if (!ret && !count) {
- printk(KERN_CONT ".. no entries found ..");
- ret = -1;
- }
-
- return ret;
-}
-#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
-
#ifdef CONFIG_BRANCH_TRACER
int
trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 19bcaaac884b..f93a56d2db27 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -625,7 +625,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
- head, NULL, NULL);
+ head, NULL);
}
static int perf_sysenter_enable(struct trace_event_call *call)
@@ -721,7 +721,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
}
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
- 1, regs, head, NULL, NULL);
+ 1, regs, head, NULL);
}
static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 153c0e411461..40592e7b3568 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -651,7 +651,7 @@ static int probes_open(struct inode *inode, struct file *file)
static ssize_t probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
+ return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe);
}
static const struct file_operations uprobe_events_ops = {
@@ -1155,7 +1155,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
}
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL, NULL);
+ head, NULL);
out:
preempt_enable();
}
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 305039b122fa..07e75344725b 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -428,7 +428,8 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
if (test_key && test_key == key_hash && entry->val &&
keys_match(key, entry->val->key, map->key_size)) {
- atomic64_inc(&map->hits);
+ if (!lookup_only)
+ atomic64_inc(&map->hits);
return entry->val;
}
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index ab0ca77331d0..5b5bbf8ae550 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -6,7 +6,7 @@
#define TRACING_MAP_BITS_MAX 17
#define TRACING_MAP_BITS_MIN 7
-#define TRACING_MAP_KEYS_MAX 2
+#define TRACING_MAP_KEYS_MAX 3
#define TRACING_MAP_VALS_MAX 3
#define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \
TRACING_MAP_VALS_MAX)
diff --git a/kernel/umh.c b/kernel/umh.c
index 6ff9905250ff..18e5fa4b0e71 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -537,14 +537,14 @@ static int proc_cap_handler(struct ctl_table *table, int write,
/*
* Drop everything not in the new_cap (but don't add things)
*/
- spin_lock(&umh_sysctl_lock);
if (write) {
+ spin_lock(&umh_sysctl_lock);
if (table->data == CAP_BSET)
usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
if (table->data == CAP_PI)
usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+ spin_unlock(&umh_sysctl_lock);
}
- spin_unlock(&umh_sysctl_lock);
return 0;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dde6298f6b22..8fdb710bfdd7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1509,7 +1509,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work = &dwork->work;
WARN_ON_ONCE(!wq);
- WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn);
+ WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
WARN_ON_ONCE(timer_pending(timer));
WARN_ON_ONCE(!list_empty(&work->entry));