Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore | 2
-rw-r--r--  kernel/Kconfig.locks | 3
-rw-r--r--  kernel/Makefile | 11
-rw-r--r--  kernel/async.c | 53
-rw-r--r--  kernel/audit.c | 267
-rw-r--r--  kernel/audit.h | 81
-rw-r--r--  kernel/audit_fsnotify.c | 2
-rw-r--r--  kernel/audit_tree.c | 19
-rw-r--r--  kernel/audit_watch.c | 2
-rw-r--r--  kernel/auditfilter.c | 6
-rw-r--r--  kernel/auditsc.c | 320
-rw-r--r--  kernel/bpf/arraymap.c | 23
-rw-r--r--  kernel/bpf/btf.c | 146
-rw-r--r--  kernel/bpf/cgroup.c | 3
-rw-r--r--  kernel/bpf/core.c | 317
-rw-r--r--  kernel/bpf/cpumap.c | 13
-rw-r--r--  kernel/bpf/disasm.c | 34
-rw-r--r--  kernel/bpf/hashtab.c | 63
-rw-r--r--  kernel/bpf/helpers.c | 96
-rw-r--r--  kernel/bpf/inode.c | 32
-rw-r--r--  kernel/bpf/local_storage.c | 16
-rw-r--r--  kernel/bpf/map_in_map.c | 6
-rw-r--r--  kernel/bpf/offload.c | 45
-rw-r--r--  kernel/bpf/syscall.c | 108
-rw-r--r--  kernel/bpf/verifier.c | 1108
-rw-r--r--  kernel/capability.c | 45
-rw-r--r--  kernel/cgroup/cgroup-internal.h | 51
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 428
-rw-r--r--  kernel/cgroup/cgroup.c | 269
-rw-r--r--  kernel/cgroup/cpuset.c | 69
-rw-r--r--  kernel/cgroup/pids.c | 4
-rw-r--r--  kernel/cgroup/rdma.c | 5
-rw-r--r--  kernel/cgroup/rstat.c | 10
-rw-r--r--  kernel/compat.c | 64
-rw-r--r--  kernel/configs.c | 42
-rw-r--r--  kernel/cpu.c | 29
-rw-r--r--  kernel/crash_core.c | 2
-rw-r--r--  kernel/cred.c | 13
-rw-r--r--  kernel/dma/Kconfig | 124
-rw-r--r--  kernel/dma/Makefile | 2
-rw-r--r--  kernel/dma/coherent.c | 50
-rw-r--r--  kernel/dma/debug.c | 110
-rw-r--r--  kernel/dma/direct.c | 28
-rw-r--r--  kernel/dma/mapping.c | 25
-rw-r--r--  kernel/dma/swiotlb.c | 91
-rw-r--r--  kernel/events/callchain.c | 3
-rw-r--r--  kernel/events/core.c | 410
-rw-r--r--  kernel/events/hw_breakpoint.c | 15
-rw-r--r--  kernel/events/internal.h | 5
-rw-r--r--  kernel/events/ring_buffer.c | 49
-rw-r--r--  kernel/events/uprobes.c | 23
-rw-r--r--  kernel/exit.c | 1
-rw-r--r--  kernel/fork.c | 27
-rw-r--r--  kernel/futex.c | 32
-rw-r--r--  kernel/gcov/gcc_3_4.c | 6
-rw-r--r--  kernel/hung_task.c | 3
-rw-r--r--  kernel/irq/affinity.c | 121
-rw-r--r--  kernel/irq/chip.c | 82
-rw-r--r--  kernel/irq/debugfs.c | 8
-rw-r--r--  kernel/irq/devres.c | 2
-rw-r--r--  kernel/irq/handle.c | 2
-rw-r--r--  kernel/irq/internals.h | 10
-rw-r--r--  kernel/irq/irq_sim.c | 12
-rw-r--r--  kernel/irq/irqdesc.c | 43
-rw-r--r--  kernel/irq/irqdomain.c | 61
-rw-r--r--  kernel/irq/manage.c | 407
-rw-r--r--  kernel/kallsyms.c | 2
-rw-r--r--  kernel/kcov.c | 15
-rw-r--r--  kernel/kprobes.c | 21
-rw-r--r--  kernel/kthread.c | 54
-rw-r--r--  kernel/livepatch/core.c | 854
-rw-r--r--  kernel/livepatch/core.h | 11
-rw-r--r--  kernel/livepatch/patch.c | 57
-rw-r--r--  kernel/livepatch/patch.h | 5
-rw-r--r--  kernel/livepatch/transition.c | 124
-rw-r--r--  kernel/livepatch/transition.h | 1
-rw-r--r--  kernel/locking/lockdep.c | 977
-rw-r--r--  kernel/locking/lockdep_internals.h | 7
-rw-r--r--  kernel/locking/lockdep_proc.c | 12
-rw-r--r--  kernel/locking/locktorture.c | 21
-rw-r--r--  kernel/locking/qspinlock.c | 21
-rw-r--r--  kernel/locking/qspinlock_stat.h | 21
-rw-r--r--  kernel/locking/rwsem-xadd.c | 4
-rw-r--r--  kernel/module.c | 6
-rw-r--r--  kernel/panic.c | 10
-rw-r--r--  kernel/power/energy_model.c | 57
-rw-r--r--  kernel/power/qos.c | 8
-rw-r--r--  kernel/power/snapshot.c | 20
-rw-r--r--  kernel/printk/printk.c | 100
-rw-r--r--  kernel/ptrace.c | 15
-rw-r--r--  kernel/rcu/Kconfig | 30
-rw-r--r--  kernel/rcu/rcu.h | 21
-rw-r--r--  kernel/rcu/rcu_segcblist.c | 17
-rw-r--r--  kernel/rcu/rcu_segcblist.h | 17
-rw-r--r--  kernel/rcu/rcuperf.c | 27
-rw-r--r--  kernel/rcu/rcutorture.c | 59
-rw-r--r--  kernel/rcu/srcutiny.c | 17
-rw-r--r--  kernel/rcu/srcutree.c | 72
-rw-r--r--  kernel/rcu/sync.c | 15
-rw-r--r--  kernel/rcu/tiny.c | 19
-rw-r--r--  kernel/rcu/tree.c | 269
-rw-r--r--  kernel/rcu/tree.h | 53
-rw-r--r--  kernel/rcu/tree_exp.h | 201
-rw-r--r--  kernel/rcu/tree_plugin.h | 238
-rw-r--r--  kernel/rcu/update.c | 19
-rw-r--r--  kernel/relay.c | 1
-rw-r--r--  kernel/resource.c | 22
-rw-r--r--  kernel/sched/core.c | 117
-rw-r--r--  kernel/sched/cpufreq.c | 4
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 61
-rw-r--r--  kernel/sched/deadline.c | 6
-rw-r--r--  kernel/sched/debug.c | 4
-rw-r--r--  kernel/sched/fair.c | 547
-rw-r--r--  kernel/sched/isolation.c | 2
-rw-r--r--  kernel/sched/pelt.c | 45
-rw-r--r--  kernel/sched/pelt.h | 114
-rw-r--r--  kernel/sched/rt.c | 6
-rw-r--r--  kernel/sched/sched.h | 56
-rw-r--r--  kernel/sched/topology.c | 37
-rw-r--r--  kernel/seccomp.c | 8
-rw-r--r--  kernel/signal.c | 130
-rw-r--r--  kernel/softirq.c | 3
-rw-r--r--  kernel/sys.c | 11
-rw-r--r--  kernel/sys_ni.c | 29
-rw-r--r--  kernel/sysctl.c | 144
-rw-r--r--  kernel/time/Kconfig | 29
-rw-r--r--  kernel/time/alarmtimer.c | 2
-rw-r--r--  kernel/time/hrtimer.c | 4
-rw-r--r--  kernel/time/jiffies.c | 2
-rw-r--r--  kernel/time/ntp.c | 18
-rw-r--r--  kernel/time/ntp_internal.h | 2
-rw-r--r--  kernel/time/posix-clock.c | 2
-rw-r--r--  kernel/time/posix-cpu-timers.c | 13
-rw-r--r--  kernel/time/posix-stubs.c | 25
-rw-r--r--  kernel/time/posix-timers.c | 72
-rw-r--r--  kernel/time/posix-timers.h | 2
-rw-r--r--  kernel/time/tick-broadcast.c | 1
-rw-r--r--  kernel/time/time.c | 92
-rw-r--r--  kernel/time/timekeeping.c | 4
-rw-r--r--  kernel/time/timekeeping_debug.c | 11
-rw-r--r--  kernel/time/timer.c | 4
-rw-r--r--  kernel/torture.c | 25
-rw-r--r--  kernel/trace/Kconfig | 1
-rw-r--r--  kernel/trace/blktrace.c | 1
-rw-r--r--  kernel/trace/bpf_trace.c | 3
-rw-r--r--  kernel/trace/ftrace.c | 42
-rw-r--r--  kernel/trace/ring_buffer.c | 19
-rw-r--r--  kernel/trace/trace.c | 231
-rw-r--r--  kernel/trace/trace.h | 66
-rw-r--r--  kernel/trace/trace_dynevent.c | 2
-rw-r--r--  kernel/trace/trace_entries.h | 41
-rw-r--r--  kernel/trace/trace_event_perf.c | 16
-rw-r--r--  kernel/trace/trace_events_filter.c | 12
-rw-r--r--  kernel/trace/trace_events_hist.c | 1058
-rw-r--r--  kernel/trace/trace_functions_graph.c | 30
-rw-r--r--  kernel/trace/trace_irqsoff.c | 11
-rw-r--r--  kernel/trace/trace_kdb.c | 6
-rw-r--r--  kernel/trace/trace_kprobe.c | 25
-rw-r--r--  kernel/trace/trace_preemptirq.c | 5
-rw-r--r--  kernel/trace/trace_probe.c | 21
-rw-r--r--  kernel/trace/trace_probe.h | 1
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 11
-rw-r--r--  kernel/trace/trace_syscalls.c | 9
-rw-r--r--  kernel/trace/trace_uprobe.c | 8
-rw-r--r--  kernel/watchdog.c | 17
-rw-r--r--  kernel/workqueue.c | 169
166 files changed, 8551 insertions, 3965 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index b3097bde4e9c..6e699100872f 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,7 +1,5 @@
#
# Generated files
#
-config_data.h
-config_data.gz
timeconst.h
hz.bc
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 84d882f3e299..fbba478ae522 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -242,6 +242,9 @@ config QUEUED_SPINLOCKS
def_bool y if ARCH_USE_QUEUED_SPINLOCKS
depends on SMP
+config BPF_ARCH_SPINLOCK
+ bool
+
config ARCH_USE_QUEUED_RWLOCKS
bool
diff --git a/kernel/Makefile b/kernel/Makefile
index 6aa7543bcdb2..6c57e78817da 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -116,17 +116,8 @@ obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
KASAN_SANITIZE_stackleak.o := n
KCOV_INSTRUMENT_stackleak.o := n
-$(obj)/configs.o: $(obj)/config_data.h
+$(obj)/configs.o: $(obj)/config_data.gz
targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
-
-filechk_ikconfiggz = \
- echo "static const char kernel_config_data[] __used = MAGIC_START"; \
- cat $< | scripts/bin2c; \
- echo "MAGIC_END;"
-
-targets += config_data.h
-$(obj)/config_data.h: $(obj)/config_data.gz FORCE
- $(call filechk,ikconfiggz)
diff --git a/kernel/async.c b/kernel/async.c
index a893d6170944..f6bd0d9885e1 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -149,7 +149,25 @@ static void async_run_entry_fn(struct work_struct *work)
wake_up(&async_done);
}
-static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
+/**
+ * async_schedule_node_domain - NUMA specific version of async_schedule_domain
+ * @func: function to execute asynchronously
+ * @data: data pointer to pass to the function
+ * @node: NUMA node that we want to schedule this on or close to
+ * @domain: the domain
+ *
+ * Returns an async_cookie_t that may be used for checkpointing later.
+ * @domain may be used in the async_synchronize_*_domain() functions to
+ * wait within a certain synchronization domain rather than globally.
+ *
+ * Note: This function may be called from atomic or non-atomic contexts.
+ *
+ * The node requested will be honored on a best effort basis. If the node
+ * has no CPUs associated with it then the work is distributed among all
+ * available CPUs.
+ */
+async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
+ int node, struct async_domain *domain)
{
struct async_entry *entry;
unsigned long flags;
@@ -195,43 +213,30 @@ static async_cookie_t __async_schedule(async_func_t func, void *data, struct asy
current->flags |= PF_USED_ASYNC;
/* schedule for execution */
- queue_work(system_unbound_wq, &entry->work);
+ queue_work_node(node, system_unbound_wq, &entry->work);
return newcookie;
}
+EXPORT_SYMBOL_GPL(async_schedule_node_domain);
/**
- * async_schedule - schedule a function for asynchronous execution
+ * async_schedule_node - NUMA specific version of async_schedule
* @func: function to execute asynchronously
* @data: data pointer to pass to the function
+ * @node: NUMA node that we want to schedule this on or close to
*
* Returns an async_cookie_t that may be used for checkpointing later.
* Note: This function may be called from atomic or non-atomic contexts.
- */
-async_cookie_t async_schedule(async_func_t func, void *data)
-{
- return __async_schedule(func, data, &async_dfl_domain);
-}
-EXPORT_SYMBOL_GPL(async_schedule);
-
-/**
- * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
- * @func: function to execute asynchronously
- * @data: data pointer to pass to the function
- * @domain: the domain
*
- * Returns an async_cookie_t that may be used for checkpointing later.
- * @domain may be used in the async_synchronize_*_domain() functions to
- * wait within a certain synchronization domain rather than globally. A
- * synchronization domain is specified via @domain. Note: This function
- * may be called from atomic or non-atomic contexts.
+ * The node requested will be honored on a best effort basis. If the node
+ * has no CPUs associated with it then the work is distributed among all
+ * available CPUs.
*/
-async_cookie_t async_schedule_domain(async_func_t func, void *data,
- struct async_domain *domain)
+async_cookie_t async_schedule_node(async_func_t func, void *data, int node)
{
- return __async_schedule(func, data, domain);
+ return async_schedule_node_domain(func, data, node, &async_dfl_domain);
}
-EXPORT_SYMBOL_GPL(async_schedule_domain);
+EXPORT_SYMBOL_GPL(async_schedule_node);
/**
* async_synchronize_full - synchronize all asynchronous function calls
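For context (not part of this patch), a minimal sketch of how a driver might use the NUMA-aware API added above; the driver, device and callback names are hypothetical and error handling is omitted:

/* Hypothetical driver probe path: schedule slow, independent init work
 * close to the device's NUMA node via async_schedule_node().
 */
#include <linux/async.h>
#include <linux/device.h>

static void my_dev_async_init(void *data, async_cookie_t cookie)
{
	struct device *dev = data;

	/* slow initialization that should not block probe */
	dev_info(dev, "async init done (cookie %llu)\n",
		 (unsigned long long)cookie);
}

static int my_dev_probe(struct device *dev)
{
	/* the node hint is best effort, as documented above */
	async_schedule_node(my_dev_async_init, dev, dev_to_node(dev));
	return 0;
}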
diff --git a/kernel/audit.c b/kernel/audit.c
index 632d36059556..c89ea48c70a6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -396,10 +396,10 @@ static int audit_log_config_change(char *function_name, u32 new, u32 old,
struct audit_buffer *ab;
int rc = 0;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return rc;
- audit_log_format(ab, "%s=%u old=%u ", function_name, new, old);
+ audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old);
audit_log_session_info(ab);
rc = audit_log_task_context(ab);
if (rc)
@@ -1053,7 +1053,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
return err;
}
-static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
+static void audit_log_common_recv_msg(struct audit_context *context,
+ struct audit_buffer **ab, u16 msg_type)
{
uid_t uid = from_kuid(&init_user_ns, current_uid());
pid_t pid = task_tgid_nr(current);
@@ -1063,7 +1064,7 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
return;
}
- *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+ *ab = audit_log_start(context, GFP_KERNEL, msg_type);
if (unlikely(!*ab))
return;
audit_log_format(*ab, "pid=%d uid=%u ", pid, uid);
@@ -1071,6 +1072,12 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
audit_log_task_context(*ab);
}
+static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
+ u16 msg_type)
+{
+ audit_log_common_recv_msg(NULL, ab, msg_type);
+}
+
int is_audit_feature_set(int i)
{
return af.features & AUDIT_FEATURE_TO_MASK(i);
@@ -1338,7 +1345,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err)
break;
}
- audit_log_common_recv_msg(&ab, msg_type);
+ audit_log_user_recv_msg(&ab, msg_type);
if (msg_type != AUDIT_USER_TTY)
audit_log_format(ab, " msg='%.*s'",
AUDIT_MESSAGE_TEXT_MAX,
@@ -1361,8 +1368,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled);
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " op=%s audit_enabled=%d res=0",
+ msg_type == AUDIT_ADD_RULE ?
+ "add_rule" : "remove_rule",
+ audit_enabled);
audit_log_end(ab);
return -EPERM;
}
@@ -1373,7 +1384,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
break;
case AUDIT_TRIM:
audit_trim_trees();
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=trim res=1");
audit_log_end(ab);
break;
@@ -1403,8 +1415,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* OK, here comes... */
err = audit_tag_tree(old, new);
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
-
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=make_equiv old=");
audit_log_untrustedstring(ab, old);
audit_log_format(ab, " new=");
@@ -1471,7 +1483,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
old.enabled = t & AUDIT_TTY_ENABLE;
old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
" old-log_passwd=%d new-log_passwd=%d res=%d",
old.enabled, s.enabled, old.log_passwd,
@@ -2054,153 +2067,6 @@ void audit_log_key(struct audit_buffer *ab, char *key)
audit_log_format(ab, "(null)");
}
-void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
-{
- int i;
-
- if (cap_isclear(*cap)) {
- audit_log_format(ab, " %s=0", prefix);
- return;
- }
- audit_log_format(ab, " %s=", prefix);
- CAP_FOR_EACH_U32(i)
- audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
-}
-
-static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
-{
- audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
- audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
- audit_log_format(ab, " cap_fe=%d cap_fver=%x",
- name->fcap.fE, name->fcap_ver);
-}
-
-static inline int audit_copy_fcaps(struct audit_names *name,
- const struct dentry *dentry)
-{
- struct cpu_vfs_cap_data caps;
- int rc;
-
- if (!dentry)
- return 0;
-
- rc = get_vfs_caps_from_disk(dentry, &caps);
- if (rc)
- return rc;
-
- name->fcap.permitted = caps.permitted;
- name->fcap.inheritable = caps.inheritable;
- name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
- name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
- VFS_CAP_REVISION_SHIFT;
-
- return 0;
-}
-
-/* Copy inode data into an audit_names. */
-void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
- struct inode *inode)
-{
- name->ino = inode->i_ino;
- name->dev = inode->i_sb->s_dev;
- name->mode = inode->i_mode;
- name->uid = inode->i_uid;
- name->gid = inode->i_gid;
- name->rdev = inode->i_rdev;
- security_inode_getsecid(inode, &name->osid);
- audit_copy_fcaps(name, dentry);
-}
-
-/**
- * audit_log_name - produce AUDIT_PATH record from struct audit_names
- * @context: audit_context for the task
- * @n: audit_names structure with reportable details
- * @path: optional path to report instead of audit_names->name
- * @record_num: record number to report when handling a list of names
- * @call_panic: optional pointer to int that will be updated if secid fails
- */
-void audit_log_name(struct audit_context *context, struct audit_names *n,
- const struct path *path, int record_num, int *call_panic)
-{
- struct audit_buffer *ab;
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
- if (!ab)
- return;
-
- audit_log_format(ab, "item=%d", record_num);
-
- if (path)
- audit_log_d_path(ab, " name=", path);
- else if (n->name) {
- switch (n->name_len) {
- case AUDIT_NAME_FULL:
- /* log the full path */
- audit_log_format(ab, " name=");
- audit_log_untrustedstring(ab, n->name->name);
- break;
- case 0:
- /* name was specified as a relative path and the
- * directory component is the cwd */
- audit_log_d_path(ab, " name=", &context->pwd);
- break;
- default:
- /* log the name's directory component */
- audit_log_format(ab, " name=");
- audit_log_n_untrustedstring(ab, n->name->name,
- n->name_len);
- }
- } else
- audit_log_format(ab, " name=(null)");
-
- if (n->ino != AUDIT_INO_UNSET)
- audit_log_format(ab, " inode=%lu"
- " dev=%02x:%02x mode=%#ho"
- " ouid=%u ogid=%u rdev=%02x:%02x",
- n->ino,
- MAJOR(n->dev),
- MINOR(n->dev),
- n->mode,
- from_kuid(&init_user_ns, n->uid),
- from_kgid(&init_user_ns, n->gid),
- MAJOR(n->rdev),
- MINOR(n->rdev));
- if (n->osid != 0) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(
- n->osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", n->osid);
- if (call_panic)
- *call_panic = 2;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
-
- /* log the audit_names record type */
- switch(n->type) {
- case AUDIT_TYPE_NORMAL:
- audit_log_format(ab, " nametype=NORMAL");
- break;
- case AUDIT_TYPE_PARENT:
- audit_log_format(ab, " nametype=PARENT");
- break;
- case AUDIT_TYPE_CHILD_DELETE:
- audit_log_format(ab, " nametype=DELETE");
- break;
- case AUDIT_TYPE_CHILD_CREATE:
- audit_log_format(ab, " nametype=CREATE");
- break;
- default:
- audit_log_format(ab, " nametype=UNKNOWN");
- break;
- }
-
- audit_log_fcaps(ab, n);
- audit_log_end(ab);
-}
-
int audit_log_task_context(struct audit_buffer *ab)
{
char *ctx = NULL;
@@ -2322,6 +2188,91 @@ void audit_log_link_denied(const char *operation)
audit_log_end(ab);
}
+/* global counter which is incremented every time something logs in */
+static atomic_t session_id = ATOMIC_INIT(0);
+
+static int audit_set_loginuid_perm(kuid_t loginuid)
+{
+ /* if we are unset, we don't need privs */
+ if (!audit_loginuid_set(current))
+ return 0;
+ /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
+ if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
+ return -EPERM;
+ /* it is set, you need permission */
+ if (!capable(CAP_AUDIT_CONTROL))
+ return -EPERM;
+ /* reject if this is not an unset and we don't allow that */
+ if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID)
+ && uid_valid(loginuid))
+ return -EPERM;
+ return 0;
+}
+
+static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
+ unsigned int oldsessionid,
+ unsigned int sessionid, int rc)
+{
+ struct audit_buffer *ab;
+ uid_t uid, oldloginuid, loginuid;
+ struct tty_struct *tty;
+
+ if (!audit_enabled)
+ return;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+ if (!ab)
+ return;
+
+ uid = from_kuid(&init_user_ns, task_uid(current));
+ oldloginuid = from_kuid(&init_user_ns, koldloginuid);
+ loginuid = from_kuid(&init_user_ns, kloginuid),
+ tty = audit_get_tty();
+
+ audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
+ oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
+ oldsessionid, sessionid, !rc);
+ audit_put_tty(tty);
+ audit_log_end(ab);
+}
+
+/**
+ * audit_set_loginuid - set current task's loginuid
+ * @loginuid: loginuid value
+ *
+ * Returns 0.
+ *
+ * Called (set) from fs/proc/base.c::proc_loginuid_write().
+ */
+int audit_set_loginuid(kuid_t loginuid)
+{
+ unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
+ kuid_t oldloginuid;
+ int rc;
+
+ oldloginuid = audit_get_loginuid(current);
+ oldsessionid = audit_get_sessionid(current);
+
+ rc = audit_set_loginuid_perm(loginuid);
+ if (rc)
+ goto out;
+
+ /* are we setting or clearing? */
+ if (uid_valid(loginuid)) {
+ sessionid = (unsigned int)atomic_inc_return(&session_id);
+ if (unlikely(sessionid == AUDIT_SID_UNSET))
+ sessionid = (unsigned int)atomic_inc_return(&session_id);
+ }
+
+ current->sessionid = sessionid;
+ current->loginuid = loginuid;
+out:
+ audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
+ return rc;
+}
+
/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
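For context (not part of this patch), a minimal userspace sketch of the path the kernel-doc above names: a login program sets the audit loginuid by writing it to /proc/self/loginuid, which reaches audit_set_loginuid() through proc_loginuid_write(). The helper name is illustrative, error handling is minimal, and changing an already-set loginuid requires CAP_AUDIT_CONTROL per the permission check above:

#include <stdio.h>
#include <sys/types.h>

/* write the uid as a decimal string; the kernel parses it and calls
 * audit_set_loginuid() for the current task
 */
static int set_loginuid(uid_t uid)
{
	FILE *f = fopen("/proc/self/loginuid", "w");

	if (!f)
		return -1;
	fprintf(f, "%u", (unsigned int)uid);
	return fclose(f);
}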
diff --git a/kernel/audit.h b/kernel/audit.h
index 91421679a168..958d5b8fc1b3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -69,6 +69,7 @@ struct audit_cap_data {
kernel_cap_t effective; /* effective set of process */
};
kernel_cap_t ambient;
+ kuid_t rootid;
};
/* When fs/namei.c:getname() is called, we store the pointer in name and bump
@@ -212,15 +213,6 @@ extern bool audit_ever_enabled;
extern void audit_log_session_info(struct audit_buffer *ab);
-extern void audit_copy_inode(struct audit_names *name,
- const struct dentry *dentry,
- struct inode *inode);
-extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
- kernel_cap_t *cap);
-extern void audit_log_name(struct audit_context *context,
- struct audit_names *n, const struct path *path,
- int record_num, int *call_panic);
-
extern int auditd_test_task(struct task_struct *task);
#define AUDIT_INODE_BUCKETS 32
@@ -267,25 +259,52 @@ extern void audit_log_d_path_exe(struct audit_buffer *ab,
extern struct tty_struct *audit_get_tty(void);
extern void audit_put_tty(struct tty_struct *tty);
-/* audit watch functions */
+/* audit watch/mark/tree functions */
#ifdef CONFIG_AUDITSYSCALL
+extern unsigned int audit_serial(void);
+extern int auditsc_get_stamp(struct audit_context *ctx,
+ struct timespec64 *t, unsigned int *serial);
+
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
-extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
+extern int audit_to_watch(struct audit_krule *krule, char *path, int len,
+ u32 op);
extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
extern void audit_remove_watch_rule(struct audit_krule *krule);
extern char *audit_watch_path(struct audit_watch *watch);
-extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
+extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino,
+ dev_t dev);
-extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len);
+extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule,
+ char *pathname, int len);
extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
extern void audit_remove_mark_rule(struct audit_krule *krule);
-extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev);
+extern int audit_mark_compare(struct audit_fsnotify_mark *mark,
+ unsigned long ino, dev_t dev);
extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old);
-extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark);
+extern int audit_exe_compare(struct task_struct *tsk,
+ struct audit_fsnotify_mark *mark);
+
+extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
+extern void audit_put_chunk(struct audit_chunk *chunk);
+extern bool audit_tree_match(struct audit_chunk *chunk,
+ struct audit_tree *tree);
+extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
+extern int audit_add_tree_rule(struct audit_krule *rule);
+extern int audit_remove_tree_rule(struct audit_krule *rule);
+extern void audit_trim_trees(void);
+extern int audit_tag_tree(char *old, char *new);
+extern const char *audit_tree_path(struct audit_tree *tree);
+extern void audit_put_tree(struct audit_tree *tree);
+extern void audit_kill_trees(struct audit_context *context);
-#else
+extern int audit_signal_info(int sig, struct task_struct *t);
+extern void audit_filter_inodes(struct task_struct *tsk,
+ struct audit_context *ctx);
+extern struct list_head *audit_killed_trees(void);
+#else /* CONFIG_AUDITSYSCALL */
+#define auditsc_get_stamp(c, t, s) 0
#define audit_put_watch(w) {}
#define audit_get_watch(w) {}
#define audit_to_watch(k, p, l, o) (-EINVAL)
@@ -301,21 +320,7 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
#define audit_mark_compare(m, i, d) 0
#define audit_exe_compare(t, m) (-EINVAL)
#define audit_dupe_exe(n, o) (-EINVAL)
-#endif /* CONFIG_AUDITSYSCALL */
-#ifdef CONFIG_AUDITSYSCALL
-extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
-extern void audit_put_chunk(struct audit_chunk *chunk);
-extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree);
-extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
-extern int audit_add_tree_rule(struct audit_krule *rule);
-extern int audit_remove_tree_rule(struct audit_krule *rule);
-extern void audit_trim_trees(void);
-extern int audit_tag_tree(char *old, char *new);
-extern const char *audit_tree_path(struct audit_tree *tree);
-extern void audit_put_tree(struct audit_tree *tree);
-extern void audit_kill_trees(struct list_head *list);
-#else
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
#define audit_make_tree(rule, str, op) -EINVAL
@@ -323,8 +328,11 @@ extern void audit_kill_trees(struct list_head *list);
#define audit_put_tree(tree) (void)0
#define audit_tag_tree(old, new) -EINVAL
#define audit_tree_path(rule) "" /* never called */
-#define audit_kill_trees(list) BUG()
-#endif
+#define audit_kill_trees(context) BUG()
+
+#define audit_signal_info(s, t) AUDIT_DISABLED
+#define audit_filter_inodes(t, c) AUDIT_DISABLED
+#endif /* CONFIG_AUDITSYSCALL */
extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
@@ -334,14 +342,5 @@ extern u32 audit_sig_sid;
extern int audit_filter(int msgtype, unsigned int listtype);
-#ifdef CONFIG_AUDITSYSCALL
-extern int audit_signal_info(int sig, struct task_struct *t);
-extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx);
-extern struct list_head *audit_killed_trees(void);
-#else
-#define audit_signal_info(s,t) AUDIT_DISABLED
-#define audit_filter_inodes(t,c) AUDIT_DISABLED
-#endif
-
extern void audit_ctl_lock(void);
extern void audit_ctl_unlock(void);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index cf4512a33675..37ae95cfb7f4 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -127,7 +127,7 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c
if (!audit_enabled)
return;
- ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
audit_log_session_info(ab);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index d4af4d97f847..abfb112f26aa 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -524,13 +524,14 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
return 0;
}
-static void audit_tree_log_remove_rule(struct audit_krule *rule)
+static void audit_tree_log_remove_rule(struct audit_context *context,
+ struct audit_krule *rule)
{
struct audit_buffer *ab;
if (!audit_enabled)
return;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
audit_log_format(ab, "op=remove_rule dir=");
@@ -540,7 +541,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule)
audit_log_end(ab);
}
-static void kill_rules(struct audit_tree *tree)
+static void kill_rules(struct audit_context *context, struct audit_tree *tree)
{
struct audit_krule *rule, *next;
struct audit_entry *entry;
@@ -551,7 +552,7 @@ static void kill_rules(struct audit_tree *tree)
list_del_init(&rule->rlist);
if (rule->tree) {
/* not a half-baked one */
- audit_tree_log_remove_rule(rule);
+ audit_tree_log_remove_rule(context, rule);
if (entry->rule.exe)
audit_remove_mark(entry->rule.exe);
rule->tree = NULL;
@@ -633,7 +634,7 @@ static void trim_marked(struct audit_tree *tree)
tree->goner = 1;
spin_unlock(&hash_lock);
mutex_lock(&audit_filter_mutex);
- kill_rules(tree);
+ kill_rules(audit_context(), tree);
list_del_init(&tree->list);
mutex_unlock(&audit_filter_mutex);
prune_one(tree);
@@ -973,8 +974,10 @@ static void audit_schedule_prune(void)
* ... and that one is done if evict_chunk() decides to delay until the end
* of syscall. Runs synchronously.
*/
-void audit_kill_trees(struct list_head *list)
+void audit_kill_trees(struct audit_context *context)
{
+ struct list_head *list = &context->killed_trees;
+
audit_ctl_lock();
mutex_lock(&audit_filter_mutex);
@@ -982,7 +985,7 @@ void audit_kill_trees(struct list_head *list)
struct audit_tree *victim;
victim = list_entry(list->next, struct audit_tree, list);
- kill_rules(victim);
+ kill_rules(context, victim);
list_del_init(&victim->list);
mutex_unlock(&audit_filter_mutex);
@@ -1017,7 +1020,7 @@ static void evict_chunk(struct audit_chunk *chunk)
list_del_init(&owner->same_root);
spin_unlock(&hash_lock);
if (!postponed) {
- kill_rules(owner);
+ kill_rules(audit_context(), owner);
list_move(&owner->list, &prune_list);
need_prune = 1;
} else {
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 20ef9ba134b0..e8d1adeb2223 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -242,7 +242,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
if (!audit_enabled)
return;
- ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
audit_log_session_info(ab);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index bf309f2592c4..63f8b3f26fab 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -670,7 +670,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->values[i] = AUDIT_UID_UNSET;
break;
}
- /* fallthrough if set */
+ /* fall through - if set */
default:
data->values[i] = f->val;
}
@@ -1091,7 +1091,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
if (!audit_enabled)
return;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
audit_log_session_info(ab);
@@ -1355,7 +1355,7 @@ int audit_filter(int msgtype, unsigned int listtype)
if (f->lsm_rule) {
security_task_getsecid(current, &sid);
result = security_audit_rule_match(sid,
- f->type, f->op, f->lsm_rule, NULL);
+ f->type, f->op, f->lsm_rule);
}
break;
case AUDIT_EXE:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 6593a5207fb0..d1eab1d4a930 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -631,9 +631,8 @@ static int audit_filter_rules(struct task_struct *tsk,
need_sid = 0;
}
result = security_audit_rule_match(sid, f->type,
- f->op,
- f->lsm_rule,
- ctx);
+ f->op,
+ f->lsm_rule);
}
break;
case AUDIT_OBJ_USER:
@@ -647,13 +646,17 @@ static int audit_filter_rules(struct task_struct *tsk,
/* Find files that match */
if (name) {
result = security_audit_rule_match(
- name->osid, f->type, f->op,
- f->lsm_rule, ctx);
+ name->osid,
+ f->type,
+ f->op,
+ f->lsm_rule);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
- if (security_audit_rule_match(n->osid, f->type,
- f->op, f->lsm_rule,
- ctx)) {
+ if (security_audit_rule_match(
+ n->osid,
+ f->type,
+ f->op,
+ f->lsm_rule)) {
++result;
break;
}
@@ -664,7 +667,7 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
if (security_audit_rule_match(ctx->ipc.osid,
f->type, f->op,
- f->lsm_rule, ctx))
+ f->lsm_rule))
++result;
}
break;
@@ -1136,6 +1139,32 @@ out:
kfree(buf_head);
}
+void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
+{
+ int i;
+
+ if (cap_isclear(*cap)) {
+ audit_log_format(ab, " %s=0", prefix);
+ return;
+ }
+ audit_log_format(ab, " %s=", prefix);
+ CAP_FOR_EACH_U32(i)
+ audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
+}
+
+static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
+{
+ if (name->fcap_ver == -1) {
+ audit_log_format(ab, " cap_fe=? cap_fver=? cap_fp=? cap_fi=?");
+ return;
+ }
+ audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
+ audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
+ audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d",
+ name->fcap.fE, name->fcap_ver,
+ from_kuid(&init_user_ns, name->fcap.rootid));
+}
+
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
@@ -1258,6 +1287,97 @@ static inline int audit_proctitle_rtrim(char *proctitle, int len)
return len;
}
+/*
+ * audit_log_name - produce AUDIT_PATH record from struct audit_names
+ * @context: audit_context for the task
+ * @n: audit_names structure with reportable details
+ * @path: optional path to report instead of audit_names->name
+ * @record_num: record number to report when handling a list of names
+ * @call_panic: optional pointer to int that will be updated if secid fails
+ */
+static void audit_log_name(struct audit_context *context, struct audit_names *n,
+ const struct path *path, int record_num, int *call_panic)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+ if (!ab)
+ return;
+
+ audit_log_format(ab, "item=%d", record_num);
+
+ if (path)
+ audit_log_d_path(ab, " name=", path);
+ else if (n->name) {
+ switch (n->name_len) {
+ case AUDIT_NAME_FULL:
+ /* log the full path */
+ audit_log_format(ab, " name=");
+ audit_log_untrustedstring(ab, n->name->name);
+ break;
+ case 0:
+ /* name was specified as a relative path and the
+ * directory component is the cwd
+ */
+ audit_log_d_path(ab, " name=", &context->pwd);
+ break;
+ default:
+ /* log the name's directory component */
+ audit_log_format(ab, " name=");
+ audit_log_n_untrustedstring(ab, n->name->name,
+ n->name_len);
+ }
+ } else
+ audit_log_format(ab, " name=(null)");
+
+ if (n->ino != AUDIT_INO_UNSET)
+ audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x",
+ n->ino,
+ MAJOR(n->dev),
+ MINOR(n->dev),
+ n->mode,
+ from_kuid(&init_user_ns, n->uid),
+ from_kgid(&init_user_ns, n->gid),
+ MAJOR(n->rdev),
+ MINOR(n->rdev));
+ if (n->osid != 0) {
+ char *ctx = NULL;
+ u32 len;
+
+ if (security_secid_to_secctx(
+ n->osid, &ctx, &len)) {
+ audit_log_format(ab, " osid=%u", n->osid);
+ if (call_panic)
+ *call_panic = 2;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+
+ /* log the audit_names record type */
+ switch (n->type) {
+ case AUDIT_TYPE_NORMAL:
+ audit_log_format(ab, " nametype=NORMAL");
+ break;
+ case AUDIT_TYPE_PARENT:
+ audit_log_format(ab, " nametype=PARENT");
+ break;
+ case AUDIT_TYPE_CHILD_DELETE:
+ audit_log_format(ab, " nametype=DELETE");
+ break;
+ case AUDIT_TYPE_CHILD_CREATE:
+ audit_log_format(ab, " nametype=CREATE");
+ break;
+ default:
+ audit_log_format(ab, " nametype=UNKNOWN");
+ break;
+ }
+
+ audit_log_fcaps(ab, n);
+ audit_log_end(ab);
+}
+
static void audit_log_proctitle(void)
{
int res;
@@ -1358,6 +1478,9 @@ static void audit_log_exit(void)
audit_log_cap(ab, "pi", &axs->new_pcap.inheritable);
audit_log_cap(ab, "pe", &axs->new_pcap.effective);
audit_log_cap(ab, "pa", &axs->new_pcap.ambient);
+ audit_log_format(ab, " frootid=%d",
+ from_kuid(&init_user_ns,
+ axs->fcap.rootid));
break; }
}
@@ -1444,6 +1567,9 @@ void __audit_free(struct task_struct *tsk)
if (!context)
return;
+ if (!list_empty(&context->killed_trees))
+ audit_kill_trees(context);
+
/* We are called either by do_exit() or the fork() error handling code;
* in the former case tsk == current and in the latter tsk is a
 * random task_struct that doesn't have any meaningful data we
@@ -1460,9 +1586,6 @@ void __audit_free(struct task_struct *tsk)
audit_log_exit();
}
- if (!list_empty(&context->killed_trees))
- audit_kill_trees(&context->killed_trees);
-
audit_set_context(tsk, NULL);
audit_free_context(context);
}
@@ -1537,6 +1660,9 @@ void __audit_syscall_exit(int success, long return_code)
if (!context)
return;
+ if (!list_empty(&context->killed_trees))
+ audit_kill_trees(context);
+
if (!context->dummy && context->in_syscall) {
if (success)
context->return_valid = AUDITSC_SUCCESS;
@@ -1571,9 +1697,6 @@ void __audit_syscall_exit(int success, long return_code)
context->in_syscall = 0;
context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
- if (!list_empty(&context->killed_trees))
- audit_kill_trees(&context->killed_trees);
-
audit_free_names(context);
unroll_tree_refs(context, NULL, 0);
audit_free_aux(context);
@@ -1750,6 +1873,47 @@ void __audit_getname(struct filename *name)
get_fs_pwd(current->fs, &context->pwd);
}
+static inline int audit_copy_fcaps(struct audit_names *name,
+ const struct dentry *dentry)
+{
+ struct cpu_vfs_cap_data caps;
+ int rc;
+
+ if (!dentry)
+ return 0;
+
+ rc = get_vfs_caps_from_disk(dentry, &caps);
+ if (rc)
+ return rc;
+
+ name->fcap.permitted = caps.permitted;
+ name->fcap.inheritable = caps.inheritable;
+ name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+ name->fcap.rootid = caps.rootid;
+ name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
+ VFS_CAP_REVISION_SHIFT;
+
+ return 0;
+}
+
+/* Copy inode data into an audit_names. */
+void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
+ struct inode *inode, unsigned int flags)
+{
+ name->ino = inode->i_ino;
+ name->dev = inode->i_sb->s_dev;
+ name->mode = inode->i_mode;
+ name->uid = inode->i_uid;
+ name->gid = inode->i_gid;
+ name->rdev = inode->i_rdev;
+ security_inode_getsecid(inode, &name->osid);
+ if (flags & AUDIT_INODE_NOEVAL) {
+ name->fcap_ver = -1;
+ return;
+ }
+ audit_copy_fcaps(name, dentry);
+}
+
/**
* __audit_inode - store the inode and device from a lookup
* @name: name being audited
@@ -1763,10 +1927,31 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
struct inode *inode = d_backing_inode(dentry);
struct audit_names *n;
bool parent = flags & AUDIT_INODE_PARENT;
+ struct audit_entry *e;
+ struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
+ int i;
if (!context->in_syscall)
return;
+ rcu_read_lock();
+ if (!list_empty(list)) {
+ list_for_each_entry_rcu(e, list, list) {
+ for (i = 0; i < e->rule.field_count; i++) {
+ struct audit_field *f = &e->rule.fields[i];
+
+ if (f->type == AUDIT_FSTYPE
+ && audit_comparator(inode->i_sb->s_magic,
+ f->op, f->val)
+ && e->rule.action == AUDIT_NEVER) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ }
+ }
+ rcu_read_unlock();
+
if (!name)
goto out_alloc;
@@ -1832,7 +2017,7 @@ out:
n->type = AUDIT_TYPE_NORMAL;
}
handle_path(dentry);
- audit_copy_inode(n, dentry, inode);
+ audit_copy_inode(n, dentry, inode, flags & AUDIT_INODE_NOEVAL);
}
void __audit_file(const struct file *file)
@@ -1875,14 +2060,12 @@ void __audit_inode_child(struct inode *parent,
for (i = 0; i < e->rule.field_count; i++) {
struct audit_field *f = &e->rule.fields[i];
- if (f->type == AUDIT_FSTYPE) {
- if (audit_comparator(parent->i_sb->s_magic,
- f->op, f->val)) {
- if (e->rule.action == AUDIT_NEVER) {
- rcu_read_unlock();
- return;
- }
- }
+ if (f->type == AUDIT_FSTYPE
+ && audit_comparator(parent->i_sb->s_magic,
+ f->op, f->val)
+ && e->rule.action == AUDIT_NEVER) {
+ rcu_read_unlock();
+ return;
}
}
}
@@ -1933,7 +2116,7 @@ void __audit_inode_child(struct inode *parent,
n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
if (!n)
return;
- audit_copy_inode(n, NULL, parent);
+ audit_copy_inode(n, NULL, parent, 0);
}
if (!found_child) {
@@ -1952,7 +2135,7 @@ void __audit_inode_child(struct inode *parent,
}
if (inode)
- audit_copy_inode(found_child, dentry, inode);
+ audit_copy_inode(found_child, dentry, inode, 0);
else
found_child->ino = AUDIT_INO_UNSET;
}
@@ -1983,90 +2166,6 @@ int auditsc_get_stamp(struct audit_context *ctx,
return 1;
}
-/* global counter which is incremented every time something logs in */
-static atomic_t session_id = ATOMIC_INIT(0);
-
-static int audit_set_loginuid_perm(kuid_t loginuid)
-{
- /* if we are unset, we don't need privs */
- if (!audit_loginuid_set(current))
- return 0;
- /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
- if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
- return -EPERM;
- /* it is set, you need permission */
- if (!capable(CAP_AUDIT_CONTROL))
- return -EPERM;
- /* reject if this is not an unset and we don't allow that */
- if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
- return -EPERM;
- return 0;
-}
-
-static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
- unsigned int oldsessionid, unsigned int sessionid,
- int rc)
-{
- struct audit_buffer *ab;
- uid_t uid, oldloginuid, loginuid;
- struct tty_struct *tty;
-
- if (!audit_enabled)
- return;
-
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
- if (!ab)
- return;
-
- uid = from_kuid(&init_user_ns, task_uid(current));
- oldloginuid = from_kuid(&init_user_ns, koldloginuid);
- loginuid = from_kuid(&init_user_ns, kloginuid),
- tty = audit_get_tty();
-
- audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
- audit_log_task_context(ab);
- audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
- oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
- oldsessionid, sessionid, !rc);
- audit_put_tty(tty);
- audit_log_end(ab);
-}
-
-/**
- * audit_set_loginuid - set current task's audit_context loginuid
- * @loginuid: loginuid value
- *
- * Returns 0.
- *
- * Called (set) from fs/proc/base.c::proc_loginuid_write().
- */
-int audit_set_loginuid(kuid_t loginuid)
-{
- unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
- kuid_t oldloginuid;
- int rc;
-
- oldloginuid = audit_get_loginuid(current);
- oldsessionid = audit_get_sessionid(current);
-
- rc = audit_set_loginuid_perm(loginuid);
- if (rc)
- goto out;
-
- /* are we setting or clearing? */
- if (uid_valid(loginuid)) {
- sessionid = (unsigned int)atomic_inc_return(&session_id);
- if (unlikely(sessionid == AUDIT_SID_UNSET))
- sessionid = (unsigned int)atomic_inc_return(&session_id);
- }
-
- current->sessionid = sessionid;
- current->loginuid = loginuid;
-out:
- audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
- return rc;
-}
-
/**
* __audit_mq_open - record audit data for a POSIX MQ open
* @oflag: open flag
@@ -2355,6 +2454,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->fcap.permitted = vcaps.permitted;
ax->fcap.inheritable = vcaps.inheritable;
ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+ ax->fcap.rootid = vcaps.rootid;
ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
ax->old_pcap.permitted = old->cap_permitted;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 25632a75d630..c72e0d8e1e65 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -253,8 +253,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
+ char *val;
- if (unlikely(map_flags > BPF_EXIST))
+ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
/* unknown flags */
return -EINVAL;
@@ -262,17 +263,25 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
/* all elements were pre-allocated, cannot insert a new one */
return -E2BIG;
- if (unlikely(map_flags == BPF_NOEXIST))
+ if (unlikely(map_flags & BPF_NOEXIST))
/* all elements already exist */
return -EEXIST;
- if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ if (unlikely((map_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)))
+ return -EINVAL;
+
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
value, map->value_size);
- else
- memcpy(array->value +
- array->elem_size * (index & array->index_mask),
- value, map->value_size);
+ } else {
+ val = array->value +
+ array->elem_size * (index & array->index_mask);
+ if (map_flags & BPF_F_LOCK)
+ copy_map_value_locked(map, val, value, false);
+ else
+ copy_map_value(map, val, value);
+ }
return 0;
}
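A hedged userspace sketch (not part of this patch) of how the BPF_F_LOCK update path handled above might be exercised through libbpf; the value layout, file descriptor and helper name are assumptions, and the map must carry BTF with a struct bpf_spin_lock embedded in its value:

#include <linux/bpf.h>
#include <bpf/bpf.h>

/* assumed value layout of the BTF-described array map */
struct map_value {
	struct bpf_spin_lock lock;
	__u64 counter;
};

static int locked_update(int map_fd, __u32 key, __u64 counter)
{
	struct map_value val = { .counter = counter };

	/* the kernel copies everything but the lock word, under the lock */
	return bpf_map_update_elem(map_fd, &key, &val, BPF_F_LOCK);
}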
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index c57bd10340ed..bd3921b1514b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -157,7 +157,7 @@
*
*/
-#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE)
+#define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2)
#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1)
#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK)
#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3)
@@ -355,6 +355,11 @@ static bool btf_type_is_struct(const struct btf_type *t)
return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
}
+static bool __btf_type_is_struct(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
+}
+
static bool btf_type_is_array(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
@@ -525,7 +530,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
/*
* Regular int is not a bit field and it must be either
- * u8/u16/u32/u64.
+ * u8/u16/u32/u64 or __int128.
*/
static bool btf_type_int_is_regular(const struct btf_type *t)
{
@@ -538,7 +543,8 @@ static bool btf_type_int_is_regular(const struct btf_type *t)
if (BITS_PER_BYTE_MASKED(nr_bits) ||
BTF_INT_OFFSET(int_data) ||
(nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
- nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+ nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) &&
+ nr_bytes != (2 * sizeof(u64)))) {
return false;
}
@@ -1063,9 +1069,9 @@ static int btf_int_check_member(struct btf_verifier_env *env,
nr_copy_bits = BTF_INT_BITS(int_data) +
BITS_PER_BYTE_MASKED(struct_bits_off);
- if (nr_copy_bits > BITS_PER_U64) {
+ if (nr_copy_bits > BITS_PER_U128) {
btf_verifier_log_member(env, struct_type, member,
- "nr_copy_bits exceeds 64");
+ "nr_copy_bits exceeds 128");
return -EINVAL;
}
@@ -1119,9 +1125,9 @@ static int btf_int_check_kflag_member(struct btf_verifier_env *env,
bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off);
- if (nr_copy_bits > BITS_PER_U64) {
+ if (nr_copy_bits > BITS_PER_U128) {
btf_verifier_log_member(env, struct_type, member,
- "nr_copy_bits exceeds 64");
+ "nr_copy_bits exceeds 128");
return -EINVAL;
}
@@ -1168,9 +1174,9 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,
nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
- if (nr_bits > BITS_PER_U64) {
+ if (nr_bits > BITS_PER_U128) {
btf_verifier_log_type(env, t, "nr_bits exceeds %zu",
- BITS_PER_U64);
+ BITS_PER_U128);
return -EINVAL;
}
@@ -1211,31 +1217,93 @@ static void btf_int_log(struct btf_verifier_env *env,
btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
}
+static void btf_int128_print(struct seq_file *m, void *data)
+{
+ /* data points to a __int128 number.
+ * Suppose
+ * int128_num = *(__int128 *)data;
+ * The formulas below show what upper_num and lower_num represent:
+ * upper_num = int128_num >> 64;
+ * lower_num = int128_num & 0xffffffffFFFFFFFFULL;
+ */
+ u64 upper_num, lower_num;
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ upper_num = *(u64 *)data;
+ lower_num = *(u64 *)(data + 8);
+#else
+ upper_num = *(u64 *)(data + 8);
+ lower_num = *(u64 *)data;
+#endif
+ if (upper_num == 0)
+ seq_printf(m, "0x%llx", lower_num);
+ else
+ seq_printf(m, "0x%llx%016llx", upper_num, lower_num);
+}
+
+static void btf_int128_shift(u64 *print_num, u16 left_shift_bits,
+ u16 right_shift_bits)
+{
+ u64 upper_num, lower_num;
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ upper_num = print_num[0];
+ lower_num = print_num[1];
+#else
+ upper_num = print_num[1];
+ lower_num = print_num[0];
+#endif
+
+ /* shake out un-needed bits by shift/or operations */
+ if (left_shift_bits >= 64) {
+ upper_num = lower_num << (left_shift_bits - 64);
+ lower_num = 0;
+ } else {
+ upper_num = (upper_num << left_shift_bits) |
+ (lower_num >> (64 - left_shift_bits));
+ lower_num = lower_num << left_shift_bits;
+ }
+
+ if (right_shift_bits >= 64) {
+ lower_num = upper_num >> (right_shift_bits - 64);
+ upper_num = 0;
+ } else {
+ lower_num = (lower_num >> right_shift_bits) |
+ (upper_num << (64 - right_shift_bits));
+ upper_num = upper_num >> right_shift_bits;
+ }
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ print_num[0] = upper_num;
+ print_num[1] = lower_num;
+#else
+ print_num[0] = lower_num;
+ print_num[1] = upper_num;
+#endif
+}
+
static void btf_bitfield_seq_show(void *data, u8 bits_offset,
u8 nr_bits, struct seq_file *m)
{
u16 left_shift_bits, right_shift_bits;
u8 nr_copy_bytes;
u8 nr_copy_bits;
- u64 print_num;
+ u64 print_num[2] = {};
nr_copy_bits = nr_bits + bits_offset;
nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
- print_num = 0;
- memcpy(&print_num, data, nr_copy_bytes);
+ memcpy(print_num, data, nr_copy_bytes);
#ifdef __BIG_ENDIAN_BITFIELD
left_shift_bits = bits_offset;
#else
- left_shift_bits = BITS_PER_U64 - nr_copy_bits;
+ left_shift_bits = BITS_PER_U128 - nr_copy_bits;
#endif
- right_shift_bits = BITS_PER_U64 - nr_bits;
+ right_shift_bits = BITS_PER_U128 - nr_bits;
- print_num <<= left_shift_bits;
- print_num >>= right_shift_bits;
-
- seq_printf(m, "0x%llx", print_num);
+ btf_int128_shift(print_num, left_shift_bits, right_shift_bits);
+ btf_int128_print(m, print_num);
}
@@ -1250,7 +1318,7 @@ static void btf_int_bits_seq_show(const struct btf *btf,
/*
* bits_offset is at most 7.
- * BTF_INT_OFFSET() cannot exceed 64 bits.
+ * BTF_INT_OFFSET() cannot exceed 128 bits.
*/
total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
@@ -1274,6 +1342,9 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
}
switch (nr_bits) {
+ case 128:
+ btf_int128_print(m, data);
+ break;
case 64:
if (sign)
seq_printf(m, "%lld", *(s64 *)data);
@@ -1980,6 +2051,43 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+ const struct btf_member *member;
+ u32 i, off = -ENOENT;
+
+ if (!__btf_type_is_struct(t))
+ return -EINVAL;
+
+ for_each_member(i, t, member) {
+ const struct btf_type *member_type = btf_type_by_id(btf,
+ member->type);
+ if (!__btf_type_is_struct(member_type))
+ continue;
+ if (member_type->size != sizeof(struct bpf_spin_lock))
+ continue;
+ if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
+ "bpf_spin_lock"))
+ continue;
+ if (off != -ENOENT)
+ /* only one 'struct bpf_spin_lock' is allowed */
+ return -E2BIG;
+ off = btf_member_bit_offset(t, member);
+ if (off % 8)
+ /* valid C code cannot generate such BTF */
+ return -EINVAL;
+ off /= 8;
+ if (off % __alignof__(struct bpf_spin_lock))
+ /* valid struct bpf_spin_lock will be 4 byte aligned */
+ return -EINVAL;
+ }
+ return off;
+}
+
static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct seq_file *m)
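For illustration only (not from this patch), a BPF C fragment showing the map-value layout btf_find_spin_lock() is meant to recognize, together with the bpf_spin_lock()/bpf_spin_unlock() helpers introduced in the same series; the map definition style, section name and program type are assumptions:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct val {
	struct bpf_spin_lock lock;	/* exactly one, naturally aligned */
	long cnt;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct val);
} counters SEC(".maps");

SEC("tc")
int bump(struct __sk_buff *skb)
{
	int key = 0;
	struct val *v;

	v = bpf_map_lookup_elem(&counters, &key);
	if (!v)
		return 0;
	bpf_spin_lock(&v->lock);
	v->cnt++;
	bpf_spin_unlock(&v->lock);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";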
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index d17d05570a3f..4e807973aa80 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -230,6 +230,7 @@ cleanup:
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to attach
* @type: Type of attach operation
+ * @flags: Option flags
*
* Must be called with cgroup_mutex held.
*/
@@ -363,7 +364,7 @@ cleanup:
* Must be called with cgroup_mutex held.
*/
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, u32 unused_flags)
+ enum bpf_attach_type type)
{
struct list_head *progs = &cgrp->bpf.progs[type];
enum bpf_cgroup_storage_type stype;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f908b9356025..ff09d32a8a1b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -78,7 +78,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
return NULL;
}
-struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog_aux *aux;
@@ -104,6 +104,32 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
return fp;
}
+
+struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
+ struct bpf_prog *prog;
+ int cpu;
+
+ prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
+ if (!prog)
+ return NULL;
+
+ prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
+ if (!prog->aux->stats) {
+ kfree(prog->aux);
+ vfree(prog);
+ return NULL;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct bpf_prog_stats *pstats;
+
+ pstats = per_cpu_ptr(prog->aux->stats, cpu);
+ u64_stats_init(&pstats->syncp);
+ }
+ return prog;
+}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);
int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
@@ -231,7 +257,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
void __bpf_prog_free(struct bpf_prog *fp)
{
- kfree(fp->aux);
+ if (fp->aux) {
+ free_percpu(fp->aux->stats);
+ kfree(fp->aux);
+ }
vfree(fp);
}
@@ -307,15 +336,16 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
return 0;
}
-static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
- u32 curr, const bool probe_pass)
+static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
+ s32 end_new, u32 curr, const bool probe_pass)
{
const s64 imm_min = S32_MIN, imm_max = S32_MAX;
+ s32 delta = end_new - end_old;
s64 imm = insn->imm;
- if (curr < pos && curr + imm + 1 > pos)
+ if (curr < pos && curr + imm + 1 >= end_old)
imm += delta;
- else if (curr > pos + delta && curr + imm + 1 <= pos + delta)
+ else if (curr >= end_new && curr + imm + 1 < end_new)
imm -= delta;
if (imm < imm_min || imm > imm_max)
return -ERANGE;
@@ -324,15 +354,16 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
return 0;
}
-static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
- u32 curr, const bool probe_pass)
+static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
+ s32 end_new, u32 curr, const bool probe_pass)
{
const s32 off_min = S16_MIN, off_max = S16_MAX;
+ s32 delta = end_new - end_old;
s32 off = insn->off;
- if (curr < pos && curr + off + 1 > pos)
+ if (curr < pos && curr + off + 1 >= end_old)
off += delta;
- else if (curr > pos + delta && curr + off + 1 <= pos + delta)
+ else if (curr >= end_new && curr + off + 1 < end_new)
off -= delta;
if (off < off_min || off > off_max)
return -ERANGE;
@@ -341,10 +372,10 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
return 0;
}
-static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
- const bool probe_pass)
+static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
+ s32 end_new, const bool probe_pass)
{
- u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0);
+ u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0);
struct bpf_insn *insn = prog->insnsi;
int ret = 0;
@@ -356,22 +387,23 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
* do any other adjustments. Therefore skip the patchlet.
*/
if (probe_pass && i == pos) {
- i += delta + 1;
- insn++;
+ i = end_new;
+ insn = prog->insnsi + end_old;
}
code = insn->code;
- if (BPF_CLASS(code) != BPF_JMP ||
+ if ((BPF_CLASS(code) != BPF_JMP &&
+ BPF_CLASS(code) != BPF_JMP32) ||
BPF_OP(code) == BPF_EXIT)
continue;
/* Adjust offset of jmps if we cross patch boundaries. */
if (BPF_OP(code) == BPF_CALL) {
if (insn->src_reg != BPF_PSEUDO_CALL)
continue;
- ret = bpf_adj_delta_to_imm(insn, pos, delta, i,
- probe_pass);
+ ret = bpf_adj_delta_to_imm(insn, pos, end_old,
+ end_new, i, probe_pass);
} else {
- ret = bpf_adj_delta_to_off(insn, pos, delta, i,
- probe_pass);
+ ret = bpf_adj_delta_to_off(insn, pos, end_old,
+ end_new, i, probe_pass);
}
if (ret)
break;
@@ -421,7 +453,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
* we afterwards may not fail anymore.
*/
if (insn_adj_cnt > cnt_max &&
- bpf_adj_branches(prog, off, insn_delta, true))
+ bpf_adj_branches(prog, off, off + 1, off + len, true))
return NULL;
/* Several new instructions need to be inserted. Make room
@@ -453,13 +485,25 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
* the ship has sailed to reverse to the original state. An
* overflow cannot happen at this point.
*/
- BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false));
+ BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false));
bpf_adj_linfo(prog_adj, off, insn_delta);
return prog_adj;
}
+int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
+{
+ /* Branch offsets can't overflow when the program is shrinking, so there
+ * is no need to call bpf_adj_branches(..., true) here.
+ */
+ memmove(prog->insnsi + off, prog->insnsi + off + cnt,
+ sizeof(struct bpf_insn) * (prog->len - off - cnt));
+ prog->len -= cnt;
+
+ return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false));
+}
+
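The end_old/end_new pair replaces the old single delta argument so the same adjustment helpers cover both growth (bpf_patch_insn_single()) and shrinkage (the new bpf_remove_insns()). A standalone user-space illustration of the rule, not kernel code, mirroring the conditions in bpf_adj_delta_to_off() above:

#include <stdio.h>

/* The patched region [pos, end_old) in the old program becomes
 * [pos, end_new) in the new one; delta = end_new - end_old.
 */
static int adjust_off(int off, int curr, int pos, int end_old, int end_new)
{
	int delta = end_new - end_old;

	if (curr < pos && curr + off + 1 >= end_old)
		off += delta;
	else if (curr >= end_new && curr + off + 1 < end_new)
		off -= delta;
	return off;
}

int main(void)
{
	/* Patch insn 3 into 4 insns (end_old = 4, end_new = 7): a jump at
	 * insn 1 that targeted insn 7 must now reach insn 10.
	 */
	printf("%d\n", adjust_off(5, 1, 3, 4, 7));	/* prints 8 */

	/* bpf_remove_insns(prog, 3, 2) (end_old = 5, end_new = 3): the same
	 * jump's target moves from insn 7 to insn 5.
	 */
	printf("%d\n", adjust_off(5, 1, 3, 5, 3));	/* prints 3 */
	return 0;
}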
void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
int i;
@@ -495,7 +539,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
*symbol_end = addr + hdr->pages * PAGE_SIZE;
}
-static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
{
const char *end = sym + KSYM_NAME_LEN;
const struct btf_type *type;
@@ -934,6 +978,27 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
break;
+ case BPF_JMP32 | BPF_JEQ | BPF_K:
+ case BPF_JMP32 | BPF_JNE | BPF_K:
+ case BPF_JMP32 | BPF_JGT | BPF_K:
+ case BPF_JMP32 | BPF_JLT | BPF_K:
+ case BPF_JMP32 | BPF_JGE | BPF_K:
+ case BPF_JMP32 | BPF_JLE | BPF_K:
+ case BPF_JMP32 | BPF_JSGT | BPF_K:
+ case BPF_JMP32 | BPF_JSLT | BPF_K:
+ case BPF_JMP32 | BPF_JSGE | BPF_K:
+ case BPF_JMP32 | BPF_JSLE | BPF_K:
+ case BPF_JMP32 | BPF_JSET | BPF_K:
+ /* Accommodate for extra offset in case of a backjump. */
+ off = from->off;
+ if (off < 0)
+ off -= 2;
+ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX,
+ off);
+ break;
+
case BPF_LD | BPF_IMM | BPF_DW:
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
@@ -1130,6 +1195,31 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_2(JMP, CALL), \
/* Exit instruction. */ \
INSN_2(JMP, EXIT), \
+ /* 32-bit Jump instructions. */ \
+ /* Register based. */ \
+ INSN_3(JMP32, JEQ, X), \
+ INSN_3(JMP32, JNE, X), \
+ INSN_3(JMP32, JGT, X), \
+ INSN_3(JMP32, JLT, X), \
+ INSN_3(JMP32, JGE, X), \
+ INSN_3(JMP32, JLE, X), \
+ INSN_3(JMP32, JSGT, X), \
+ INSN_3(JMP32, JSLT, X), \
+ INSN_3(JMP32, JSGE, X), \
+ INSN_3(JMP32, JSLE, X), \
+ INSN_3(JMP32, JSET, X), \
+ /* Immediate based. */ \
+ INSN_3(JMP32, JEQ, K), \
+ INSN_3(JMP32, JNE, K), \
+ INSN_3(JMP32, JGT, K), \
+ INSN_3(JMP32, JLT, K), \
+ INSN_3(JMP32, JGE, K), \
+ INSN_3(JMP32, JLE, K), \
+ INSN_3(JMP32, JSGT, K), \
+ INSN_3(JMP32, JSLT, K), \
+ INSN_3(JMP32, JSGE, K), \
+ INSN_3(JMP32, JSLE, K), \
+ INSN_3(JMP32, JSET, K), \
/* Jump instructions. */ \
/* Register based. */ \
INSN_3(JMP, JEQ, X), \
@@ -1202,8 +1292,9 @@ bool bpf_opcode_in_insntable(u8 code)
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/**
* __bpf_prog_run - run eBPF program on a given context
- * @ctx: is the data we are operating on
+ * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
* @insn: is the array of eBPF instructions
+ * @stack: is the eBPF storage stack
*
* Decode and execute eBPF instructions.
*/
@@ -1390,145 +1481,49 @@ select_insn:
out:
CONT;
}
- /* JMP */
JMP_JA:
insn += insn->off;
CONT;
- JMP_JEQ_X:
- if (DST == SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JEQ_K:
- if (DST == IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JNE_X:
- if (DST != SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JNE_K:
- if (DST != IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGT_X:
- if (DST > SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGT_K:
- if (DST > IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLT_X:
- if (DST < SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLT_K:
- if (DST < IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGE_X:
- if (DST >= SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGE_K:
- if (DST >= IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLE_X:
- if (DST <= SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLE_K:
- if (DST <= IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGT_X:
- if (((s64) DST) > ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGT_K:
- if (((s64) DST) > ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLT_X:
- if (((s64) DST) < ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLT_K:
- if (((s64) DST) < ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGE_X:
- if (((s64) DST) >= ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGE_K:
- if (((s64) DST) >= ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLE_X:
- if (((s64) DST) <= ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLE_K:
- if (((s64) DST) <= ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSET_X:
- if (DST & SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSET_K:
- if (DST & IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
JMP_EXIT:
return BPF_R0;
-
+ /* JMP */
+#define COND_JMP(SIGN, OPCODE, CMP_OP) \
+ JMP_##OPCODE##_X: \
+ if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT; \
+ JMP32_##OPCODE##_X: \
+ if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT; \
+ JMP_##OPCODE##_K: \
+ if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT; \
+ JMP32_##OPCODE##_K: \
+ if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT;
+ COND_JMP(u, JEQ, ==)
+ COND_JMP(u, JNE, !=)
+ COND_JMP(u, JGT, >)
+ COND_JMP(u, JLT, <)
+ COND_JMP(u, JGE, >=)
+ COND_JMP(u, JLE, <=)
+ COND_JMP(u, JSET, &)
+ COND_JMP(s, JSGT, >)
+ COND_JMP(s, JSLT, <)
+ COND_JMP(s, JSGE, >=)
+ COND_JMP(s, JSLE, <=)
+#undef COND_JMP
/* STX and ST and LDX*/
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
@@ -2036,6 +2031,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_map_push_elem_proto __weak;
const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_spin_lock_proto __weak;
+const struct bpf_func_proto bpf_spin_unlock_proto __weak;
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
@@ -2101,6 +2098,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
return -EFAULT;
}
+DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
+EXPORT_SYMBOL(bpf_stats_enabled_key);
+int sysctl_bpf_stats_enabled __read_mostly;
+
/* All definitions of tracepoints related to BPF. */
#define CREATE_TRACE_POINTS
#include <linux/bpf_trace.h>
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8974b3755670..3c18260403dd 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -162,10 +162,14 @@ static void cpu_map_kthread_stop(struct work_struct *work)
static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
struct xdp_frame *xdpf)
{
+ unsigned int hard_start_headroom;
unsigned int frame_size;
void *pkt_data_start;
struct sk_buff *skb;
+ /* Part of the headroom was reserved for the xdp_frame (xdpf) */
+ hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
+
/* build_skb need to place skb_shared_info after SKB end, and
* also want to know the memory "truesize". Thus, need to
* know the memory frame size backing xdp_buff.
@@ -183,15 +187,15 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* is not at a fixed memory location, with mixed length
* packets, which is bad for cache-line hotness.
*/
- frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) +
+ frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- pkt_data_start = xdpf->data - xdpf->headroom;
+ pkt_data_start = xdpf->data - hard_start_headroom;
skb = build_skb(pkt_data_start, frame_size);
if (!skb)
return NULL;
- skb_reserve(skb, xdpf->headroom);
+ skb_reserve(skb, hard_start_headroom);
__skb_put(skb, xdpf->len);
if (xdpf->metasize)
skb_metadata_set(skb, xdpf->metasize);
@@ -205,6 +209,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* - RX ring dev queue index (skb_record_rx_queue)
*/
+ /* Allow SKB to reuse area used by xdp_frame */
+ xdp_scrub_frame(xdpf);
+
return skb;
}
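For reference, the layout being accounted for, assuming (as convert_to_xdp_frame() arranges) that the struct xdp_frame sits at the very start of the original XDP headroom:

   pkt_data_start                                           xdpf->data
   |<- sizeof(struct xdp_frame) ->|<-- xdpf->headroom -->|<-- xdpf->len -->|
   |<---------- hard_start_headroom -------------------->|

build_skb() is handed pkt_data_start and the full truesize, skb_reserve(skb, hard_start_headroom) puts skb->data back at xdpf->data, and xdp_scrub_frame() clears the xdp_frame area so the skb may safely overwrite it.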
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index d6b76377cb6e..de73f55e42fd 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -67,7 +67,7 @@ const char *const bpf_class_string[8] = {
[BPF_STX] = "stx",
[BPF_ALU] = "alu",
[BPF_JMP] = "jmp",
- [BPF_RET] = "BUG",
+ [BPF_JMP32] = "jmp32",
[BPF_ALU64] = "alu64",
};
@@ -136,23 +136,22 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
else
print_bpf_end_insn(verbose, cbs->private_data, insn);
} else if (BPF_OP(insn->code) == BPF_NEG) {
- verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n",
- insn->code, insn->dst_reg,
- class == BPF_ALU ? "(u32) " : "",
+ verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n",
+ insn->code, class == BPF_ALU ? 'w' : 'r',
+ insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
+ verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n",
+ insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
+ class == BPF_ALU ? 'w' : 'r',
insn->src_reg);
} else {
- verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
+ verbose(cbs->private_data, "(%02x) %c%d %s %d\n",
+ insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
insn->imm);
}
} else if (class == BPF_STX) {
@@ -220,7 +219,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code);
return;
}
- } else if (class == BPF_JMP) {
+ } else if (class == BPF_JMP32 || class == BPF_JMP) {
u8 opcode = BPF_OP(insn->code);
if (opcode == BPF_CALL) {
@@ -244,13 +243,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
verbose(cbs->private_data, "(%02x) exit\n", insn->code);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n",
- insn->code, insn->dst_reg,
+ verbose(cbs->private_data,
+ "(%02x) if %c%d %s %c%d goto pc%+d\n",
+ insn->code, class == BPF_JMP32 ? 'w' : 'r',
+ insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ class == BPF_JMP32 ? 'w' : 'r',
insn->src_reg, insn->off);
} else {
- verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n",
- insn->code, insn->dst_reg,
+ verbose(cbs->private_data,
+ "(%02x) if %c%d %s 0x%x goto pc%+d\n",
+ insn->code, class == BPF_JMP32 ? 'w' : 'r',
+ insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
insn->imm, insn->off);
}
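For illustration, the new 0x06 class encodes the same conditional jumps, compared on the low 32 bits only. A sketch using the insn-construction macros from include/linux/filter.h (BPF_JMP32_IMM() is added alongside this series; treat the exact macro name as an assumption), plus the format the changed print_bpf_insn() produces:

struct bpf_insn prog[] = {
	/* return (u32)r1 > 0x10 ? 1 : 0 */
	BPF_JMP32_IMM(BPF_JGT, BPF_REG_1, 0x10, 2),	/* only low 32 bits of r1 */
	BPF_MOV64_IMM(BPF_REG_0, 0),
	BPF_EXIT_INSN(),
	BPF_MOV64_IMM(BPF_REG_0, 1),
	BPF_EXIT_INSN(),
};

/* The dumper above renders the first insn as
 *	(26) if w1 > 0x10 goto pc+2
 * while the 64-bit BPF_JMP equivalent keeps the familiar
 *	(25) if r1 > 0x10 goto pc+2
 */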
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index f9274114c88d..fed15cf94dca 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -718,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
BITS_PER_LONG == 64;
}
-static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
-{
- u32 size = htab->map.value_size;
-
- if (percpu || fd_htab_map_needs_adjust(htab))
- size = round_up(size, 8);
- return size;
-}
-
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
struct htab_elem *old_elem)
{
- u32 size = htab_size_value(htab, percpu);
+ u32 size = htab->map.value_size;
bool prealloc = htab_is_prealloc(htab);
struct htab_elem *l_new, **pl_new;
void __percpu *pptr;
@@ -770,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
+ check_and_init_map_lock(&htab->map,
+ l_new->key + round_up(key_size, 8));
}
memcpy(l_new->key, key, key_size);
if (percpu) {
+ size = round_up(size, 8);
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
@@ -791,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
if (!prealloc)
htab_elem_set_ptr(l_new, key_size, pptr);
- } else {
+ } else if (fd_htab_map_needs_adjust(htab)) {
+ size = round_up(size, 8);
memcpy(l_new->key + round_up(key_size, 8), value, size);
+ } else {
+ copy_map_value(&htab->map,
+ l_new->key + round_up(key_size, 8),
+ value);
}
l_new->hash = hash;
@@ -805,11 +804,11 @@ dec_count:
static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
u64 map_flags)
{
- if (l_old && map_flags == BPF_NOEXIST)
+ if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
/* elem already exists */
return -EEXIST;
- if (!l_old && map_flags == BPF_EXIST)
+ if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
/* elem doesn't exist, cannot update it */
return -ENOENT;
@@ -828,7 +827,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
u32 key_size, hash;
int ret;
- if (unlikely(map_flags > BPF_EXIST))
+ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
/* unknown flags */
return -EINVAL;
@@ -841,6 +840,28 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
b = __select_bucket(htab, hash);
head = &b->head;
+ if (unlikely(map_flags & BPF_F_LOCK)) {
+ if (unlikely(!map_value_has_spin_lock(map)))
+ return -EINVAL;
+ /* find an element without taking the bucket lock */
+ l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
+ htab->n_buckets);
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ return ret;
+ if (l_old) {
+ /* grab the element lock and update value in place */
+ copy_map_value_locked(map,
+ l_old->key + round_up(key_size, 8),
+ value, false);
+ return 0;
+ }
+ /* fall through, grab the bucket lock and look up again.
+ * There is a 99.9% chance that the element won't be found,
+ * but the second lookup under the lock still has to be done.
+ */
+ }
+
/* bpf_map_update_elem() can be called in_irq() */
raw_spin_lock_irqsave(&b->lock, flags);
@@ -850,6 +871,20 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
if (ret)
goto err;
+ if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
+ /* first lookup without the bucket lock didn't find the element,
+ * but second lookup with the bucket lock found it.
+ * This case is highly unlikely, but has to be dealt with:
+ * grab the element lock in addition to the bucket lock
+ * and update element in place
+ */
+ copy_map_value_locked(map,
+ l_old->key + round_up(key_size, 8),
+ value, false);
+ ret = 0;
+ goto err;
+ }
+
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
l_old);
if (IS_ERR(l_new)) {
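From user space this path is reached by passing BPF_F_LOCK to the usual update command. A minimal sketch, assuming a hash map whose BTF-described value embeds struct bpf_spin_lock (the uapi additions are not part of this diff) and the libbpf syscall wrappers:

#include <linux/bpf.h>		/* BPF_EXIST, BPF_F_LOCK, struct bpf_spin_lock */
#include <bpf/bpf.h>		/* bpf_map_update_elem() */

struct map_value {
	struct bpf_spin_lock lock;	/* must match the map's BTF value type */
	__u64 counter;
};

static int update_locked(int map_fd, __u32 key, __u64 counter)
{
	struct map_value val = { .counter = counter };

	/* With BPF_F_LOCK the kernel takes the element's embedded lock and
	 * copies everything except the lock field (copy_map_value_locked()
	 * above) instead of replacing the whole element; BPF_EXIST requires
	 * the element to already exist.
	 */
	return bpf_map_update_elem(map_fd, &key, &val, BPF_EXIST | BPF_F_LOCK);
}

The lookup side gains the same flag (BPF_MAP_LOOKUP_ELEM accepts a flags field, see the syscall.c hunks below), so a reader can take a consistent snapshot of the non-lock fields; libbpf grew a matching bpf_map_lookup_elem_flags() wrapper in the same series (not shown here).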
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a74972b07e74..a411fc17d265 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -221,6 +221,102 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
.arg2_type = ARG_CONST_SIZE,
};
+#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+ arch_spinlock_t *l = (void *)lock;
+ union {
+ __u32 val;
+ arch_spinlock_t lock;
+ } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };
+
+ compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
+ BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
+ BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
+ arch_spin_lock(l);
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+ arch_spinlock_t *l = (void *)lock;
+
+ arch_spin_unlock(l);
+}
+
+#else
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+ atomic_t *l = (void *)lock;
+
+ BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
+ do {
+ atomic_cond_read_relaxed(l, !VAL);
+ } while (atomic_xchg(l, 1));
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+ atomic_t *l = (void *)lock;
+
+ atomic_set_release(l, 0);
+}
+
+#endif
+
+static DEFINE_PER_CPU(unsigned long, irqsave_flags);
+
+notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __bpf_spin_lock(lock);
+ __this_cpu_write(irqsave_flags, flags);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_spin_lock_proto = {
+ .func = bpf_spin_lock,
+ .gpl_only = false,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_SPIN_LOCK,
+};
+
+notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+ unsigned long flags;
+
+ flags = __this_cpu_read(irqsave_flags);
+ __bpf_spin_unlock(lock);
+ local_irq_restore(flags);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_spin_unlock_proto = {
+ .func = bpf_spin_unlock,
+ .gpl_only = false,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_SPIN_LOCK,
+};
+
+void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
+ bool lock_src)
+{
+ struct bpf_spin_lock *lock;
+
+ if (lock_src)
+ lock = src + map->spin_lock_off;
+ else
+ lock = dst + map->spin_lock_off;
+ preempt_disable();
+ ____bpf_spin_lock(lock);
+ copy_map_value(map, dst, src);
+ ____bpf_spin_unlock(lock);
+ preempt_enable();
+}
+
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
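On the BPF program side the pair protects short critical sections on a map value. A minimal sketch in BPF C, using the BTF-defined map syntax of current libbpf (slightly newer than this series) so the verifier can locate the lock via BTF; the section name assumes a program type these helpers are exposed to, such as tc or cgroup_skb:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct val {
	struct bpf_spin_lock lock;
	__u64 cnt;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct val);
} counters SEC(".maps");

SEC("tc")
int count_pkts(struct __sk_buff *skb)
{
	__u32 key = 0;
	struct val *v;

	v = bpf_map_lookup_elem(&counters, &key);
	if (!v)
		return 0;

	bpf_spin_lock(&v->lock);	/* only one lock may be held at a time */
	v->cnt++;
	bpf_spin_unlock(&v->lock);	/* must be dropped before the program returns */
	return 0;
}

char _license[] SEC("license") = "GPL";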
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 2ada5e21dfa6..4a8f390a2b82 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -554,19 +554,6 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
}
EXPORT_SYMBOL(bpf_prog_get_type_path);
-static void bpf_evict_inode(struct inode *inode)
-{
- enum bpf_type type;
-
- truncate_inode_pages_final(&inode->i_data);
- clear_inode(inode);
-
- if (S_ISLNK(inode->i_mode))
- kfree(inode->i_link);
- if (!bpf_inode_type(inode, &type))
- bpf_any_put(inode->i_private, type);
-}
-
/*
* Display the mount options in /proc/mounts.
*/
@@ -579,11 +566,28 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
+static void bpf_destroy_inode_deferred(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ enum bpf_type type;
+
+ if (S_ISLNK(inode->i_mode))
+ kfree(inode->i_link);
+ if (!bpf_inode_type(inode, &type))
+ bpf_any_put(inode->i_private, type);
+ free_inode_nonrcu(inode);
+}
+
+static void bpf_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred);
+}
+
static const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.show_options = bpf_show_options,
- .evict_inode = bpf_evict_inode,
+ .destroy_inode = bpf_destroy_inode,
};
enum {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 07a34ef562a0..6b572e2de7fb 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -131,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
struct bpf_cgroup_storage *storage;
struct bpf_storage_buffer *new;
- if (flags != BPF_ANY && flags != BPF_EXIST)
+ if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST)))
+ return -EINVAL;
+
+ if (unlikely(flags & BPF_NOEXIST))
+ return -EINVAL;
+
+ if (unlikely((flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)))
return -EINVAL;
storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
@@ -139,6 +146,11 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
if (!storage)
return -ENOENT;
+ if (flags & BPF_F_LOCK) {
+ copy_map_value_locked(map, storage->buf->data, value, false);
+ return 0;
+ }
+
new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
map->value_size,
__GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
@@ -147,6 +159,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
return -ENOMEM;
memcpy(&new->data[0], value, map->value_size);
+ check_and_init_map_lock(map, new->data);
new = xchg(&storage->buf, new);
kfree_rcu(new, rcu);
@@ -483,6 +496,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
storage->buf = kmalloc_node(size, flags, map->numa_node);
if (!storage->buf)
goto enomem;
+ check_and_init_map_lock(map, storage->buf->data);
} else {
storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
if (!storage->percpu_buf)
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 52378d3e34b3..3dff41403583 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -37,6 +37,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
return ERR_PTR(-EINVAL);
}
+ if (map_value_has_spin_lock(inner_map)) {
+ fdput(f);
+ return ERR_PTR(-ENOTSUPP);
+ }
+
inner_map_meta_size = sizeof(*inner_map_meta);
/* In some cases verifier needs to access beyond just base map. */
if (inner_map->ops == &array_map_ops)
@@ -53,6 +58,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta->value_size = inner_map->value_size;
inner_map_meta->map_flags = inner_map->map_flags;
inner_map_meta->max_entries = inner_map->max_entries;
+ inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 54cf2b9c44a4..ba635209ae9a 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -35,6 +35,7 @@ static DECLARE_RWSEM(bpf_devs_lock);
struct bpf_offload_dev {
const struct bpf_prog_offload_ops *ops;
struct list_head netdevs;
+ void *priv;
};
struct bpf_offload_netdev {
@@ -173,6 +174,41 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
return ret;
}
+void
+bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off,
+ struct bpf_insn *insn)
+{
+ const struct bpf_prog_offload_ops *ops;
+ struct bpf_prog_offload *offload;
+ int ret = -EOPNOTSUPP;
+
+ down_read(&bpf_devs_lock);
+ offload = env->prog->aux->offload;
+ if (offload) {
+ ops = offload->offdev->ops;
+ if (!offload->opt_failed && ops->replace_insn)
+ ret = ops->replace_insn(env, off, insn);
+ offload->opt_failed |= ret;
+ }
+ up_read(&bpf_devs_lock);
+}
+
+void
+bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
+{
+ struct bpf_prog_offload *offload;
+ int ret = -EOPNOTSUPP;
+
+ down_read(&bpf_devs_lock);
+ offload = env->prog->aux->offload;
+ if (offload) {
+ if (!offload->opt_failed && offload->offdev->ops->remove_insns)
+ ret = offload->offdev->ops->remove_insns(env, off, cnt);
+ offload->opt_failed |= ret;
+ }
+ up_read(&bpf_devs_lock);
+}
+
static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
{
struct bpf_prog_offload *offload = prog->aux->offload;
@@ -634,7 +670,7 @@ unlock:
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
struct bpf_offload_dev *
-bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops)
+bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)
{
struct bpf_offload_dev *offdev;
int err;
@@ -653,6 +689,7 @@ bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops)
return ERR_PTR(-ENOMEM);
offdev->ops = ops;
+ offdev->priv = priv;
INIT_LIST_HEAD(&offdev->netdevs);
return offdev;
@@ -665,3 +702,9 @@ void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev)
kfree(offdev);
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy);
+
+void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev)
+{
+ return offdev->priv;
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_priv);
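The new priv cookie and the replace_insn/remove_insns callbacks are consumed from the driver side. A hypothetical driver sketch; my_dev and the other my_* names are made up, only the ops members and the exported bpf_offload_dev_*() calls come from this patch and the existing offload API:

struct my_dev {				/* stands in for the driver's private struct */
	struct net_device *netdev;
	struct bpf_offload_dev *bpf_offdev;
};

static int my_replace_insn(struct bpf_verifier_env *env, u32 off,
			   struct bpf_insn *insn)
{
	struct my_dev *dev = bpf_offload_dev_priv(env->prog->aux->offload->offdev);

	/* a real driver would record the rewrite so it can regenerate the
	 * device image later; here we only show how priv is recovered
	 */
	return dev ? 0 : -EINVAL;
}

static int my_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
{
	struct my_dev *dev = bpf_offload_dev_priv(env->prog->aux->offload->offdev);

	return dev ? 0 : -EINVAL;
}

static const struct bpf_prog_offload_ops my_bpf_ops = {
	/* existing callbacks (.insn_hook, .finalize, ...) elided */
	.replace_insn	= my_replace_insn,
	.remove_insns	= my_remove_insns,
};

static int my_bpf_init(struct my_dev *dev)
{
	dev->bpf_offdev = bpf_offload_dev_create(&my_bpf_ops, dev);
	if (IS_ERR(dev->bpf_offdev))
		return PTR_ERR(dev->bpf_offdev);
	return bpf_offload_dev_netdev_register(dev->bpf_offdev, dev->netdev);
}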
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 84470d1480aa..afca36f53c49 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -136,21 +136,29 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
void *bpf_map_area_alloc(size_t size, int numa_node)
{
- /* We definitely need __GFP_NORETRY, so OOM killer doesn't
- * trigger under memory pressure as we really just want to
- * fail instead.
+ /* We really just want to fail instead of triggering the OOM killer
+ * under memory pressure, therefore we pass __GFP_NORETRY to kmalloc,
+ * which is used for lower order allocation requests.
+ *
+ * It has been observed that higher order allocation requests done by
+ * vmalloc with __GFP_NORETRY set might fail because it never tries
+ * to reclaim memory from the page cache, so we use
+ * __GFP_RETRY_MAYFAIL there to avoid such situations.
*/
- const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
+
+ const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
void *area;
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- area = kmalloc_node(size, GFP_USER | flags, numa_node);
+ area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
+ numa_node);
if (area != NULL)
return area;
}
- return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
- __builtin_return_address(0));
+ return __vmalloc_node_flags_caller(size, numa_node,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL |
+ flags, __builtin_return_address(0));
}
void bpf_map_area_free(void *area)
@@ -463,7 +471,7 @@ int map_check_no_btf(const struct bpf_map *map,
return -ENOTSUPP;
}
-static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
+static int map_check_btf(struct bpf_map *map, const struct btf *btf,
u32 btf_key_id, u32 btf_value_id)
{
const struct btf_type *key_type, *value_type;
@@ -478,6 +486,22 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
if (!value_type || value_size != map->value_size)
return -EINVAL;
+ map->spin_lock_off = btf_find_spin_lock(btf, value_type);
+
+ if (map_value_has_spin_lock(map)) {
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
+ return -ENOTSUPP;
+ if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
+ map->value_size) {
+ WARN_ONCE(1,
+ "verifier bug spin_lock_off %d value_size %d\n",
+ map->spin_lock_off, map->value_size);
+ return -EFAULT;
+ }
+ }
+
if (map->ops->map_check_btf)
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
@@ -542,6 +566,8 @@ static int map_create(union bpf_attr *attr)
map->btf = btf;
map->btf_key_type_id = attr->btf_key_type_id;
map->btf_value_type_id = attr->btf_value_type_id;
+ } else {
+ map->spin_lock_off = -EINVAL;
}
err = security_bpf_map_alloc(map);
@@ -664,7 +690,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
static int map_lookup_elem(union bpf_attr *attr)
{
@@ -680,6 +706,9 @@ static int map_lookup_elem(union bpf_attr *attr)
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
return -EINVAL;
+ if (attr->flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
@@ -690,6 +719,12 @@ static int map_lookup_elem(union bpf_attr *attr)
goto err_put;
}
+ if ((attr->flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -745,7 +780,13 @@ static int map_lookup_elem(union bpf_attr *attr)
err = -ENOENT;
} else {
err = 0;
- memcpy(value, ptr, value_size);
+ if (attr->flags & BPF_F_LOCK)
+ /* lock 'ptr' and copy everything but lock */
+ copy_map_value_locked(map, value, ptr, true);
+ else
+ copy_map_value(map, value, ptr);
+ /* mask lock, since value wasn't zero inited */
+ check_and_init_map_lock(map, value);
}
rcu_read_unlock();
}
@@ -808,6 +849,12 @@ static int map_update_elem(union bpf_attr *attr)
goto err_put;
}
+ if ((attr->flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -1219,6 +1266,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
bpf_prog_kallsyms_del_all(prog);
@@ -1244,24 +1292,54 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
return 0;
}
+static void bpf_prog_get_stats(const struct bpf_prog *prog,
+ struct bpf_prog_stats *stats)
+{
+ u64 nsecs = 0, cnt = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ const struct bpf_prog_stats *st;
+ unsigned int start;
+ u64 tnsecs, tcnt;
+
+ st = per_cpu_ptr(prog->aux->stats, cpu);
+ do {
+ start = u64_stats_fetch_begin_irq(&st->syncp);
+ tnsecs = st->nsecs;
+ tcnt = st->cnt;
+ } while (u64_stats_fetch_retry_irq(&st->syncp, start));
+ nsecs += tnsecs;
+ cnt += tcnt;
+ }
+ stats->nsecs = nsecs;
+ stats->cnt = cnt;
+}
+
#ifdef CONFIG_PROC_FS
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_prog *prog = filp->private_data;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
+ struct bpf_prog_stats stats;
+ bpf_prog_get_stats(prog, &stats);
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"prog_type:\t%u\n"
"prog_jited:\t%u\n"
"prog_tag:\t%s\n"
"memlock:\t%llu\n"
- "prog_id:\t%u\n",
+ "prog_id:\t%u\n"
+ "run_time_ns:\t%llu\n"
+ "run_cnt:\t%llu\n",
prog->type,
prog->jited,
prog_tag,
prog->pages * 1ULL << PAGE_SHIFT,
- prog->aux->id);
+ prog->aux->id,
+ stats.nsecs,
+ stats.cnt);
}
#endif
@@ -1562,6 +1640,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
}
bpf_prog_kallsyms_add(prog);
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
return err;
free_used_maps:
@@ -2083,6 +2162,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
struct bpf_prog_info info = {};
u32 info_len = attr->info.info_len;
+ struct bpf_prog_stats stats;
char __user *uinsns;
u32 ulen;
int err;
@@ -2122,6 +2202,10 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
if (err)
return err;
+ bpf_prog_get_stats(prog, &stats);
+ info.run_time_ns = stats.nsecs;
+ info.run_cnt = stats.cnt;
+
if (!capable(CAP_SYS_ADMIN)) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
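Once sysctl_bpf_stats_enabled (declared in core.c above; the sysctl wiring itself is not part of this hunk) is turned on, the counters surface both in the fdinfo fields above and through BPF_OBJ_GET_INFO_BY_FD. A user-space sketch, assuming the run_time_ns/run_cnt members this series adds to struct bpf_prog_info:

#include <stdio.h>
#include <bpf/bpf.h>		/* bpf_obj_get_info_by_fd() */

static void print_prog_stats(int prog_fd)
{
	struct bpf_prog_info info = {};
	__u32 len = sizeof(info);

	if (bpf_obj_get_info_by_fd(prog_fd, &info, &len))
		return;
	if (info.run_cnt)
		printf("prog %u: %llu runs, avg %llu ns\n", info.id,
		       (unsigned long long)info.run_cnt,
		       (unsigned long long)(info.run_time_ns / info.run_cnt));
}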
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5fcce2f4209d..6c5a41f7f338 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -212,7 +212,8 @@ struct bpf_call_arg_meta {
int access_size;
s64 msize_smax_value;
u64 msize_umax_value;
- int ptr_id;
+ int ref_obj_id;
+ int func_id;
};
static DEFINE_MUTEX(bpf_verifier_lock);
@@ -330,35 +331,38 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)
type == PTR_TO_PACKET_META;
}
-static bool reg_type_may_be_null(enum bpf_reg_type type)
-{
- return type == PTR_TO_MAP_VALUE_OR_NULL ||
- type == PTR_TO_SOCKET_OR_NULL;
-}
-
-static bool type_is_refcounted(enum bpf_reg_type type)
+static bool type_is_sk_pointer(enum bpf_reg_type type)
{
- return type == PTR_TO_SOCKET;
+ return type == PTR_TO_SOCKET ||
+ type == PTR_TO_SOCK_COMMON ||
+ type == PTR_TO_TCP_SOCK;
}
-static bool type_is_refcounted_or_null(enum bpf_reg_type type)
+static bool reg_type_may_be_null(enum bpf_reg_type type)
{
- return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL;
+ return type == PTR_TO_MAP_VALUE_OR_NULL ||
+ type == PTR_TO_SOCKET_OR_NULL ||
+ type == PTR_TO_SOCK_COMMON_OR_NULL ||
+ type == PTR_TO_TCP_SOCK_OR_NULL;
}
-static bool reg_is_refcounted(const struct bpf_reg_state *reg)
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
- return type_is_refcounted(reg->type);
+ return reg->type == PTR_TO_MAP_VALUE &&
+ map_value_has_spin_lock(reg->map_ptr);
}
-static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg)
+static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
{
- return type_is_refcounted_or_null(reg->type);
+ return type == PTR_TO_SOCKET ||
+ type == PTR_TO_SOCKET_OR_NULL ||
+ type == PTR_TO_TCP_SOCK ||
+ type == PTR_TO_TCP_SOCK_OR_NULL;
}
-static bool arg_type_is_refcounted(enum bpf_arg_type type)
+static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
{
- return type == ARG_PTR_TO_SOCKET;
+ return type == ARG_PTR_TO_SOCK_COMMON;
}
/* Determine whether the function releases some resources allocated by another
@@ -370,6 +374,18 @@ static bool is_release_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_sk_release;
}
+static bool is_acquire_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_sk_lookup_tcp ||
+ func_id == BPF_FUNC_sk_lookup_udp;
+}
+
+static bool is_ptr_cast_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_tcp_sock ||
+ func_id == BPF_FUNC_sk_fullsock;
+}
+
/* string representation of 'enum bpf_reg_type' */
static const char * const reg_type_str[] = {
[NOT_INIT] = "?",
@@ -385,6 +401,10 @@ static const char * const reg_type_str[] = {
[PTR_TO_FLOW_KEYS] = "flow_keys",
[PTR_TO_SOCKET] = "sock",
[PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
+ [PTR_TO_SOCK_COMMON] = "sock_common",
+ [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
+ [PTR_TO_TCP_SOCK] = "tcp_sock",
+ [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
};
static char slot_type_char[] = {
@@ -440,6 +460,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
verbose(env, ",call_%d", func(env, reg)->callsite);
} else {
verbose(env, "(id=%d", reg->id);
+ if (reg_type_may_be_refcounted_or_null(t))
+ verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
if (t != SCALAR_VALUE)
verbose(env, ",off=%d", reg->off);
if (type_is_pkt_pointer(t))
@@ -611,13 +633,10 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
}
/* release function corresponding to acquire_reference_state(). Idempotent. */
-static int __release_reference_state(struct bpf_func_state *state, int ptr_id)
+static int release_reference_state(struct bpf_func_state *state, int ptr_id)
{
int i, last_idx;
- if (!ptr_id)
- return -EFAULT;
-
last_idx = state->acquired_refs - 1;
for (i = 0; i < state->acquired_refs; i++) {
if (state->refs[i].id == ptr_id) {
@@ -629,21 +648,7 @@ static int __release_reference_state(struct bpf_func_state *state, int ptr_id)
return 0;
}
}
- return -EFAULT;
-}
-
-/* variation on the above for cases where we expect that there must be an
- * outstanding reference for the specified ptr_id.
- */
-static int release_reference_state(struct bpf_verifier_env *env, int ptr_id)
-{
- struct bpf_func_state *state = cur_func(env);
- int err;
-
- err = __release_reference_state(state, ptr_id);
- if (WARN_ON_ONCE(err != 0))
- verbose(env, "verifier internal error: can't release reference\n");
- return err;
+ return -EINVAL;
}
static int transfer_reference_state(struct bpf_func_state *dst,
@@ -712,6 +717,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
}
dst_state->speculative = src->speculative;
dst_state->curframe = src->curframe;
+ dst_state->active_spin_lock = src->active_spin_lock;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -1095,7 +1101,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++) {
u8 code = insn[i].code;
- if (BPF_CLASS(code) != BPF_JMP)
+ if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
goto next;
if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
goto next;
@@ -1201,6 +1207,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case CONST_PTR_TO_MAP:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
return true;
default:
return false;
@@ -1483,6 +1493,21 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
if (err)
verbose(env, "R%d max value is outside of the array range\n",
regno);
+
+ if (map_value_has_spin_lock(reg->map_ptr)) {
+ u32 lock = reg->map_ptr->spin_lock_off;
+
+ /* if any part of struct bpf_spin_lock can be touched by
+ * load/store reject this program.
+ * To check that [x1, x2) overlaps with [y1, y2)
+ * it is sufficient to check x1 < y2 && y1 < x2.
+ */
+ if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
+ lock < reg->umax_value + off + size) {
+ verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
+ return -EACCES;
+ }
+ }
return err;
}
@@ -1624,6 +1649,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = &regs[regno];
struct bpf_insn_access_aux info = {};
+ bool valid;
if (reg->smin_value < 0) {
verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
@@ -1631,15 +1657,31 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
return -EACCES;
}
- if (!bpf_sock_is_valid_access(off, size, t, &info)) {
- verbose(env, "invalid bpf_sock access off=%d size=%d\n",
- off, size);
- return -EACCES;
+ switch (reg->type) {
+ case PTR_TO_SOCK_COMMON:
+ valid = bpf_sock_common_is_valid_access(off, size, t, &info);
+ break;
+ case PTR_TO_SOCKET:
+ valid = bpf_sock_is_valid_access(off, size, t, &info);
+ break;
+ case PTR_TO_TCP_SOCK:
+ valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
+ break;
+ default:
+ valid = false;
}
- env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
- return 0;
+ if (valid) {
+ env->insn_aux_data[insn_idx].ctx_field_size =
+ info.ctx_field_size;
+ return 0;
+ }
+
+ verbose(env, "R%d invalid %s access off=%d size=%d\n",
+ regno, reg_type_str[reg->type], off, size);
+
+ return -EACCES;
}
static bool __is_pointer_value(bool allow_ptr_leaks,
@@ -1665,8 +1707,14 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
{
const struct bpf_reg_state *reg = reg_state(env, regno);
- return reg->type == PTR_TO_CTX ||
- reg->type == PTR_TO_SOCKET;
+ return reg->type == PTR_TO_CTX;
+}
+
+static bool is_sk_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = reg_state(env, regno);
+
+ return type_is_sk_pointer(reg->type);
}
static bool is_pkt_reg(struct bpf_verifier_env *env, int regno)
@@ -1777,6 +1825,12 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
case PTR_TO_SOCKET:
pointer_desc = "sock ";
break;
+ case PTR_TO_SOCK_COMMON:
+ pointer_desc = "sock_common ";
+ break;
+ case PTR_TO_TCP_SOCK:
+ pointer_desc = "tcp_sock ";
+ break;
default:
break;
}
@@ -1843,8 +1897,9 @@ continue_func:
}
frame++;
if (frame >= MAX_CALL_FRAMES) {
- WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
- return -EFAULT;
+ verbose(env, "the call stack of %d frames is too deep !\n",
+ frame);
+ return -E2BIG;
}
goto process_func;
}
@@ -1980,11 +2035,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* PTR_TO_PACKET[_META,_END]. In the latter
* case, we know the offset is zero.
*/
- if (reg_type == SCALAR_VALUE)
+ if (reg_type == SCALAR_VALUE) {
mark_reg_unknown(env, regs, value_regno);
- else
+ } else {
mark_reg_known_zero(env, regs,
value_regno);
+ if (reg_type_may_be_null(reg_type))
+ regs[value_regno].id = ++env->id_gen;
+ }
regs[value_regno].type = reg_type;
}
@@ -2030,9 +2088,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_flow_keys_access(env, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
- } else if (reg->type == PTR_TO_SOCKET) {
+ } else if (type_is_sk_pointer(reg->type)) {
if (t == BPF_WRITE) {
- verbose(env, "cannot write into socket\n");
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str[reg->type]);
return -EACCES;
}
err = check_sock_access(env, insn_idx, regno, off, size, t);
@@ -2079,7 +2138,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
if (is_ctx_reg(env, insn->dst_reg) ||
is_pkt_reg(env, insn->dst_reg) ||
- is_flow_key_reg(env, insn->dst_reg)) {
+ is_flow_key_reg(env, insn->dst_reg) ||
+ is_sk_reg(env, insn->dst_reg)) {
verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
insn->dst_reg,
reg_type_str[reg_state(env, insn->dst_reg)->type]);
@@ -2195,6 +2255,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
}
}
+/* Implementation details:
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
+ * Two bpf_map_lookups (even with the same key) will have different reg->id.
+ * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
+ * value_or_null->value transition, since the verifier only cares about
+ * the range of access to valid map value pointer and doesn't care about actual
+ * address of the map element.
+ * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
+ * reg->id > 0 after value_or_null->value transition. By doing so
+ * two bpf_map_lookups will be considered two different pointers that
+ * point to different bpf_spin_locks.
+ * The verifier allows taking only one bpf_spin_lock at a time to avoid
+ * dead-locks.
+ * Since only one bpf_spin_lock is allowed the checks are simpler than
+ * reg_is_refcounted() logic. The verifier needs to remember only
+ * one spin_lock instead of array of acquired_refs.
+ * cur_state->active_spin_lock remembers which map value element got locked
+ * and clears it after bpf_spin_unlock.
+ */
+static int process_spin_lock(struct bpf_verifier_env *env, int regno,
+ bool is_lock)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_verifier_state *cur = env->cur_state;
+ bool is_const = tnum_is_const(reg->var_off);
+ struct bpf_map *map = reg->map_ptr;
+ u64 val = reg->var_off.value;
+
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "R%d is not a pointer to map_value\n", regno);
+ return -EINVAL;
+ }
+ if (!is_const) {
+ verbose(env,
+ "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+ if (!map->btf) {
+ verbose(env,
+ "map '%s' has to have BTF in order to use bpf_spin_lock\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (!map_value_has_spin_lock(map)) {
+ if (map->spin_lock_off == -E2BIG)
+ verbose(env,
+ "map '%s' has more than one 'struct bpf_spin_lock'\n",
+ map->name);
+ else if (map->spin_lock_off == -ENOENT)
+ verbose(env,
+ "map '%s' doesn't have 'struct bpf_spin_lock'\n",
+ map->name);
+ else
+ verbose(env,
+ "map '%s' is not a struct type or bpf_spin_lock is mangled\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (map->spin_lock_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
+ val + reg->off);
+ return -EINVAL;
+ }
+ if (is_lock) {
+ if (cur->active_spin_lock) {
+ verbose(env,
+ "Locking two bpf_spin_locks are not allowed\n");
+ return -EINVAL;
+ }
+ cur->active_spin_lock = reg->id;
+ } else {
+ if (!cur->active_spin_lock) {
+ verbose(env, "bpf_spin_unlock without taking a lock\n");
+ return -EINVAL;
+ }
+ if (cur->active_spin_lock != reg->id) {
+ verbose(env, "bpf_spin_unlock of different lock\n");
+ return -EINVAL;
+ }
+ cur->active_spin_lock = 0;
+ }
+ return 0;
+}
+
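For program authors the checks above show up as new verifier messages. A few deliberately invalid BPF-C fragments (assuming v and w are two different looked-up map values whose type embeds struct bpf_spin_lock, as in the helpers.c sketch earlier; the single val member of the uapi struct is assumed from elsewhere in the series) and the diagnostics they trigger:

	bpf_spin_lock(&v->lock);
	bpf_spin_lock(&w->lock);	/* "Locking two bpf_spin_locks are not allowed" */

	bpf_spin_unlock(&v->lock);	/* if no lock is held: "bpf_spin_unlock without taking a lock" */
	bpf_spin_unlock(&w->lock);	/* while v's lock is held: "bpf_spin_unlock of different lock" */

	v->lock.val = 0;		/* "bpf_spin_lock cannot be accessed directly by load/store" */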
static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
{
return type == ARG_PTR_TO_MEM ||
@@ -2261,16 +2406,31 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
err = check_ctx_reg(env, reg, regno);
if (err < 0)
return err;
- } else if (arg_type == ARG_PTR_TO_SOCKET) {
- expected_type = PTR_TO_SOCKET;
- if (type != expected_type)
+ } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) {
+ expected_type = PTR_TO_SOCK_COMMON;
+ /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */
+ if (!type_is_sk_pointer(type))
goto err_type;
- if (meta->ptr_id || !reg->id) {
- verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n",
- meta->ptr_id, reg->id);
+ if (reg->ref_obj_id) {
+ if (meta->ref_obj_id) {
+ verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ regno, reg->ref_obj_id,
+ meta->ref_obj_id);
+ return -EFAULT;
+ }
+ meta->ref_obj_id = reg->ref_obj_id;
+ }
+ } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
+ if (meta->func_id == BPF_FUNC_spin_lock) {
+ if (process_spin_lock(env, regno, true))
+ return -EACCES;
+ } else if (meta->func_id == BPF_FUNC_spin_unlock) {
+ if (process_spin_lock(env, regno, false))
+ return -EACCES;
+ } else {
+ verbose(env, "verifier internal error\n");
return -EFAULT;
}
- meta->ptr_id = reg->id;
} else if (arg_type_is_mem_ptr(arg_type)) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
@@ -2576,32 +2736,38 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
return true;
}
-static bool check_refcount_ok(const struct bpf_func_proto *fn)
+static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
{
int count = 0;
- if (arg_type_is_refcounted(fn->arg1_type))
+ if (arg_type_may_be_refcounted(fn->arg1_type))
count++;
- if (arg_type_is_refcounted(fn->arg2_type))
+ if (arg_type_may_be_refcounted(fn->arg2_type))
count++;
- if (arg_type_is_refcounted(fn->arg3_type))
+ if (arg_type_may_be_refcounted(fn->arg3_type))
count++;
- if (arg_type_is_refcounted(fn->arg4_type))
+ if (arg_type_may_be_refcounted(fn->arg4_type))
count++;
- if (arg_type_is_refcounted(fn->arg5_type))
+ if (arg_type_may_be_refcounted(fn->arg5_type))
count++;
+ /* A reference acquiring function cannot acquire
+ * another refcounted ptr.
+ */
+ if (is_acquire_function(func_id) && count)
+ return false;
+
/* We only support one arg being unreferenced at the moment,
* which is sufficient for the helper functions we have right now.
*/
return count <= 1;
}
-static int check_func_proto(const struct bpf_func_proto *fn)
+static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
{
return check_raw_mode_ok(fn) &&
check_arg_pair_ok(fn) &&
- check_refcount_ok(fn) ? 0 : -EINVAL;
+ check_refcount_ok(fn, func_id) ? 0 : -EINVAL;
}
/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
@@ -2635,19 +2801,20 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
}
static void release_reg_references(struct bpf_verifier_env *env,
- struct bpf_func_state *state, int id)
+ struct bpf_func_state *state,
+ int ref_obj_id)
{
struct bpf_reg_state *regs = state->regs, *reg;
int i;
for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].id == id)
+ if (regs[i].ref_obj_id == ref_obj_id)
mark_reg_unknown(env, regs, i);
bpf_for_each_spilled_reg(i, state, reg) {
if (!reg)
continue;
- if (reg_is_refcounted(reg) && reg->id == id)
+ if (reg->ref_obj_id == ref_obj_id)
__mark_reg_unknown(reg);
}
}
@@ -2656,15 +2823,20 @@ static void release_reg_references(struct bpf_verifier_env *env,
* resources. Identify all copies of the same pointer and clear the reference.
*/
static int release_reference(struct bpf_verifier_env *env,
- struct bpf_call_arg_meta *meta)
+ int ref_obj_id)
{
struct bpf_verifier_state *vstate = env->cur_state;
+ int err;
int i;
+ err = release_reference_state(cur_func(env), ref_obj_id);
+ if (err)
+ return err;
+
for (i = 0; i <= vstate->curframe; i++)
- release_reg_references(env, vstate->frame[i], meta->ptr_id);
+ release_reg_references(env, vstate->frame[i], ref_obj_id);
- return release_reference_state(env, meta->ptr_id);
+ return 0;
}
static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
@@ -2883,13 +3055,14 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
memset(&meta, 0, sizeof(meta));
meta.pkt_access = fn->pkt_access;
- err = check_func_proto(fn);
+ err = check_func_proto(fn, func_id);
if (err) {
verbose(env, "kernel subsystem misconfigured func %s#%d\n",
func_id_name(func_id), func_id);
return err;
}
+ meta.func_id = func_id;
/* check args */
err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
if (err)
@@ -2928,9 +3101,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return err;
}
} else if (is_release_function(func_id)) {
- err = release_reference(env, &meta);
- if (err)
+ err = release_reference(env, meta.ref_obj_id);
+ if (err) {
+ verbose(env, "func %s#%d reference has not been acquired before\n",
+ func_id_name(func_id), func_id);
return err;
+ }
}
regs = cur_regs(env);
@@ -2972,23 +3148,42 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
regs[BPF_REG_0].map_ptr = meta.map_ptr;
if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
+ if (map_value_has_spin_lock(meta.map_ptr))
+ regs[BPF_REG_0].id = ++env->id_gen;
} else {
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
regs[BPF_REG_0].id = ++env->id_gen;
}
} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
- int id = acquire_reference_state(env, insn_idx);
- if (id < 0)
- return id;
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
- regs[BPF_REG_0].id = id;
+ if (is_acquire_function(func_id)) {
+ int id = acquire_reference_state(env, insn_idx);
+
+ if (id < 0)
+ return id;
+ /* For mark_ptr_or_null_reg() */
+ regs[BPF_REG_0].id = id;
+ /* For release_reference() */
+ regs[BPF_REG_0].ref_obj_id = id;
+ } else {
+ /* For mark_ptr_or_null_reg() */
+ regs[BPF_REG_0].id = ++env->id_gen;
+ }
+ } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
+ regs[BPF_REG_0].id = ++env->id_gen;
} else {
verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
return -EINVAL;
}
+ if (is_ptr_cast_function(func_id))
+ /* For release_reference() */
+ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+
do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
err = check_map_func_compatibility(env, meta.map_ptr, func_id);
@@ -3187,7 +3382,7 @@ do_sim:
*dst_reg = *ptr_reg;
}
ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
- if (!ptr_is_dst_reg)
+ if (!ptr_is_dst_reg && ret)
*dst_reg = tmp;
return !ret ? -EFAULT : 0;
}
@@ -3242,6 +3437,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
case PTR_TO_PACKET_END:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str[ptr_reg->type]);
return -EACCES;
@@ -4034,11 +4233,50 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
* 0 - branch will not be taken and fall-through to next insn
* -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10]
*/
-static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
+static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
+ bool is_jmp32)
{
+ struct bpf_reg_state reg_lo;
+ s64 sval;
+
if (__is_pointer_value(false, reg))
return -1;
+ if (is_jmp32) {
+ reg_lo = *reg;
+ reg = &reg_lo;
+ /* For JMP32, only low 32 bits are compared, coerce_reg_to_size
+ * could truncate high bits and update umin/umax according to
+ * information of low bits.
+ */
+ coerce_reg_to_size(reg, 4);
+ /* smin/smax need special handling. For example, after coerce,
+ * if smin_value is 0x00000000ffffffffLL, the value is -1 when
+ * used as operand to JMP32. It is a negative number from s32's
+ * point of view, while it is a positive number when seen as
+ * s64. The smin/smax are kept as s64, therefore, when used with
+ * JMP32, they need to be transformed into s32, then sign
+ * extended back to s64.
+ *
+ * Also, smin/smax were copied from umin/umax. If umin and umax
+ * have different sign bits, the min/max relationship no longer
+ * holds after casting to s32; in that case, set smin/smax to the
+ * safest possible range.
+ */
+ if ((reg->umax_value ^ reg->umin_value) &
+ (1ULL << 31)) {
+ reg->smin_value = S32_MIN;
+ reg->smax_value = S32_MAX;
+ }
+ reg->smin_value = (s64)(s32)reg->smin_value;
+ reg->smax_value = (s64)(s32)reg->smax_value;
+
+ val = (u32)val;
+ sval = (s64)(s32)val;
+ } else {
+ sval = (s64)val;
+ }
+
switch (opcode) {
case BPF_JEQ:
if (tnum_is_const(reg->var_off))
@@ -4061,9 +4299,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return 0;
break;
case BPF_JSGT:
- if (reg->smin_value > (s64)val)
+ if (reg->smin_value > sval)
return 1;
- else if (reg->smax_value < (s64)val)
+ else if (reg->smax_value < sval)
return 0;
break;
case BPF_JLT:
@@ -4073,9 +4311,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return 0;
break;
case BPF_JSLT:
- if (reg->smax_value < (s64)val)
+ if (reg->smax_value < sval)
return 1;
- else if (reg->smin_value >= (s64)val)
+ else if (reg->smin_value >= sval)
return 0;
break;
case BPF_JGE:
@@ -4085,9 +4323,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return 0;
break;
case BPF_JSGE:
- if (reg->smin_value >= (s64)val)
+ if (reg->smin_value >= sval)
return 1;
- else if (reg->smax_value < (s64)val)
+ else if (reg->smax_value < sval)
return 0;
break;
case BPF_JLE:
@@ -4097,9 +4335,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return 0;
break;
case BPF_JSLE:
- if (reg->smax_value <= (s64)val)
+ if (reg->smax_value <= sval)
return 1;
- else if (reg->smin_value > (s64)val)
+ else if (reg->smin_value > sval)
return 0;
break;
}
@@ -4107,6 +4345,29 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return -1;
}
+/* Generate min value of the high 32-bit from TNUM info. */
+static u64 gen_hi_min(struct tnum var)
+{
+ return var.value & ~0xffffffffULL;
+}
+
+/* Generate max value of the high 32-bit from TNUM info. */
+static u64 gen_hi_max(struct tnum var)
+{
+ return (var.value | var.mask) & ~0xffffffffULL;
+}
+
+/* Return true if VAL is compared with a s64 sign extended from s32, and they
+ * are with the same signedness.
+ */
+static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg)
+{
+ return ((s32)sval >= 0 &&
+ reg->smin_value >= 0 && reg->smax_value <= S32_MAX) ||
+ ((s32)sval < 0 &&
+ reg->smax_value <= 0 && reg->smin_value >= S32_MIN);
+}
+
/* Adjusts the register min/max values in the case that the dst_reg is the
* variable register that we are working on, and src_reg is a constant or we're
* simply doing a BPF_K check.
@@ -4114,8 +4375,10 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
*/
static void reg_set_min_max(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
- u8 opcode)
+ u8 opcode, bool is_jmp32)
{
+ s64 sval;
+
/* If the dst_reg is a pointer, we can't learn anything about its
* variable offset from the compare (unless src_reg were a pointer into
* the same object, but we don't bother with that.
@@ -4125,19 +4388,31 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
if (__is_pointer_value(false, false_reg))
return;
+ val = is_jmp32 ? (u32)val : val;
+ sval = is_jmp32 ? (s64)(s32)val : (s64)val;
+
switch (opcode) {
case BPF_JEQ:
- /* If this is false then we know nothing Jon Snow, but if it is
- * true then we know for sure.
- */
- __mark_reg_known(true_reg, val);
- break;
case BPF_JNE:
- /* If this is true we know nothing Jon Snow, but if it is false
- * we know the value for sure;
+ {
+ struct bpf_reg_state *reg =
+ opcode == BPF_JEQ ? true_reg : false_reg;
+
+ /* For BPF_JEQ, if this is false we know nothing Jon Snow, but
+ * if it is true we know the value for sure. Likewise for
+ * BPF_JNE.
*/
- __mark_reg_known(false_reg, val);
+ if (is_jmp32) {
+ u64 old_v = reg->var_off.value;
+ u64 hi_mask = ~0xffffffffULL;
+
+ reg->var_off.value = (old_v & hi_mask) | val;
+ reg->var_off.mask &= hi_mask;
+ } else {
+ __mark_reg_known(reg, val);
+ }
break;
+ }
case BPF_JSET:
false_reg->var_off = tnum_and(false_reg->var_off,
tnum_const(~val));
@@ -4145,38 +4420,61 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
true_reg->var_off = tnum_or(true_reg->var_off,
tnum_const(val));
break;
- case BPF_JGT:
- false_reg->umax_value = min(false_reg->umax_value, val);
- true_reg->umin_value = max(true_reg->umin_value, val + 1);
- break;
- case BPF_JSGT:
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
- break;
- case BPF_JLT:
- false_reg->umin_value = max(false_reg->umin_value, val);
- true_reg->umax_value = min(true_reg->umax_value, val - 1);
- break;
- case BPF_JSLT:
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
- break;
case BPF_JGE:
- false_reg->umax_value = min(false_reg->umax_value, val - 1);
- true_reg->umin_value = max(true_reg->umin_value, val);
+ case BPF_JGT:
+ {
+ u64 false_umax = opcode == BPF_JGT ? val : val - 1;
+ u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
+
+ if (is_jmp32) {
+ false_umax += gen_hi_max(false_reg->var_off);
+ true_umin += gen_hi_min(true_reg->var_off);
+ }
+ false_reg->umax_value = min(false_reg->umax_value, false_umax);
+ true_reg->umin_value = max(true_reg->umin_value, true_umin);
break;
+ }
case BPF_JSGE:
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
+ case BPF_JSGT:
+ {
+ s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1;
+ s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;
+
+ /* If the full s64 was not sign-extended from s32 then don't
+ * deduce further info.
+ */
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smax_value = min(false_reg->smax_value, false_smax);
+ true_reg->smin_value = max(true_reg->smin_value, true_smin);
break;
+ }
case BPF_JLE:
- false_reg->umin_value = max(false_reg->umin_value, val + 1);
- true_reg->umax_value = min(true_reg->umax_value, val);
+ case BPF_JLT:
+ {
+ u64 false_umin = opcode == BPF_JLT ? val : val + 1;
+ u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
+
+ if (is_jmp32) {
+ false_umin += gen_hi_min(false_reg->var_off);
+ true_umax += gen_hi_max(true_reg->var_off);
+ }
+ false_reg->umin_value = max(false_reg->umin_value, false_umin);
+ true_reg->umax_value = min(true_reg->umax_value, true_umax);
break;
+ }
case BPF_JSLE:
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
+ case BPF_JSLT:
+ {
+ s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1;
+ s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;
+
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smin_value = max(false_reg->smin_value, false_smin);
+ true_reg->smax_value = min(true_reg->smax_value, true_smax);
break;
+ }
default:
break;
}
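/*
 * Worked example (illustrative, not from the patch) for the JGE/JGT case
 * above with is_jmp32 set: take "if w1 > 10 goto ..." (BPF_JGT, val == 10) on
 * a register whose var_off says the upper 32 bits are known to be 0x1, so
 * gen_hi_min() == gen_hi_max() == 0x100000000:
 *
 *   false branch: low 32 bits <= 10  ->  umax_value <= 0x100000000 + 10
 *   true  branch: low 32 bits >= 11  ->  umin_value >= 0x100000000 + 11
 *
 * If the upper 32 bits were unknown instead, gen_hi_min() contributes 0 on
 * the true side (a valid, if weak, lower bound) and gen_hi_max() contributes
 * 0xffffffff00000000 on the false side, so the derived bounds stay sound.
 */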
@@ -4199,24 +4497,34 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
*/
static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
- u8 opcode)
+ u8 opcode, bool is_jmp32)
{
+ s64 sval;
+
if (__is_pointer_value(false, false_reg))
return;
+ val = is_jmp32 ? (u32)val : val;
+ sval = is_jmp32 ? (s64)(s32)val : (s64)val;
+
switch (opcode) {
case BPF_JEQ:
- /* If this is false then we know nothing Jon Snow, but if it is
- * true then we know for sure.
- */
- __mark_reg_known(true_reg, val);
- break;
case BPF_JNE:
- /* If this is true we know nothing Jon Snow, but if it is false
- * we know the value for sure;
- */
- __mark_reg_known(false_reg, val);
+ {
+ struct bpf_reg_state *reg =
+ opcode == BPF_JEQ ? true_reg : false_reg;
+
+ if (is_jmp32) {
+ u64 old_v = reg->var_off.value;
+ u64 hi_mask = ~0xffffffffULL;
+
+ reg->var_off.value = (old_v & hi_mask) | val;
+ reg->var_off.mask &= hi_mask;
+ } else {
+ __mark_reg_known(reg, val);
+ }
break;
+ }
case BPF_JSET:
false_reg->var_off = tnum_and(false_reg->var_off,
tnum_const(~val));
@@ -4224,38 +4532,58 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
true_reg->var_off = tnum_or(true_reg->var_off,
tnum_const(val));
break;
- case BPF_JGT:
- true_reg->umax_value = min(true_reg->umax_value, val - 1);
- false_reg->umin_value = max(false_reg->umin_value, val);
- break;
- case BPF_JSGT:
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
- break;
- case BPF_JLT:
- true_reg->umin_value = max(true_reg->umin_value, val + 1);
- false_reg->umax_value = min(false_reg->umax_value, val);
- break;
- case BPF_JSLT:
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
- break;
case BPF_JGE:
- true_reg->umax_value = min(true_reg->umax_value, val);
- false_reg->umin_value = max(false_reg->umin_value, val + 1);
+ case BPF_JGT:
+ {
+ u64 false_umin = opcode == BPF_JGT ? val : val + 1;
+ u64 true_umax = opcode == BPF_JGT ? val - 1 : val;
+
+ if (is_jmp32) {
+ false_umin += gen_hi_min(false_reg->var_off);
+ true_umax += gen_hi_max(true_reg->var_off);
+ }
+ false_reg->umin_value = max(false_reg->umin_value, false_umin);
+ true_reg->umax_value = min(true_reg->umax_value, true_umax);
break;
+ }
case BPF_JSGE:
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
+ case BPF_JSGT:
+ {
+ s64 false_smin = opcode == BPF_JSGT ? sval : sval + 1;
+ s64 true_smax = opcode == BPF_JSGT ? sval - 1 : sval;
+
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smin_value = max(false_reg->smin_value, false_smin);
+ true_reg->smax_value = min(true_reg->smax_value, true_smax);
break;
+ }
case BPF_JLE:
- true_reg->umin_value = max(true_reg->umin_value, val);
- false_reg->umax_value = min(false_reg->umax_value, val - 1);
+ case BPF_JLT:
+ {
+ u64 false_umax = opcode == BPF_JLT ? val : val - 1;
+ u64 true_umin = opcode == BPF_JLT ? val + 1 : val;
+
+ if (is_jmp32) {
+ false_umax += gen_hi_max(false_reg->var_off);
+ true_umin += gen_hi_min(true_reg->var_off);
+ }
+ false_reg->umax_value = min(false_reg->umax_value, false_umax);
+ true_reg->umin_value = max(true_reg->umin_value, true_umin);
break;
+ }
case BPF_JSLE:
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
+ case BPF_JSLT:
+ {
+ s64 false_smax = opcode == BPF_JSLT ? sval : sval - 1;
+ s64 true_smin = opcode == BPF_JSLT ? sval + 1 : sval;
+
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smax_value = min(false_reg->smax_value, false_smax);
+ true_reg->smin_value = max(true_reg->smin_value, true_smin);
break;
+ }
default:
break;
}
@@ -4346,11 +4674,24 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
}
} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
reg->type = PTR_TO_SOCKET;
+ } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) {
+ reg->type = PTR_TO_SOCK_COMMON;
+ } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
+ reg->type = PTR_TO_TCP_SOCK;
}
- if (is_null || !reg_is_refcounted(reg)) {
- /* We don't need id from this point onwards anymore,
- * thus we should better reset it, so that state
- * pruning has chances to take effect.
+ if (is_null) {
+ /* We don't need id and ref_obj_id from this point
+ * onwards anymore, thus we should better reset it,
+ * so that state pruning has chances to take effect.
+ */
+ reg->id = 0;
+ reg->ref_obj_id = 0;
+ } else if (!reg_may_point_to_spin_lock(reg)) {
+ /* For not-NULL ptr, reg->ref_obj_id will be reset
+ * in release_reg_references().
+ *
+ * reg->id is still used by spin_lock ptr. Other
+ * than spin_lock ptr type, reg->id can be reset.
*/
reg->id = 0;
}
@@ -4365,11 +4706,16 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
{
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *reg, *regs = state->regs;
+ u32 ref_obj_id = regs[regno].ref_obj_id;
u32 id = regs[regno].id;
int i, j;
- if (reg_is_refcounted_or_null(&regs[regno]) && is_null)
- __release_reference_state(state, id);
+ if (ref_obj_id && ref_obj_id == id && is_null)
+ /* regs[regno] is in the " == NULL" branch.
+ * No one could have freed the reference state before
+ * doing the NULL check.
+ */
+ WARN_ON_ONCE(release_reference_state(state, id));
for (i = 0; i < MAX_BPF_REG; i++)
mark_ptr_or_null_reg(state, &regs[i], id, is_null);
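/*
 * Illustrative sketch, not part of this patch: the program pattern the two
 * functions above track.  SEC(), bpf_sk_lookup_tcp(), bpf_sk_release() and
 * bpf_helpers.h are the usual libbpf conventions and are assumed here rather
 * than introduced by this diff.
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"

SEC("classifier")
int sk_ref_example(struct __sk_buff *skb)
{
	struct bpf_sock_tuple tuple = {};
	struct bpf_sock *sk;

	/* sk is PTR_TO_SOCKET_OR_NULL and carries a ref_obj_id */
	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4), -1ULL, 0);
	if (!sk)
		/* "== NULL" branch: mark_ptr_or_null_regs() drops the
		 * reference state and clears id/ref_obj_id
		 */
		return 0;

	/* non-NULL branch: the type becomes PTR_TO_SOCKET; the acquired
	 * reference still has to be released explicitly
	 */
	bpf_sk_release(sk);
	return 0;
}

char _license[] SEC("license") = "GPL";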
@@ -4393,6 +4739,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
if (BPF_SRC(insn->code) != BPF_X)
return false;
+ /* Pointers are always 64-bit. */
+ if (BPF_CLASS(insn->code) == BPF_JMP32)
+ return false;
+
switch (BPF_OP(insn->code)) {
case BPF_JGT:
if ((dst_reg->type == PTR_TO_PACKET &&
@@ -4485,16 +4835,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
struct bpf_reg_state *dst_reg, *other_branch_regs;
u8 opcode = BPF_OP(insn->code);
+ bool is_jmp32;
int err;
- if (opcode > BPF_JSLE) {
- verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
+ /* Only conditional jumps are expected to reach here. */
+ if (opcode == BPF_JA || opcode > BPF_JSLE) {
+ verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
return -EINVAL;
}
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
- verbose(env, "BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
@@ -4510,7 +4862,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
} else {
if (insn->src_reg != BPF_REG_0) {
- verbose(env, "BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
}
@@ -4521,9 +4873,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return err;
dst_reg = &regs[insn->dst_reg];
+ is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
if (BPF_SRC(insn->code) == BPF_K) {
- int pred = is_branch_taken(dst_reg, insn->imm, opcode);
+ int pred = is_branch_taken(dst_reg, insn->imm, opcode,
+ is_jmp32);
if (pred == 1) {
/* only follow the goto, ignore fall-through */
@@ -4551,30 +4905,51 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
* comparable.
*/
if (BPF_SRC(insn->code) == BPF_X) {
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+ struct bpf_reg_state lo_reg0 = *dst_reg;
+ struct bpf_reg_state lo_reg1 = *src_reg;
+ struct bpf_reg_state *src_lo, *dst_lo;
+
+ dst_lo = &lo_reg0;
+ src_lo = &lo_reg1;
+ coerce_reg_to_size(dst_lo, 4);
+ coerce_reg_to_size(src_lo, 4);
+
if (dst_reg->type == SCALAR_VALUE &&
- regs[insn->src_reg].type == SCALAR_VALUE) {
- if (tnum_is_const(regs[insn->src_reg].var_off))
+ src_reg->type == SCALAR_VALUE) {
+ if (tnum_is_const(src_reg->var_off) ||
+ (is_jmp32 && tnum_is_const(src_lo->var_off)))
reg_set_min_max(&other_branch_regs[insn->dst_reg],
- dst_reg, regs[insn->src_reg].var_off.value,
- opcode);
- else if (tnum_is_const(dst_reg->var_off))
+ dst_reg,
+ is_jmp32
+ ? src_lo->var_off.value
+ : src_reg->var_off.value,
+ opcode, is_jmp32);
+ else if (tnum_is_const(dst_reg->var_off) ||
+ (is_jmp32 && tnum_is_const(dst_lo->var_off)))
reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
- &regs[insn->src_reg],
- dst_reg->var_off.value, opcode);
- else if (opcode == BPF_JEQ || opcode == BPF_JNE)
+ src_reg,
+ is_jmp32
+ ? dst_lo->var_off.value
+ : dst_reg->var_off.value,
+ opcode, is_jmp32);
+ else if (!is_jmp32 &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE))
/* Comparing for equality, we can combine knowledge */
reg_combine_min_max(&other_branch_regs[insn->src_reg],
&other_branch_regs[insn->dst_reg],
- &regs[insn->src_reg],
- &regs[insn->dst_reg], opcode);
+ src_reg, dst_reg, opcode);
}
} else if (dst_reg->type == SCALAR_VALUE) {
reg_set_min_max(&other_branch_regs[insn->dst_reg],
- dst_reg, insn->imm, opcode);
+ dst_reg, insn->imm, opcode, is_jmp32);
}
- /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */
- if (BPF_SRC(insn->code) == BPF_K &&
+ /* detect if R == 0 where R is returned from bpf_map_lookup_elem().
+ * NOTE: these optimizations below are related to pointer comparisons,
+ * which will never be JMP32.
+ */
+ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
reg_type_may_be_null(dst_reg->type)) {
/* Mark all identical registers in each branch as either
@@ -4716,6 +5091,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
+ if (env->cur_state->active_spin_lock) {
+ verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
+ return -EINVAL;
+ }
+
if (regs[BPF_REG_6].type != PTR_TO_CTX) {
verbose(env,
"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -4903,7 +5283,8 @@ peek_stack:
goto check_state;
t = insn_stack[cur_stack - 1];
- if (BPF_CLASS(insns[t].code) == BPF_JMP) {
+ if (BPF_CLASS(insns[t].code) == BPF_JMP ||
+ BPF_CLASS(insns[t].code) == BPF_JMP32) {
u8 opcode = BPF_OP(insns[t].code);
if (opcode == BPF_EXIT) {
@@ -5000,13 +5381,14 @@ static int check_btf_func(struct bpf_verifier_env *env,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- u32 i, nfuncs, urec_size, min_size, prev_offset;
+ u32 i, nfuncs, urec_size, min_size;
u32 krec_size = sizeof(struct bpf_func_info);
struct bpf_func_info *krecord;
const struct btf_type *type;
struct bpf_prog *prog;
const struct btf *btf;
void __user *urecord;
+ u32 prev_offset = 0;
int ret = 0;
nfuncs = attr->func_info_cnt;
@@ -5450,8 +5832,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
case PTR_TO_MAP_VALUE:
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
- * We don't care about the 'id' value, because nothing
- * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
+ * 'id' is not compared, since it's only used for maps with
+ * bpf_spin_lock inside map element and in such cases if
+ * the rest of the prog is valid for one map element then
+ * it's valid for all map elements regardless of the key
+ * used in bpf_map_lookup()
*/
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
range_within(rold, rcur) &&
@@ -5499,6 +5884,10 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
case PTR_TO_FLOW_KEYS:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
/* Only valid matches are exact, which memcmp() above
* would have accepted
*/
@@ -5654,6 +6043,9 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->speculative && !cur->speculative)
return false;
+ if (old->active_spin_lock != cur->active_spin_lock)
+ return false;
+
/* for states to be equal callsites have to be the same
* and all frame states need to be equivalent
*/
@@ -5687,15 +6079,17 @@ static int propagate_liveness(struct bpf_verifier_env *env,
}
/* Propagate read liveness of registers... */
BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
- /* We don't need to worry about FP liveness because it's read-only */
- for (i = 0; i < BPF_REG_FP; i++) {
- if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
- continue;
- if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
- err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i],
- &vparent->frame[vstate->curframe]->regs[i]);
- if (err)
- return err;
+ for (frame = 0; frame <= vstate->curframe; frame++) {
+ /* We don't need to worry about FP liveness, it's read-only */
+ for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
+ if (vparent->frame[frame]->regs[i].live & REG_LIVE_READ)
+ continue;
+ if (vstate->frame[frame]->regs[i].live & REG_LIVE_READ) {
+ err = mark_reg_read(env, &vstate->frame[frame]->regs[i],
+ &vparent->frame[frame]->regs[i]);
+ if (err)
+ return err;
+ }
}
}
@@ -5816,6 +6210,10 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
case PTR_TO_CTX:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
return false;
default:
return true;
@@ -6058,7 +6456,7 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- } else if (class == BPF_JMP) {
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
u8 opcode = BPF_OP(insn->code);
if (opcode == BPF_CALL) {
@@ -6066,11 +6464,18 @@ static int do_check(struct bpf_verifier_env *env)
insn->off != 0 ||
(insn->src_reg != BPF_REG_0 &&
insn->src_reg != BPF_PSEUDO_CALL) ||
- insn->dst_reg != BPF_REG_0) {
+ insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
verbose(env, "BPF_CALL uses reserved fields\n");
return -EINVAL;
}
+ if (env->cur_state->active_spin_lock &&
+ (insn->src_reg == BPF_PSEUDO_CALL ||
+ insn->imm != BPF_FUNC_spin_unlock)) {
+ verbose(env, "function calls are not allowed while holding a lock\n");
+ return -EINVAL;
+ }
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
else
@@ -6082,7 +6487,8 @@ static int do_check(struct bpf_verifier_env *env)
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0) {
+ insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
verbose(env, "BPF_JA uses reserved fields\n");
return -EINVAL;
}
@@ -6094,11 +6500,17 @@ static int do_check(struct bpf_verifier_env *env)
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0) {
+ insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}
+ if (env->cur_state->active_spin_lock) {
+ verbose(env, "bpf_spin_unlock is missing\n");
+ return -EINVAL;
+ }
+
if (state->curframe) {
/* exit from nested function */
env->prev_insn_idx = env->insn_idx;
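/*
 * A minimal sketch of the locking rules enforced above (and by the
 * bpf_spin_lock argument checks elsewhere in this series): between
 * bpf_spin_lock() and bpf_spin_unlock() the program may not call helpers or
 * other functions, may not use BPF_LD_[ABS|IND], and every path must unlock
 * before BPF_EXIT.  The SEC()/map declarations follow libbpf conventions and
 * are assumptions, not part of this diff; the map value type must also be
 * described by BTF so the verifier can locate the bpf_spin_lock field.
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"

struct hmap_elem {
	int counter;
	struct bpf_spin_lock lock;
};

struct bpf_map_def SEC("maps") hmap = {
	.type = BPF_MAP_TYPE_HASH,
	.key_size = sizeof(int),
	.value_size = sizeof(struct hmap_elem),
	.max_entries = 1,
};

SEC("classifier")
int spin_lock_example(struct __sk_buff *skb)
{
	struct hmap_elem *val;
	int key = 0;

	val = bpf_map_lookup_elem(&hmap, &key);
	if (!val)
		return 0;

	bpf_spin_lock(&val->lock);
	val->counter++;			/* plain memory accesses only */
	bpf_spin_unlock(&val->lock);	/* must happen on every path before exit */
	return 0;
}

char _license[] SEC("license") = "GPL";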
@@ -6196,6 +6608,19 @@ static int check_map_prealloc(struct bpf_map *map)
!(map->map_flags & BPF_F_NO_PREALLOC);
}
+static bool is_tracing_prog_type(enum bpf_prog_type type)
+{
+ switch (type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_map *map,
struct bpf_prog *prog)
@@ -6218,6 +6643,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
+ if ((is_tracing_prog_type(prog->type) ||
+ prog->type == BPF_PROG_TYPE_SOCKET_FILTER) &&
+ map_value_has_spin_lock(map)) {
+ verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
+ return -EINVAL;
+ }
+
if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
@@ -6275,17 +6707,17 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
/* valid generic load 64-bit imm */
goto next_insn;
- if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
- verbose(env,
- "unrecognized bpf_ld_imm64 insn\n");
+ if (insn[0].src_reg != BPF_PSEUDO_MAP_FD ||
+ insn[1].imm != 0) {
+ verbose(env, "unrecognized bpf_ld_imm64 insn\n");
return -EINVAL;
}
- f = fdget(insn->imm);
+ f = fdget(insn[0].imm);
map = __bpf_map_get(f);
if (IS_ERR(map)) {
verbose(env, "fd %d is not pointing to valid bpf_map\n",
- insn->imm);
+ insn[0].imm);
return PTR_ERR(map);
}
@@ -6434,6 +6866,153 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
return new_prog;
}
+static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
+ u32 off, u32 cnt)
+{
+ int i, j;
+
+ /* find first prog starting at or after off (first to remove) */
+ for (i = 0; i < env->subprog_cnt; i++)
+ if (env->subprog_info[i].start >= off)
+ break;
+ /* find first prog starting at or after off + cnt (first to stay) */
+ for (j = i; j < env->subprog_cnt; j++)
+ if (env->subprog_info[j].start >= off + cnt)
+ break;
+ /* if j doesn't start exactly at off + cnt, we are just removing
+ * the front of previous prog
+ */
+ if (env->subprog_info[j].start != off + cnt)
+ j--;
+
+ if (j > i) {
+ struct bpf_prog_aux *aux = env->prog->aux;
+ int move;
+
+ /* move fake 'exit' subprog as well */
+ move = env->subprog_cnt + 1 - j;
+
+ memmove(env->subprog_info + i,
+ env->subprog_info + j,
+ sizeof(*env->subprog_info) * move);
+ env->subprog_cnt -= j - i;
+
+ /* remove func_info */
+ if (aux->func_info) {
+ move = aux->func_info_cnt - j;
+
+ memmove(aux->func_info + i,
+ aux->func_info + j,
+ sizeof(*aux->func_info) * move);
+ aux->func_info_cnt -= j - i;
+ /* func_info->insn_off is set after all code rewrites,
+ * in adjust_btf_func() - no need to adjust
+ */
+ }
+ } else {
+ /* convert i from "first prog to remove" to "first to adjust" */
+ if (env->subprog_info[i].start == off)
+ i++;
+ }
+
+ /* update fake 'exit' subprog as well */
+ for (; i <= env->subprog_cnt; i++)
+ env->subprog_info[i].start -= cnt;
+
+ return 0;
+}
+
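/*
 * Worked example (illustrative): subprog starts {0, 10, 20} plus the fake
 * 'exit' entry at 40 (prog len 40).  Subprog 1 is never called, so insns
 * [10, 20) are removed (off = 10, cnt = 10):
 *
 *   i = 1  (first start >= off, first entry to remove)
 *   j = 2  (first start >= off + cnt, first entry to keep)
 *   entry 1 is dropped, subprog_cnt goes 3 -> 2, and every start from index 1
 *   on is shifted down by cnt: {0, 20, 40} -> {0, 10, 30}
 *
 * func_info entries are dropped the same way; bpf_adj_linfo_after_remove()
 * below performs the matching surgery on the line info table.
 */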
+static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
+ u32 cnt)
+{
+ struct bpf_prog *prog = env->prog;
+ u32 i, l_off, l_cnt, nr_linfo;
+ struct bpf_line_info *linfo;
+
+ nr_linfo = prog->aux->nr_linfo;
+ if (!nr_linfo)
+ return 0;
+
+ linfo = prog->aux->linfo;
+
+ /* find first line info to remove, count lines to be removed */
+ for (i = 0; i < nr_linfo; i++)
+ if (linfo[i].insn_off >= off)
+ break;
+
+ l_off = i;
+ l_cnt = 0;
+ for (; i < nr_linfo; i++)
+ if (linfo[i].insn_off < off + cnt)
+ l_cnt++;
+ else
+ break;
+
+ /* If the first live insn doesn't match the first live linfo, it needs
+ * to "inherit" the last removed linfo.  prog is already modified, so
+ * prog->len == off means there are no live instructions after off
+ * (the tail of the program was removed).
+ */
+ if (prog->len != off && l_cnt &&
+ (i == nr_linfo || linfo[i].insn_off != off + cnt)) {
+ l_cnt--;
+ linfo[--i].insn_off = off + cnt;
+ }
+
+ /* remove the line info which refer to the removed instructions */
+ if (l_cnt) {
+ memmove(linfo + l_off, linfo + i,
+ sizeof(*linfo) * (nr_linfo - i));
+
+ prog->aux->nr_linfo -= l_cnt;
+ nr_linfo = prog->aux->nr_linfo;
+ }
+
+ /* pull all linfo[i].insn_off >= off + cnt in by cnt */
+ for (i = l_off; i < nr_linfo; i++)
+ linfo[i].insn_off -= cnt;
+
+ /* fix up all subprogs (incl. 'exit') which start >= off */
+ for (i = 0; i <= env->subprog_cnt; i++)
+ if (env->subprog_info[i].linfo_idx > l_off) {
+ /* program may have started in the removed region but
+ * may not be fully removed
+ */
+ if (env->subprog_info[i].linfo_idx >= l_off + l_cnt)
+ env->subprog_info[i].linfo_idx -= l_cnt;
+ else
+ env->subprog_info[i].linfo_idx = l_off;
+ }
+
+ return 0;
+}
+
+static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ unsigned int orig_prog_len = env->prog->len;
+ int err;
+
+ if (bpf_prog_is_dev_bound(env->prog->aux))
+ bpf_prog_offload_remove_insns(env, off, cnt);
+
+ err = bpf_remove_insns(env->prog, off, cnt);
+ if (err)
+ return err;
+
+ err = adjust_subprog_starts_after_remove(env, off, cnt);
+ if (err)
+ return err;
+
+ err = bpf_adj_linfo_after_remove(env, off, cnt);
+ if (err)
+ return err;
+
+ memmove(aux_data + off, aux_data + off + cnt,
+ sizeof(*aux_data) * (orig_prog_len - off - cnt));
+
+ return 0;
+}
+
/* The verifier does more data flow analysis than llvm and will not
* explore branches that are dead at run time. Malicious programs can
* have dead code too. Therefore replace all dead at-run-time code
@@ -6460,6 +7039,91 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
}
}
+static bool insn_is_cond_jump(u8 code)
+{
+ u8 op;
+
+ if (BPF_CLASS(code) == BPF_JMP32)
+ return true;
+
+ if (BPF_CLASS(code) != BPF_JMP)
+ return false;
+
+ op = BPF_OP(code);
+ return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
+}
+
+static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (!insn_is_cond_jump(insn->code))
+ continue;
+
+ if (!aux_data[i + 1].seen)
+ ja.off = insn->off;
+ else if (!aux_data[i + 1 + insn->off].seen)
+ ja.off = 0;
+ else
+ continue;
+
+ if (bpf_prog_is_dev_bound(env->prog->aux))
+ bpf_prog_offload_replace_insn(env, i, &ja);
+
+ memcpy(insn, &ja, sizeof(ja));
+ }
+}
+
+static int opt_remove_dead_code(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ int insn_cnt = env->prog->len;
+ int i, err;
+
+ for (i = 0; i < insn_cnt; i++) {
+ int j;
+
+ j = 0;
+ while (i + j < insn_cnt && !aux_data[i + j].seen)
+ j++;
+ if (!j)
+ continue;
+
+ err = verifier_remove_insns(env, i, j);
+ if (err)
+ return err;
+ insn_cnt = env->prog->len;
+ }
+
+ return 0;
+}
+
+static int opt_remove_nops(struct bpf_verifier_env *env)
+{
+ const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i, err;
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (memcmp(&insn[i], &ja, sizeof(ja)))
+ continue;
+
+ err = verifier_remove_insns(env, i, 1);
+ if (err)
+ return err;
+ insn_cnt--;
+ i--;
+ }
+
+ return 0;
+}
+
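/*
 * Illustration of the three passes above.  Given a conditional jump such as
 *
 *	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 2)
 *
 * opt_hard_wire_dead_code_branches() rewrites it to
 * BPF_JMP_IMM(BPF_JA, 0, 0, 2) if the fall-through insn was never marked seen
 * (the branch is always taken), or to BPF_JMP_IMM(BPF_JA, 0, 0, 0) if the
 * branch target was never seen (the branch always falls through).
 * opt_remove_dead_code() then deletes the unreachable run itself, and
 * opt_remove_nops() strips the zero-offset jumps left behind.
 */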
/* convert load instructions that access fields of a context type into a
* sequence of instructions that access fields of the underlying structure:
* struct __sk_buff -> struct sk_buff
@@ -6552,8 +7216,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
convert_ctx_access = ops->convert_ctx_access;
break;
case PTR_TO_SOCKET:
+ case PTR_TO_SOCK_COMMON:
convert_ctx_access = bpf_sock_convert_ctx_access;
break;
+ case PTR_TO_TCP_SOCK:
+ convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
+ break;
default:
continue;
}
@@ -6681,7 +7349,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
subprog_end = env->subprog_info[i + 1].start;
len = subprog_end - subprog_start;
- func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
+ /* BPF_PROG_RUN doesn't call subprogs directly,
+ * hence main prog stats include the runtime of subprogs.
+ * subprogs don't have IDs and are not reachable via prog_get_next_id,
+ * so func[i]->aux->stats will never be accessed and stays NULL.
+ */
+ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
if (!func[i])
goto out_free;
memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
@@ -7151,7 +7824,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
{
struct bpf_verifier_env *env;
struct bpf_verifier_log *log;
- int ret = -EINVAL;
+ int i, len, ret = -EINVAL;
+ bool is_priv;
/* no program is valid */
if (ARRAY_SIZE(bpf_verifier_ops) == 0)
@@ -7165,12 +7839,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
return -ENOMEM;
log = &env->log;
+ len = (*prog)->len;
env->insn_aux_data =
- vzalloc(array_size(sizeof(struct bpf_insn_aux_data),
- (*prog)->len));
+ vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));
ret = -ENOMEM;
if (!env->insn_aux_data)
goto err_free_env;
+ for (i = 0; i < len; i++)
+ env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
@@ -7198,6 +7874,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
env->strict_alignment = false;
+ is_priv = capable(CAP_SYS_ADMIN);
+ env->allow_ptr_leaks = is_priv;
+
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
@@ -7215,8 +7894,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (!env->explored_states)
goto skip_full_check;
- env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
-
ret = check_subprogs(env);
if (ret < 0)
goto skip_full_check;
@@ -7246,8 +7923,17 @@ skip_full_check:
ret = check_max_stack_depth(env);
/* instruction rewrites happen after this point */
- if (ret == 0)
- sanitize_dead_code(env);
+ if (is_priv) {
+ if (ret == 0)
+ opt_hard_wire_dead_code_branches(env);
+ if (ret == 0)
+ ret = opt_remove_dead_code(env);
+ if (ret == 0)
+ ret = opt_remove_nops(env);
+ } else {
+ if (ret == 0)
+ sanitize_dead_code(env);
+ }
if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
diff --git a/kernel/capability.c b/kernel/capability.c
index 1e1c0236f55b..1444f3954d75 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -93,9 +93,7 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
break;
case _LINUX_CAPABILITY_VERSION_2:
warn_deprecated_v2();
- /*
- * fall through - v3 is otherwise equivalent to v2.
- */
+ /* fall through - v3 is otherwise equivalent to v2. */
case _LINUX_CAPABILITY_VERSION_3:
*tocopy = _LINUX_CAPABILITY_U32S_3;
break;
@@ -299,7 +297,7 @@ bool has_ns_capability(struct task_struct *t,
int ret;
rcu_read_lock();
- ret = security_capable(__task_cred(t), ns, cap);
+ ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE);
rcu_read_unlock();
return (ret == 0);
@@ -340,7 +338,7 @@ bool has_ns_capability_noaudit(struct task_struct *t,
int ret;
rcu_read_lock();
- ret = security_capable_noaudit(__task_cred(t), ns, cap);
+ ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT);
rcu_read_unlock();
return (ret == 0);
@@ -363,7 +361,9 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
-static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
+static bool ns_capable_common(struct user_namespace *ns,
+ int cap,
+ unsigned int opts)
{
int capable;
@@ -372,8 +372,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
BUG();
}
- capable = audit ? security_capable(current_cred(), ns, cap) :
- security_capable_noaudit(current_cred(), ns, cap);
+ capable = security_capable(current_cred(), ns, cap, opts);
if (capable == 0) {
current->flags |= PF_SUPERPRIV;
return true;
@@ -394,7 +393,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
*/
bool ns_capable(struct user_namespace *ns, int cap)
{
- return ns_capable_common(ns, cap, true);
+ return ns_capable_common(ns, cap, CAP_OPT_NONE);
}
EXPORT_SYMBOL(ns_capable);
@@ -412,11 +411,30 @@ EXPORT_SYMBOL(ns_capable);
*/
bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
- return ns_capable_common(ns, cap, false);
+ return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT);
}
EXPORT_SYMBOL(ns_capable_noaudit);
/**
+ * ns_capable_setid - Determine if the current task has a superior capability
+ * in effect, while signalling that this check is being done from within a
+ * setid syscall.
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable_setid(struct user_namespace *ns, int cap)
+{
+ return ns_capable_common(ns, cap, CAP_OPT_INSETID);
+}
+EXPORT_SYMBOL(ns_capable_setid);
+
+/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for
*
@@ -448,10 +466,11 @@ EXPORT_SYMBOL(capable);
bool file_ns_capable(const struct file *file, struct user_namespace *ns,
int cap)
{
+
if (WARN_ON_ONCE(!cap_valid(cap)))
return false;
- if (security_capable(file->f_cred, ns, cap) == 0)
+ if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0)
return true;
return false;
@@ -500,10 +519,12 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
{
int ret = 0; /* An absent tracer adds no restrictions */
const struct cred *cred;
+
rcu_read_lock();
cred = rcu_dereference(tsk->ptracer_cred);
if (cred)
- ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE);
+ ret = security_capable(cred, ns, CAP_SYS_PTRACE,
+ CAP_OPT_NOAUDIT);
rcu_read_unlock();
return (ret == 0);
}
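/*
 * A hypothetical caller (the surrounding code is an assumption, not from this
 * patch): the old security_capable()/security_capable_noaudit() pair collapses
 * into one hook taking a CAP_OPT_* bitmask, and setid paths can now tag their
 * check:
 *
 *	if (!ns_capable_setid(current_user_ns(), CAP_SETUID))
 *		return -EPERM;
 *
 * which reaches security_capable(current_cred(), ns, CAP_SETUID,
 * CAP_OPT_INSETID), letting an LSM tell a setuid(2)-style check apart from an
 * ordinary capable() call.
 */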
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index c950864016e2..30e39f3932ad 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -7,6 +7,7 @@
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/refcount.h>
+#include <linux/fs_context.h>
#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
@@ -37,6 +38,31 @@ extern void __init enable_debug_cgroup(void);
} while (0)
/*
+ * The cgroup filesystem superblock creation/mount context.
+ */
+struct cgroup_fs_context {
+ struct kernfs_fs_context kfc;
+ struct cgroup_root *root;
+ struct cgroup_namespace *ns;
+ unsigned int flags; /* CGRP_ROOT_* flags */
+
+ /* cgroup1 bits */
+ bool cpuset_clone_children;
+ bool none; /* User explicitly requested empty subsystem */
+ bool all_ss; /* Seen 'all' option */
+ u16 subsys_mask; /* Selected subsystems */
+ char *name; /* Hierarchy name */
+ char *release_agent; /* Path for release notifications */
+};
+
+static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
+{
+ struct kernfs_fs_context *kfc = fc->fs_private;
+
+ return container_of(kfc, struct cgroup_fs_context, kfc);
+}
+
+/*
* A cgroup can be associated with multiple css_sets as different tasks may
* belong to different cgroups on different hierarchies. In the other
* direction, a css_set is naturally associated with multiple cgroups.
@@ -117,16 +143,6 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-struct cgroup_sb_opts {
- u16 subsys_mask;
- unsigned int flags;
- char *release_agent;
- bool cpuset_clone_children;
- char *name;
- /* User explicitly requested empty subsystem */
- bool none;
-};
-
extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
@@ -197,12 +213,10 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
void cgroup_free_root(struct cgroup_root *root);
-void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
+void init_cgroup_root(struct cgroup_fs_context *ctx);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
-struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
- struct cgroup_root *root, unsigned long magic,
- struct cgroup_namespace *ns);
+int cgroup_do_get_tree(struct fs_context *fc);
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
@@ -246,14 +260,15 @@ extern const struct proc_ns_operations cgroupns_operations;
*/
extern struct cftype cgroup1_base_files[];
extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
+extern const struct fs_parameter_description cgroup1_fs_parameters;
int proc_cgroupstats_show(struct seq_file *m, void *v);
bool cgroup1_ssid_disabled(int ssid);
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
void cgroup1_release_agent(struct work_struct *work);
void cgroup1_check_for_release(struct cgroup *cgrp);
-struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
- void *data, unsigned long magic,
- struct cgroup_namespace *ns);
+int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param);
+int cgroup1_get_tree(struct fs_context *fc);
+int cgroup1_reconfigure(struct fs_context *ctx);
#endif /* __CGROUP_INTERNAL_H */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 583b969b0c0e..c126b34fd4ff 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -13,9 +13,12 @@
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
#include <linux/cgroupstats.h>
+#include <linux/fs_parser.h>
#include <trace/events/cgroup.h>
+#define cg_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__)
+
/*
* pidlists linger the following amount before being destroyed. The goal
* is avoiding frequent destruction in the middle of consecutive read calls
@@ -906,172 +909,195 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
return 0;
}
-static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
-{
- char *token, *o = data;
- bool all_ss = false, one_ss = false;
- u16 mask = U16_MAX;
- struct cgroup_subsys *ss;
- int nr_opts = 0;
- int i;
-
-#ifdef CONFIG_CPUSETS
- mask = ~((u16)1 << cpuset_cgrp_id);
-#endif
-
- memset(opts, 0, sizeof(*opts));
+enum cgroup1_param {
+ Opt_all,
+ Opt_clone_children,
+ Opt_cpuset_v2_mode,
+ Opt_name,
+ Opt_none,
+ Opt_noprefix,
+ Opt_release_agent,
+ Opt_xattr,
+};
- while ((token = strsep(&o, ",")) != NULL) {
- nr_opts++;
+static const struct fs_parameter_spec cgroup1_param_specs[] = {
+ fsparam_flag ("all", Opt_all),
+ fsparam_flag ("clone_children", Opt_clone_children),
+ fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
+ fsparam_string("name", Opt_name),
+ fsparam_flag ("none", Opt_none),
+ fsparam_flag ("noprefix", Opt_noprefix),
+ fsparam_string("release_agent", Opt_release_agent),
+ fsparam_flag ("xattr", Opt_xattr),
+ {}
+};
- if (!*token)
- return -EINVAL;
- if (!strcmp(token, "none")) {
- /* Explicitly have no subsystems */
- opts->none = true;
- continue;
- }
- if (!strcmp(token, "all")) {
- /* Mutually exclusive option 'all' + subsystem name */
- if (one_ss)
- return -EINVAL;
- all_ss = true;
- continue;
- }
- if (!strcmp(token, "noprefix")) {
- opts->flags |= CGRP_ROOT_NOPREFIX;
- continue;
- }
- if (!strcmp(token, "clone_children")) {
- opts->cpuset_clone_children = true;
- continue;
- }
- if (!strcmp(token, "cpuset_v2_mode")) {
- opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
- continue;
- }
- if (!strcmp(token, "xattr")) {
- opts->flags |= CGRP_ROOT_XATTR;
- continue;
- }
- if (!strncmp(token, "release_agent=", 14)) {
- /* Specifying two release agents is forbidden */
- if (opts->release_agent)
- return -EINVAL;
- opts->release_agent =
- kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
- if (!opts->release_agent)
- return -ENOMEM;
- continue;
- }
- if (!strncmp(token, "name=", 5)) {
- const char *name = token + 5;
-
- /* blocked by boot param? */
- if (cgroup_no_v1_named)
- return -ENOENT;
- /* Can't specify an empty name */
- if (!strlen(name))
- return -EINVAL;
- /* Must match [\w.-]+ */
- for (i = 0; i < strlen(name); i++) {
- char c = name[i];
- if (isalnum(c))
- continue;
- if ((c == '.') || (c == '-') || (c == '_'))
- continue;
- return -EINVAL;
- }
- /* Specifying two names is forbidden */
- if (opts->name)
- return -EINVAL;
- opts->name = kstrndup(name,
- MAX_CGROUP_ROOT_NAMELEN - 1,
- GFP_KERNEL);
- if (!opts->name)
- return -ENOMEM;
+const struct fs_parameter_description cgroup1_fs_parameters = {
+ .name = "cgroup1",
+ .specs = cgroup1_param_specs,
+};
- continue;
+int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct cgroup_subsys *ss;
+ struct fs_parse_result result;
+ int opt, i;
+
+ opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result);
+ if (opt == -ENOPARAM) {
+ if (strcmp(param->key, "source") == 0) {
+ fc->source = param->string;
+ param->string = NULL;
+ return 0;
}
-
for_each_subsys(ss, i) {
- if (strcmp(token, ss->legacy_name))
+ if (strcmp(param->key, ss->legacy_name))
continue;
- if (!cgroup_ssid_enabled(i))
+ ctx->subsys_mask |= (1 << i);
+ return 0;
+ }
+ return cg_invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key);
+ }
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_none:
+ /* Explicitly have no subsystems */
+ ctx->none = true;
+ break;
+ case Opt_all:
+ ctx->all_ss = true;
+ break;
+ case Opt_noprefix:
+ ctx->flags |= CGRP_ROOT_NOPREFIX;
+ break;
+ case Opt_clone_children:
+ ctx->cpuset_clone_children = true;
+ break;
+ case Opt_cpuset_v2_mode:
+ ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+ break;
+ case Opt_xattr:
+ ctx->flags |= CGRP_ROOT_XATTR;
+ break;
+ case Opt_release_agent:
+ /* Specifying two release agents is forbidden */
+ if (ctx->release_agent)
+ return cg_invalf(fc, "cgroup1: release_agent respecified");
+ ctx->release_agent = param->string;
+ param->string = NULL;
+ break;
+ case Opt_name:
+ /* blocked by boot param? */
+ if (cgroup_no_v1_named)
+ return -ENOENT;
+ /* Can't specify an empty name */
+ if (!param->size)
+ return cg_invalf(fc, "cgroup1: Empty name");
+ if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
+ return cg_invalf(fc, "cgroup1: Name too long");
+ /* Must match [\w.-]+ */
+ for (i = 0; i < param->size; i++) {
+ char c = param->string[i];
+ if (isalnum(c))
continue;
- if (cgroup1_ssid_disabled(i))
+ if ((c == '.') || (c == '-') || (c == '_'))
continue;
-
- /* Mutually exclusive option 'all' + subsystem name */
- if (all_ss)
- return -EINVAL;
- opts->subsys_mask |= (1 << i);
- one_ss = true;
-
- break;
+ return cg_invalf(fc, "cgroup1: Invalid name");
}
- if (i == CGROUP_SUBSYS_COUNT)
- return -ENOENT;
+ /* Specifying two names is forbidden */
+ if (ctx->name)
+ return cg_invalf(fc, "cgroup1: name respecified");
+ ctx->name = param->string;
+ param->string = NULL;
+ break;
}
+ return 0;
+}
+
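/*
 * Userspace illustration, not part of the patch (mount points and the
 * release-agent path are made up): option strings the parser above accepts.
 * With legacy mount(2) the data string is split on commas and each option is
 * fed to cgroup1_parse_param() individually; the cross-option rules ("all" vs.
 * explicit subsystems, name/none/subsys combinations) are applied later by
 * check_cgroupfs_options() below.
 */
#include <sys/mount.h>

int mount_cgroup1_examples(void)
{
	/* named, empty hierarchy: "none" plus "name=", no controllers */
	if (mount("cgroup", "/sys/fs/cgroup/mygrp", "cgroup", 0,
		  "none,name=mygrp,release_agent=/sbin/cgroup-agent"))
		return -1;

	/* explicit controller list; "all,cpu" would be rejected instead */
	return mount("cgroup", "/sys/fs/cgroup/cpu,cpuacct", "cgroup", 0,
		     "cpu,cpuacct");
}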
+static int check_cgroupfs_options(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ u16 mask = U16_MAX;
+ u16 enabled = 0;
+ struct cgroup_subsys *ss;
+ int i;
+
+#ifdef CONFIG_CPUSETS
+ mask = ~((u16)1 << cpuset_cgrp_id);
+#endif
+ for_each_subsys(ss, i)
+ if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
+ enabled |= 1 << i;
+
+ ctx->subsys_mask &= enabled;
/*
- * If the 'all' option was specified select all the subsystems,
- * otherwise if 'none', 'name=' and a subsystem name options were
- * not specified, let's default to 'all'
+ * In the absence of 'none', 'name=' or subsystem name options,
+ * let's default to 'all'.
*/
- if (all_ss || (!one_ss && !opts->none && !opts->name))
- for_each_subsys(ss, i)
- if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
- opts->subsys_mask |= (1 << i);
+ if (!ctx->subsys_mask && !ctx->none && !ctx->name)
+ ctx->all_ss = true;
+
+ if (ctx->all_ss) {
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (ctx->subsys_mask)
+ return cg_invalf(fc, "cgroup1: subsys name conflicts with all");
+ /* 'all' => select all the subsystems */
+ ctx->subsys_mask = enabled;
+ }
/*
* We either have to specify by name or by subsystems. (So all
* empty hierarchies must have a name).
*/
- if (!opts->subsys_mask && !opts->name)
- return -EINVAL;
+ if (!ctx->subsys_mask && !ctx->name)
+ return cg_invalf(fc, "cgroup1: Need name or subsystem set");
/*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
*/
- if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
- return -EINVAL;
+ if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
+ return cg_invalf(fc, "cgroup1: noprefix used incorrectly");
/* Can't specify "none" and some subsystems */
- if (opts->subsys_mask && opts->none)
- return -EINVAL;
+ if (ctx->subsys_mask && ctx->none)
+ return cg_invalf(fc, "cgroup1: none used incorrectly");
return 0;
}
-static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
+int cgroup1_reconfigure(struct fs_context *fc)
{
- int ret = 0;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
- struct cgroup_sb_opts opts;
+ int ret = 0;
u16 added_mask, removed_mask;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* See what subsystems are wanted */
- ret = parse_cgroupfs_options(data, &opts);
+ ret = check_cgroupfs_options(fc);
if (ret)
goto out_unlock;
- if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+ if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)
pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
task_tgid_nr(current), current->comm);
- added_mask = opts.subsys_mask & ~root->subsys_mask;
- removed_mask = root->subsys_mask & ~opts.subsys_mask;
+ added_mask = ctx->subsys_mask & ~root->subsys_mask;
+ removed_mask = root->subsys_mask & ~ctx->subsys_mask;
/* Don't allow flags or name to change at remount */
- if ((opts.flags ^ root->flags) ||
- (opts.name && strcmp(opts.name, root->name))) {
- pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
- opts.flags, opts.name ?: "", root->flags, root->name);
+ if ((ctx->flags ^ root->flags) ||
+ (ctx->name && strcmp(ctx->name, root->name))) {
+ cg_invalf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
+ ctx->flags, ctx->name ?: "", root->flags, root->name);
ret = -EINVAL;
goto out_unlock;
}
@@ -1088,17 +1114,15 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
- if (opts.release_agent) {
+ if (ctx->release_agent) {
spin_lock(&release_agent_path_lock);
- strcpy(root->release_agent_path, opts.release_agent);
+ strcpy(root->release_agent_path, ctx->release_agent);
spin_unlock(&release_agent_path_lock);
}
trace_cgroup_remount(root);
out_unlock:
- kfree(opts.release_agent);
- kfree(opts.name);
mutex_unlock(&cgroup_mutex);
return ret;
}
@@ -1106,30 +1130,30 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
.rename = cgroup1_rename,
.show_options = cgroup1_show_options,
- .remount_fs = cgroup1_remount,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.show_path = cgroup_show_path,
};
-struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
- void *data, unsigned long magic,
- struct cgroup_namespace *ns)
+/*
+ * The guts of cgroup1 mount - find or create cgroup_root to use.
+ * Called with cgroup_mutex held; returns 0 on success, -E... on
+ * error and a positive value when the candidate is busy dying.
+ * On success it stashes a reference to cgroup_root into the given
+ * cgroup_fs_context; that reference does *NOT* count towards the
+ * cgroup_root refcount.
+ */
+static int cgroup1_root_to_use(struct fs_context *fc)
{
- struct super_block *pinned_sb = NULL;
- struct cgroup_sb_opts opts;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
struct cgroup_root *root;
struct cgroup_subsys *ss;
- struct dentry *dentry;
int i, ret;
- bool new_root = false;
-
- cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* First find the desired set of subsystems */
- ret = parse_cgroupfs_options(data, &opts);
+ ret = check_cgroupfs_options(fc);
if (ret)
- goto out_unlock;
+ return ret;
/*
* Destruction of cgroup root is asynchronous, so subsystems may
@@ -1139,16 +1163,12 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
* starting. Testing ref liveliness is good enough.
*/
for_each_subsys(ss, i) {
- if (!(opts.subsys_mask & (1 << i)) ||
+ if (!(ctx->subsys_mask & (1 << i)) ||
ss->root == &cgrp_dfl_root)
continue;
- if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
+ return 1; /* restart */
cgroup_put(&ss->root->cgrp);
}
@@ -1163,8 +1183,8 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
 * name matches but subsys_mask doesn't, we should fail.
* Remember whether name matched.
*/
- if (opts.name) {
- if (strcmp(opts.name, root->name))
+ if (ctx->name) {
+ if (strcmp(ctx->name, root->name))
continue;
name_match = true;
}
@@ -1173,42 +1193,18 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
* If we asked for subsystems (or explicitly for no
* subsystems) then they must match.
*/
- if ((opts.subsys_mask || opts.none) &&
- (opts.subsys_mask != root->subsys_mask)) {
+ if ((ctx->subsys_mask || ctx->none) &&
+ (ctx->subsys_mask != root->subsys_mask)) {
if (!name_match)
continue;
- ret = -EBUSY;
- goto out_unlock;
+ return -EBUSY;
}
- if (root->flags ^ opts.flags)
+ if (root->flags ^ ctx->flags)
pr_warn("new mount options do not match the existing superblock, will be ignored\n");
- /*
- * We want to reuse @root whose lifetime is governed by its
- * ->cgrp. Let's check whether @root is alive and keep it
- * that way. As cgroup_kill_sb() can happen anytime, we
- * want to block it by pinning the sb so that @root doesn't
- * get killed before mount is complete.
- *
- * With the sb pinned, tryget_live can reliably indicate
- * whether @root can be reused. If it's being killed,
- * drain it. We can use wait_queue for the wait but this
- * path is super cold. Let's just sleep a bit and retry.
- */
- pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
- if (IS_ERR(pinned_sb) ||
- !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- if (!IS_ERR_OR_NULL(pinned_sb))
- deactivate_super(pinned_sb);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
-
- ret = 0;
- goto out_unlock;
+ ctx->root = root;
+ return 0;
}
/*
@@ -1216,62 +1212,58 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
* specification is allowed for already existing hierarchies but we
* can't create new one without subsys specification.
*/
- if (!opts.subsys_mask && !opts.none) {
- ret = -EINVAL;
- goto out_unlock;
- }
+ if (!ctx->subsys_mask && !ctx->none)
+ return cg_invalf(fc, "cgroup1: No subsys list or none specified");
/* Hierarchies may only be created in the initial cgroup namespace. */
- if (ns != &init_cgroup_ns) {
- ret = -EPERM;
- goto out_unlock;
- }
+ if (ctx->ns != &init_cgroup_ns)
+ return -EPERM;
root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root) {
- ret = -ENOMEM;
- goto out_unlock;
- }
- new_root = true;
+ if (!root)
+ return -ENOMEM;
- init_cgroup_root(root, &opts);
+ ctx->root = root;
+ init_cgroup_root(ctx);
- ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
+ ret = cgroup_setup_root(root, ctx->subsys_mask);
if (ret)
cgroup_free_root(root);
+ return ret;
+}
-out_unlock:
- mutex_unlock(&cgroup_mutex);
-out_free:
- kfree(opts.release_agent);
- kfree(opts.name);
+int cgroup1_get_tree(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ int ret;
- if (ret)
- return ERR_PTR(ret);
+ /* Check if the caller has permission to mount. */
+ if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
- dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
- CGROUP_SUPER_MAGIC, ns);
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
- /*
- * There's a race window after we release cgroup_mutex and before
- * allocating a superblock. Make sure a concurrent process won't
- * be able to re-use the root during this window by delaying the
- * initialization of root refcnt.
- */
- if (new_root) {
- mutex_lock(&cgroup_mutex);
- percpu_ref_reinit(&root->cgrp.self.refcnt);
- mutex_unlock(&cgroup_mutex);
- }
+ ret = cgroup1_root_to_use(fc);
+ if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
+ ret = 1; /* restart */
- /*
- * If @pinned_sb, we're reusing an existing root and holding an
- * extra ref on its sb. Mount is complete. Put the extra ref.
- */
- if (pinned_sb)
- deactivate_super(pinned_sb);
+ mutex_unlock(&cgroup_mutex);
- return dentry;
+ if (!ret)
+ ret = cgroup_do_get_tree(fc);
+
+ if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
+ struct super_block *sb = fc->root->d_sb;
+ dput(fc->root);
+ deactivate_locked_super(sb);
+ ret = 1;
+ }
+
+ if (unlikely(ret > 0)) {
+ msleep(10);
+ return restart_syscall();
+ }
+ return ret;
}
static int __init cgroup1_wq_init(void)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index f31bd61c9466..3f2b4bde0f9c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -54,6 +54,7 @@
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
+#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
#include <linux/psi.h>
#include <net/sock.h>
@@ -197,7 +198,7 @@ static u64 css_serial_nr_next = 1;
*/
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
-static u16 have_free_callback __read_mostly;
+static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
@@ -1772,26 +1773,37 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
return len;
}
-static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
-{
- char *token;
+enum cgroup2_param {
+ Opt_nsdelegate,
+ nr__cgroup2_params
+};
- *root_flags = 0;
+static const struct fs_parameter_spec cgroup2_param_specs[] = {
+ fsparam_flag ("nsdelegate", Opt_nsdelegate),
+ {}
+};
- if (!data || *data == '\0')
- return 0;
+static const struct fs_parameter_description cgroup2_fs_parameters = {
+ .name = "cgroup2",
+ .specs = cgroup2_param_specs,
+};
- while ((token = strsep(&data, ",")) != NULL) {
- if (!strcmp(token, "nsdelegate")) {
- *root_flags |= CGRP_ROOT_NS_DELEGATE;
- continue;
- }
+static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct fs_parse_result result;
+ int opt;
- pr_err("cgroup2: unknown option \"%s\"\n", token);
- return -EINVAL;
- }
+ opt = fs_parse(fc, &cgroup2_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
- return 0;
+ switch (opt) {
+ case Opt_nsdelegate:
+ ctx->flags |= CGRP_ROOT_NS_DELEGATE;
+ return 0;
+ }
+ return -EINVAL;
}
static void apply_cgroup_root_flags(unsigned int root_flags)
@@ -1811,16 +1823,11 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
return 0;
}
-static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
+static int cgroup_reconfigure(struct fs_context *fc)
{
- unsigned int root_flags;
- int ret;
-
- ret = parse_cgroup_root_flags(data, &root_flags);
- if (ret)
- return ret;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
- apply_cgroup_root_flags(root_flags);
+ apply_cgroup_root_flags(ctx->flags);
return 0;
}
@@ -1908,8 +1915,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
-void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
+void init_cgroup_root(struct cgroup_fs_context *ctx)
{
+ struct cgroup_root *root = ctx->root;
struct cgroup *cgrp = &root->cgrp;
INIT_LIST_HEAD(&root->root_list);
@@ -1918,16 +1926,16 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
init_cgroup_housekeeping(cgrp);
idr_init(&root->cgroup_idr);
- root->flags = opts->flags;
- if (opts->release_agent)
- strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
- if (opts->name)
- strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
- if (opts->cpuset_clone_children)
+ root->flags = ctx->flags;
+ if (ctx->release_agent)
+ strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
+ if (ctx->name)
+ strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
+ if (ctx->cpuset_clone_children)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -1944,7 +1952,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
root_cgrp->ancestor_ids[0] = ret;
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
- ref_flags, GFP_KERNEL);
+ 0, GFP_KERNEL);
if (ret)
goto out;
@@ -2028,57 +2036,104 @@ out:
return ret;
}
-struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
- struct cgroup_root *root, unsigned long magic,
- struct cgroup_namespace *ns)
+int cgroup_do_get_tree(struct fs_context *fc)
{
- struct dentry *dentry;
- bool new_sb;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ int ret;
- dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
+ ctx->kfc.root = ctx->root->kf_root;
+ if (fc->fs_type == &cgroup2_fs_type)
+ ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
+ else
+ ctx->kfc.magic = CGROUP_SUPER_MAGIC;
+ ret = kernfs_get_tree(fc);
/*
* In non-init cgroup namespace, instead of root cgroup's dentry,
* we return the dentry corresponding to the cgroupns->root_cgrp.
*/
- if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
+ if (!ret && ctx->ns != &init_cgroup_ns) {
struct dentry *nsdentry;
+ struct super_block *sb = fc->root->d_sb;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- cgrp = cset_cgroup_from_root(ns->root_cset, root);
+ cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
- nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
- dput(dentry);
- dentry = nsdentry;
+ nsdentry = kernfs_node_dentry(cgrp->kn, sb);
+ dput(fc->root);
+ fc->root = nsdentry;
+ if (IS_ERR(nsdentry)) {
+ ret = PTR_ERR(nsdentry);
+ deactivate_locked_super(sb);
+ }
}
- if (IS_ERR(dentry) || !new_sb)
- cgroup_put(&root->cgrp);
+ if (!ctx->kfc.new_sb_created)
+ cgroup_put(&ctx->root->cgrp);
- return dentry;
+ return ret;
}
-static struct dentry *cgroup_mount(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name,
- void *data)
+/*
+ * Destroy a cgroup filesystem context.
+ */
+static void cgroup_fs_context_free(struct fs_context *fc)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
- struct dentry *dentry;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+
+ kfree(ctx->name);
+ kfree(ctx->release_agent);
+ put_cgroup_ns(ctx->ns);
+ kernfs_free_fs_context(fc);
+ kfree(ctx);
+}
+
+static int cgroup_get_tree(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
- get_cgroup_ns(ns);
+ cgrp_dfl_visible = true;
+ cgroup_get_live(&cgrp_dfl_root.cgrp);
+ ctx->root = &cgrp_dfl_root;
- /* Check if the caller has permission to mount. */
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
- put_cgroup_ns(ns);
- return ERR_PTR(-EPERM);
- }
+ ret = cgroup_do_get_tree(fc);
+ if (!ret)
+ apply_cgroup_root_flags(ctx->flags);
+ return ret;
+}
+
+static const struct fs_context_operations cgroup_fs_context_ops = {
+ .free = cgroup_fs_context_free,
+ .parse_param = cgroup2_parse_param,
+ .get_tree = cgroup_get_tree,
+ .reconfigure = cgroup_reconfigure,
+};
+
+static const struct fs_context_operations cgroup1_fs_context_ops = {
+ .free = cgroup_fs_context_free,
+ .parse_param = cgroup1_parse_param,
+ .get_tree = cgroup1_get_tree,
+ .reconfigure = cgroup1_reconfigure,
+};
+
+/*
+ * Initialise the cgroup filesystem creation/reconfiguration context. Notably,
+ * we select the namespace we're going to use.
+ */
+static int cgroup_init_fs_context(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
/*
* The first time anyone tries to mount a cgroup, enable the list
@@ -2087,29 +2142,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
- if (fs_type == &cgroup2_fs_type) {
- unsigned int root_flags;
-
- ret = parse_cgroup_root_flags(data, &root_flags);
- if (ret) {
- put_cgroup_ns(ns);
- return ERR_PTR(ret);
- }
-
- cgrp_dfl_visible = true;
- cgroup_get_live(&cgrp_dfl_root.cgrp);
-
- dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
- CGROUP2_SUPER_MAGIC, ns);
- if (!IS_ERR(dentry))
- apply_cgroup_root_flags(root_flags);
- } else {
- dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
- CGROUP_SUPER_MAGIC, ns);
- }
-
- put_cgroup_ns(ns);
- return dentry;
+ ctx->ns = current->nsproxy->cgroup_ns;
+ get_cgroup_ns(ctx->ns);
+ fc->fs_private = &ctx->kfc;
+ if (fc->fs_type == &cgroup2_fs_type)
+ fc->ops = &cgroup_fs_context_ops;
+ else
+ fc->ops = &cgroup1_fs_context_ops;
+ if (fc->user_ns)
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->ns->user_ns);
+ fc->global = true;
+ return 0;
}
static void cgroup_kill_sb(struct super_block *sb)
@@ -2118,33 +2162,33 @@ static void cgroup_kill_sb(struct super_block *sb)
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
/*
- * If @root doesn't have any mounts or children, start killing it.
+ * If @root doesn't have any children, start killing it.
* This prevents new mounts by disabling percpu_ref_tryget_live().
* cgroup_mount() may wait for @root's release.
*
* And don't kill the default root.
*/
- if (!list_empty(&root->cgrp.self.children) ||
- root == &cgrp_dfl_root)
- cgroup_put(&root->cgrp);
- else
+ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt))
percpu_ref_kill(&root->cgrp.self.refcnt);
-
+ cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
struct file_system_type cgroup_fs_type = {
- .name = "cgroup",
- .mount = cgroup_mount,
- .kill_sb = cgroup_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .name = "cgroup",
+ .init_fs_context = cgroup_init_fs_context,
+ .parameters = &cgroup1_fs_parameters,
+ .kill_sb = cgroup_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
static struct file_system_type cgroup2_fs_type = {
- .name = "cgroup2",
- .mount = cgroup_mount,
- .kill_sb = cgroup_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .name = "cgroup2",
+ .init_fs_context = cgroup_init_fs_context,
+ .parameters = &cgroup2_fs_parameters,
+ .kill_sb = cgroup_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
@@ -3533,6 +3577,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
return ret ?: nbytes;
}
+static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
+{
+ struct cftype *cft = of->kn->priv;
+
+ if (cft->poll)
+ return cft->poll(of, pt);
+
+ return kernfs_generic_poll(of, pt);
+}
+
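The new cgroup_file_poll() hook forwards to a cftype's ->poll() when one is provided and otherwise falls back to kernfs_generic_poll(), so any cgroup file that calls kernfs_notify() becomes pollable. From user space this follows the usual kernfs pattern: wait for POLLPRI, seek back to the start and re-read. A minimal sketch, assuming cgroup2 is mounted at /sys/fs/cgroup and "test" is a made-up child cgroup:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[256];
	struct pollfd pfd;
	ssize_t n;

	pfd.fd = open("/sys/fs/cgroup/test/cgroup.events", O_RDONLY);
	if (pfd.fd < 0)
		return 1;
	pfd.events = POLLPRI;

	for (;;) {
		/* Blocks until kernfs_notify() marks the file as changed. */
		if (poll(&pfd, 1, -1) < 0)
			break;
		lseek(pfd.fd, 0, SEEK_SET);
		n = read(pfd.fd, buf, sizeof(buf) - 1);
		if (n < 0)
			break;
		buf[n] = '\0';
		printf("%s", buf);
	}
	close(pfd.fd);
	return 0;
}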
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
return seq_cft(seq)->seq_start(seq, ppos);
@@ -3571,6 +3625,7 @@ static struct kernfs_ops cgroup_kf_single_ops = {
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
+ .poll = cgroup_file_poll,
.seq_show = cgroup_seqfile_show,
};
@@ -3579,6 +3634,7 @@ static struct kernfs_ops cgroup_kf_ops = {
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
+ .poll = cgroup_file_poll,
.seq_start = cgroup_seqfile_start,
.seq_next = cgroup_seqfile_next,
.seq_stop = cgroup_seqfile_stop,
@@ -5267,7 +5323,6 @@ int cgroup_rmdir(struct kernfs_node *kn)
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
.show_options = cgroup_show_options,
- .remount_fs = cgroup_remount,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.show_path = cgroup_show_path,
@@ -5313,7 +5368,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
have_fork_callback |= (bool)ss->fork << ss->id;
have_exit_callback |= (bool)ss->exit << ss->id;
- have_free_callback |= (bool)ss->free << ss->id;
+ have_release_callback |= (bool)ss->release << ss->id;
have_canfork_callback |= (bool)ss->can_fork << ss->id;
/* At system boot, before all subsystems have been
@@ -5334,11 +5389,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
*/
int __init cgroup_init_early(void)
{
- static struct cgroup_sb_opts __initdata opts;
+ static struct cgroup_fs_context __initdata ctx;
struct cgroup_subsys *ss;
int i;
- init_cgroup_root(&cgrp_dfl_root, &opts);
+ ctx.root = &cgrp_dfl_root;
+ init_cgroup_root(&ctx);
cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
@@ -5399,7 +5455,7 @@ int __init cgroup_init(void)
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
- BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
+ BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
mutex_unlock(&cgroup_mutex);
@@ -5749,16 +5805,19 @@ void cgroup_exit(struct task_struct *tsk)
} while_each_subsys_mask();
}
-void cgroup_free(struct task_struct *task)
+void cgroup_release(struct task_struct *task)
{
- struct css_set *cset = task_css_set(task);
struct cgroup_subsys *ss;
int ssid;
- do_each_subsys_mask(ss, ssid, have_free_callback) {
- ss->free(task);
+ do_each_subsys_mask(ss, ssid, have_release_callback) {
+ ss->release(task);
} while_each_subsys_mask();
+}
+void cgroup_free(struct task_struct *task)
+{
+ struct css_set *cset = task_css_set(task);
put_css_set(cset);
}
@@ -5996,7 +6055,7 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
+ ret = __cgroup_bpf_detach(cgrp, prog, type);
mutex_unlock(&cgroup_mutex);
return ret;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 479743db6c37..4834c4214e9c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -39,6 +39,7 @@
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
+#include <linux/fs_context.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
@@ -203,19 +204,6 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
return css_cs(cs->css.parent);
}
-#ifdef CONFIG_NUMA
-static inline bool task_has_mempolicy(struct task_struct *task)
-{
- return task->mempolicy;
-}
-#else
-static inline bool task_has_mempolicy(struct task_struct *task)
-{
- return false;
-}
-#endif
-
-
/* bits in struct cpuset flags field */
typedef enum {
CS_ONLINE,
@@ -372,25 +360,52 @@ static inline bool is_in_v2_mode(void)
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
*/
-static struct dentry *cpuset_mount(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name, void *data)
-{
- struct file_system_type *cgroup_fs = get_fs_type("cgroup");
- struct dentry *ret = ERR_PTR(-ENODEV);
- if (cgroup_fs) {
- char mountopts[] =
- "cpuset,noprefix,"
- "release_agent=/sbin/cpuset_release_agent";
- ret = cgroup_fs->mount(cgroup_fs, flags,
- unused_dev_name, mountopts);
- put_filesystem(cgroup_fs);
+static int cpuset_get_tree(struct fs_context *fc)
+{
+ struct file_system_type *cgroup_fs;
+ struct fs_context *new_fc;
+ int ret;
+
+ cgroup_fs = get_fs_type("cgroup");
+ if (!cgroup_fs)
+ return -ENODEV;
+
+ new_fc = fs_context_for_mount(cgroup_fs, fc->sb_flags);
+ if (IS_ERR(new_fc)) {
+ ret = PTR_ERR(new_fc);
+ } else {
+ static const char agent_path[] = "/sbin/cpuset_release_agent";
+ ret = vfs_parse_fs_string(new_fc, "cpuset", NULL, 0);
+ if (!ret)
+ ret = vfs_parse_fs_string(new_fc, "noprefix", NULL, 0);
+ if (!ret)
+ ret = vfs_parse_fs_string(new_fc, "release_agent",
+ agent_path, sizeof(agent_path) - 1);
+ if (!ret)
+ ret = vfs_get_tree(new_fc);
+ if (!ret) { /* steal the result */
+ fc->root = new_fc->root;
+ new_fc->root = NULL;
+ }
+ put_fs_context(new_fc);
}
+ put_filesystem(cgroup_fs);
return ret;
}
+static const struct fs_context_operations cpuset_fs_context_ops = {
+ .get_tree = cpuset_get_tree,
+};
+
+static int cpuset_init_fs_context(struct fs_context *fc)
+{
+ fc->ops = &cpuset_fs_context_ops;
+ return 0;
+}
+
static struct file_system_type cpuset_fs_type = {
- .name = "cpuset",
- .mount = cpuset_mount,
+ .name = "cpuset",
+ .init_fs_context = cpuset_init_fs_context,
};
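
The rewritten cpuset_get_tree() is the general recipe for mounting one filesystem type from inside another under fs_context: build a context with fs_context_for_mount(), feed it options through vfs_parse_fs_string(), call vfs_get_tree() and steal the resulting root. A condensed kernel-side sketch of that recipe; the "sometype" target and the single option are purely illustrative:

/* Illustrative sketch: internally mount "sometype" with one option. */
static int example_mount_other(struct fs_context *fc, const char *opt)
{
	struct file_system_type *type = get_fs_type("sometype");
	struct fs_context *new_fc;
	int ret;

	if (!type)
		return -ENODEV;

	new_fc = fs_context_for_mount(type, fc->sb_flags);
	if (IS_ERR(new_fc)) {
		ret = PTR_ERR(new_fc);
	} else {
		ret = vfs_parse_fs_string(new_fc, opt, NULL, 0);
		if (!ret)
			ret = vfs_get_tree(new_fc);
		if (!ret) {		/* steal the result */
			fc->root = new_fc->root;
			new_fc->root = NULL;
		}
		put_fs_context(new_fc);
	}
	put_filesystem(type);
	return ret;
}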
/*
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 9829c67ebc0a..c9960baaa14f 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -247,7 +247,7 @@ static void pids_cancel_fork(struct task_struct *task)
pids_uncharge(pids, 1);
}
-static void pids_free(struct task_struct *task)
+static void pids_release(struct task_struct *task)
{
struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
@@ -342,7 +342,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
.cancel_attach = pids_cancel_attach,
.can_fork = pids_can_fork,
.cancel_fork = pids_cancel_fork,
- .free = pids_free,
+ .release = pids_release,
.legacy_cftypes = pids_files,
.dfl_cftypes = pids_files,
.threaded = true,
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index d3bbb757ee49..1d75ae7f1cb7 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -313,10 +313,8 @@ EXPORT_SYMBOL(rdmacg_try_charge);
* If the IB stack wishes a device to participate in rdma cgroup resource
* tracking, it must invoke this API to register with rdma cgroup before
* any user space application can start using the RDMA resources.
- * Returns 0 on success or EINVAL when table length given is beyond
- * supported size.
*/
-int rdmacg_register_device(struct rdmacg_device *device)
+void rdmacg_register_device(struct rdmacg_device *device)
{
INIT_LIST_HEAD(&device->dev_node);
INIT_LIST_HEAD(&device->rpools);
@@ -324,7 +322,6 @@ int rdmacg_register_device(struct rdmacg_device *device)
mutex_lock(&rdmacg_mutex);
list_add_tail(&device->dev_node, &rdmacg_devices);
mutex_unlock(&rdmacg_mutex);
- return 0;
}
EXPORT_SYMBOL(rdmacg_register_device);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d503d1a9007c..bb95a35e8c2d 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -87,7 +87,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
struct cgroup *root, int cpu)
{
struct cgroup_rstat_cpu *rstatc;
- struct cgroup *parent;
if (pos == root)
return NULL;
@@ -115,8 +114,8 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* However, due to the way we traverse, @pos will be the first
* child in most cases. The only exception is @root.
*/
- parent = cgroup_parent(pos);
- if (parent && rstatc->updated_next) {
+ if (rstatc->updated_next) {
+ struct cgroup *parent = cgroup_parent(pos);
struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
struct cgroup_rstat_cpu *nrstatc;
struct cgroup **nextp;
@@ -140,9 +139,12 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* updated stat.
*/
smp_mb();
+
+ return pos;
}
- return pos;
+ /* only happens for @root */
+ return NULL;
}
/* see cgroup_rstat_flush() */
diff --git a/kernel/compat.c b/kernel/compat.c
index f01affa17e22..d8a36c6ad7c9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -20,7 +20,6 @@
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/security.h>
-#include <linux/timex.h>
#include <linux/export.h>
#include <linux/migrate.h>
#include <linux/posix-timers.h>
@@ -30,69 +29,6 @@
#include <linux/uaccess.h>
-int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp)
-{
- struct compat_timex tx32;
-
- memset(txc, 0, sizeof(struct timex));
- if (copy_from_user(&tx32, utp, sizeof(struct compat_timex)))
- return -EFAULT;
-
- txc->modes = tx32.modes;
- txc->offset = tx32.offset;
- txc->freq = tx32.freq;
- txc->maxerror = tx32.maxerror;
- txc->esterror = tx32.esterror;
- txc->status = tx32.status;
- txc->constant = tx32.constant;
- txc->precision = tx32.precision;
- txc->tolerance = tx32.tolerance;
- txc->time.tv_sec = tx32.time.tv_sec;
- txc->time.tv_usec = tx32.time.tv_usec;
- txc->tick = tx32.tick;
- txc->ppsfreq = tx32.ppsfreq;
- txc->jitter = tx32.jitter;
- txc->shift = tx32.shift;
- txc->stabil = tx32.stabil;
- txc->jitcnt = tx32.jitcnt;
- txc->calcnt = tx32.calcnt;
- txc->errcnt = tx32.errcnt;
- txc->stbcnt = tx32.stbcnt;
-
- return 0;
-}
-
-int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc)
-{
- struct compat_timex tx32;
-
- memset(&tx32, 0, sizeof(struct compat_timex));
- tx32.modes = txc->modes;
- tx32.offset = txc->offset;
- tx32.freq = txc->freq;
- tx32.maxerror = txc->maxerror;
- tx32.esterror = txc->esterror;
- tx32.status = txc->status;
- tx32.constant = txc->constant;
- tx32.precision = txc->precision;
- tx32.tolerance = txc->tolerance;
- tx32.time.tv_sec = txc->time.tv_sec;
- tx32.time.tv_usec = txc->time.tv_usec;
- tx32.tick = txc->tick;
- tx32.ppsfreq = txc->ppsfreq;
- tx32.jitter = txc->jitter;
- tx32.shift = txc->shift;
- tx32.stabil = txc->stabil;
- tx32.jitcnt = txc->jitcnt;
- tx32.calcnt = txc->calcnt;
- tx32.errcnt = txc->errcnt;
- tx32.stbcnt = txc->stbcnt;
- tx32.tai = txc->tai;
- if (copy_to_user(utp, &tx32, sizeof(struct compat_timex)))
- return -EFAULT;
- return 0;
-}
-
static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv)
{
return (!access_ok(ctv, sizeof(*ctv)) ||
diff --git a/kernel/configs.c b/kernel/configs.c
index 2df132b20217..b062425ccf8d 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -30,37 +30,35 @@
#include <linux/init.h>
#include <linux/uaccess.h>
-/**************************************************/
-/* the actual current config file */
-
/*
- * Define kernel_config_data and kernel_config_data_size, which contains the
- * wrapped and compressed configuration file. The file is first compressed
- * with gzip and then bounded by two eight byte magic numbers to allow
- * extraction from a binary kernel image:
- *
- * IKCFG_ST
- * <image>
- * IKCFG_ED
+ * "IKCFG_ST" and "IKCFG_ED" are used to extract the config data from
+ * a binary kernel image or a module. See scripts/extract-ikconfig.
*/
-#define MAGIC_START "IKCFG_ST"
-#define MAGIC_END "IKCFG_ED"
-#include "config_data.h"
-
-
-#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
-#define kernel_config_data_size \
- (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
+asm (
+" .pushsection .rodata, \"a\" \n"
+" .ascii \"IKCFG_ST\" \n"
+" .global kernel_config_data \n"
+"kernel_config_data: \n"
+" .incbin \"kernel/config_data.gz\" \n"
+" .global kernel_config_data_end \n"
+"kernel_config_data_end: \n"
+" .ascii \"IKCFG_ED\" \n"
+" .popsection \n"
+);
#ifdef CONFIG_IKCONFIG_PROC
+extern char kernel_config_data;
+extern char kernel_config_data_end;
+
static ssize_t
ikconfig_read_current(struct file *file, char __user *buf,
size_t len, loff_t * offset)
{
return simple_read_from_buffer(buf, len, offset,
- kernel_config_data + MAGIC_SIZE,
- kernel_config_data_size);
+ &kernel_config_data,
+ &kernel_config_data_end -
+ &kernel_config_data);
}
static const struct file_operations ikconfig_file_ops = {
@@ -79,7 +77,7 @@ static int __init ikconfig_init(void)
if (!entry)
return -ENOMEM;
- proc_set_size(entry, kernel_config_data_size);
+ proc_set_size(entry, &kernel_config_data_end - &kernel_config_data);
return 0;
}
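
With the .incbin approach the gzipped configuration is still bracketed by the IKCFG_ST/IKCFG_ED magic strings, so scripts/extract-ikconfig keeps working, and CONFIG_IKCONFIG_PROC still exposes it as /proc/config.gz. A small user-space sketch that dumps the running configuration through zlib; it assumes CONFIG_IKCONFIG_PROC=y and is linked with -lz:

#include <stdio.h>
#include <zlib.h>

int main(void)
{
	char line[4096];
	gzFile f = gzopen("/proc/config.gz", "rb");

	if (!f) {
		fprintf(stderr, "cannot open /proc/config.gz\n");
		return 1;
	}
	while (gzgets(f, line, sizeof(line)))
		fputs(line, stdout);
	gzclose(f);
	return 0;
}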
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d1c6d152da89..6754f3ecfd94 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -313,6 +313,15 @@ void cpus_write_unlock(void)
void lockdep_assert_cpus_held(void)
{
+ /*
+ * We can't have hotplug operations before userspace starts running,
+ * and some init codepaths will knowingly not take the hotplug lock.
+ * This is all valid, so mute lockdep until it makes sense to report
+ * unheld locks.
+ */
+ if (system_state < SYSTEM_RUNNING)
+ return;
+
percpu_rwsem_assert_held(&cpu_hotplug_lock);
}
@@ -555,6 +564,20 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
}
+static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
+{
+ if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ /*
+ * When CPU hotplug is disabled, taking the CPU down is not possible
+ * because takedown_cpu() and the architecture- and subsystem-specific
+ * mechanisms are not available. So a CPU which would otherwise be
+ * completely unplugged again needs to stay around in its current
+ * state.
+ */
+ return st->state <= CPUHP_BRINGUP_CPU;
+}
+
static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
enum cpuhp_state target)
{
@@ -565,8 +588,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
st->state++;
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
if (ret) {
- st->target = prev_state;
- undo_cpu_up(cpu, st);
+ if (can_rollback_cpu(st)) {
+ st->target = prev_state;
+ undo_cpu_up(cpu, st);
+ }
break;
}
}
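
cpuhp_up_callbacks() is the common bring-up path for both boot-time SMP bring-up and runtime onlining through sysfs; can_rollback_cpu() only changes behaviour when CONFIG_HOTPLUG_CPU=n, where a CPU that failed past CPUHP_BRINGUP_CPU is left in place because no takedown path exists. A minimal sketch of driving the runtime side of that interface; it needs root and uses cpu1 as an assumed example:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Write "0" or "1" to a CPU's online file to offline or online it. */
static int set_cpu_online(int cpu, int online)
{
	char path[64], val = online ? '1' : '0';
	int fd, ret = 0;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/online", cpu);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, &val, 1) != 1)
		ret = -1;
	close(fd);
	return ret;
}

int main(void)
{
	if (set_cpu_online(1, 0) || set_cpu_online(1, 1))
		perror("cpu1 hotplug");
	return 0;
}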
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 933cb3e45b98..093c9f917ed0 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -464,6 +464,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_HUGETLB_PAGE
VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
+#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
+ VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#endif
arch_crash_save_vmcoreinfo();
diff --git a/kernel/cred.c b/kernel/cred.c
index 21f4a97085b4..45d77284aed0 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -760,19 +760,6 @@ bool creds_are_invalid(const struct cred *cred)
{
if (cred->magic != CRED_MAGIC)
return true;
-#ifdef CONFIG_SECURITY_SELINUX
- /*
- * cred->security == NULL if security_cred_alloc_blank() or
- * security_prepare_creds() returned an error.
- */
- if (selinux_is_enabled() && cred->security) {
- if ((unsigned long) cred->security < PAGE_SIZE)
- return true;
- if ((*(u32 *)cred->security & 0xffffff00) ==
- (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
- return true;
- }
-#endif
return false;
}
EXPORT_SYMBOL(creds_are_invalid);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index ca88b867e7fe..a06ba3013b3b 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -16,7 +16,16 @@ config ARCH_DMA_ADDR_T_64BIT
config ARCH_HAS_DMA_COHERENCE_H
bool
-config HAVE_GENERIC_DMA_COHERENT
+config ARCH_HAS_DMA_SET_MASK
+ bool
+
+config DMA_DECLARE_COHERENT
+ bool
+
+config ARCH_HAS_SETUP_DMA_OPS
+ bool
+
+config ARCH_HAS_TEARDOWN_DMA_OPS
bool
config ARCH_HAS_SYNC_DMA_FOR_DEVICE
@@ -53,3 +62,116 @@ config DMA_REMAP
config DMA_DIRECT_REMAP
bool
select DMA_REMAP
+
+config DMA_CMA
+ bool "DMA Contiguous Memory Allocator"
+ depends on HAVE_DMA_CONTIGUOUS && CMA
+ help
+ This enables the Contiguous Memory Allocator, which allows drivers
+ to allocate large physically contiguous blocks of memory for use with
+ hardware components that support neither I/O mapping nor scatter-gather.
+
+ You can disable CMA by specifying "cma=0" on the kernel's command
+ line.
+
+ For more information see <include/linux/dma-contiguous.h>.
+ If unsure, say "n".
+
+if DMA_CMA
+comment "Default contiguous memory area size:"
+
+config CMA_SIZE_MBYTES
+ int "Size in Mega Bytes"
+ depends on !CMA_SIZE_SEL_PERCENTAGE
+ default 0 if X86
+ default 16
+ help
+ Defines the size (in MiB) of the default memory area for the Contiguous
+ Memory Allocator. If a size of 0 is selected, CMA is disabled by
+ default, but it can be enabled by passing cma=size[MG] to the kernel.
+
+
+config CMA_SIZE_PERCENTAGE
+ int "Percentage of total memory"
+ depends on !CMA_SIZE_SEL_MBYTES
+ default 0 if X86
+ default 10
+ help
+ Defines the size of the default memory area for the Contiguous Memory
+ Allocator as a percentage of the total memory in the system.
+ If 0 percent is selected, CMA is disabled by default, but it can be
+ enabled by passing cma=size[MG] to the kernel.
+
+choice
+ prompt "Selected region size"
+ default CMA_SIZE_SEL_MBYTES
+
+config CMA_SIZE_SEL_MBYTES
+ bool "Use mega bytes value only"
+
+config CMA_SIZE_SEL_PERCENTAGE
+ bool "Use percentage value only"
+
+config CMA_SIZE_SEL_MIN
+ bool "Use lower value (minimum)"
+
+config CMA_SIZE_SEL_MAX
+ bool "Use higher value (maximum)"
+
+endchoice
+
+config CMA_ALIGNMENT
+ int "Maximum PAGE_SIZE order of alignment for contiguous buffers"
+ range 4 12
+ default 8
+ help
+ The DMA mapping framework by default aligns all buffers to the smallest
+ PAGE_SIZE order which is greater than or equal to the requested buffer
+ size. This works well for buffers up to a few hundred kilobytes, but
+ for larger buffers it is just a waste of memory. With this parameter
+ you can specify the maximum PAGE_SIZE order for contiguous buffers.
+ Larger buffers will be aligned only to this specified order. The order
+ is expressed as a power of two multiplied by the PAGE_SIZE.
+
+ For example, if your system defaults to 4KiB pages, the order value
+ of 8 means that the buffers will be aligned up to 1MiB only.
+
+ If unsure, leave the default value "8".
+
+endif
+
+config DMA_API_DEBUG
+ bool "Enable debugging of DMA-API usage"
+ select NEED_DMA_MAP_STATE
+ help
+ Enable this option to debug the use of the DMA API by device drivers.
+ With this option you will be able to detect common bugs in device
+ drivers like double-freeing of DMA mappings or freeing mappings that
+ were never allocated.
+
+ This also attempts to catch cases where a page owned by DMA is
+ accessed by the cpu in a way that could cause data corruption. For
+ example, this enables cow_user_page() to check that the source page is
+ not undergoing DMA.
+
+ This option causes a performance degradation. Use only if you want to
+ debug device drivers and dma interactions.
+
+ If unsure, say N.
+
+config DMA_API_DEBUG_SG
+ bool "Debug DMA scatter-gather usage"
+ default y
+ depends on DMA_API_DEBUG
+ help
+ Perform extra checking that callers of dma_map_sg() have respected the
+ appropriate segment length/boundary limits for the given device when
+ preparing DMA scatterlists.
+
+ This is particularly likely to have been overlooked in cases where the
+ dma_map_sg() API is used for general bulk mapping of pages rather than
+ preparing literal scatter-gather descriptors, where there is a risk of
+ unexpected behaviour from DMA API implementations if the scatterlist
+ is technically out-of-spec.
+
+ If unsure, say N.
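
DMA_API_DEBUG checks that every mapping a driver creates is eventually unmapped with a matching size and direction, and that dma_mapping_error() was consulted. A hedged kernel-side sketch of the map/check/unmap pairing it validates; the device pointer and buffer are placeholders:

/* Illustrative sketch: the pairing that dma-debug verifies. */
static int example_dma_map(struct device *dev, void *buf, size_t len)
{
	dma_addr_t handle;

	handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, handle))
		return -ENOMEM;

	/* ... hand "handle" to the hardware and wait for completion ... */

	dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
	return 0;
}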
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 72ff6e46aa86..d237cf3dc181 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -2,7 +2,7 @@
obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o
obj-$(CONFIG_DMA_CMA) += contiguous.o
-obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o
+obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
obj-$(CONFIG_DMA_VIRT_OPS) += virt.o
obj-$(CONFIG_DMA_API_DEBUG) += debug.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 66f0fb7e9a3a..29fd6590dc1e 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -14,7 +14,6 @@ struct dma_coherent_mem {
dma_addr_t device_base;
unsigned long pfn_base;
int size;
- int flags;
unsigned long *bitmap;
spinlock_t spinlock;
bool use_dev_dma_pfn_offset;
@@ -38,12 +37,12 @@ static inline dma_addr_t dma_get_device_base(struct device *dev,
return mem->device_base;
}
-static int dma_init_coherent_memory(
- phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags,
- struct dma_coherent_mem **mem)
+static int dma_init_coherent_memory(phys_addr_t phys_addr,
+ dma_addr_t device_addr, size_t size,
+ struct dma_coherent_mem **mem)
{
struct dma_coherent_mem *dma_mem = NULL;
- void __iomem *mem_base = NULL;
+ void *mem_base = NULL;
int pages = size >> PAGE_SHIFT;
int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
int ret;
@@ -73,7 +72,6 @@ static int dma_init_coherent_memory(
dma_mem->device_base = device_addr;
dma_mem->pfn_base = PFN_DOWN(phys_addr);
dma_mem->size = pages;
- dma_mem->flags = flags;
spin_lock_init(&dma_mem->spinlock);
*mem = dma_mem;
@@ -110,12 +108,12 @@ static int dma_assign_coherent_memory(struct device *dev,
}
int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
- dma_addr_t device_addr, size_t size, int flags)
+ dma_addr_t device_addr, size_t size)
{
struct dma_coherent_mem *mem;
int ret;
- ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem);
+ ret = dma_init_coherent_memory(phys_addr, device_addr, size, &mem);
if (ret)
return ret;
@@ -137,29 +135,6 @@ void dma_release_declared_memory(struct device *dev)
}
EXPORT_SYMBOL(dma_release_declared_memory);
-void *dma_mark_declared_memory_occupied(struct device *dev,
- dma_addr_t device_addr, size_t size)
-{
- struct dma_coherent_mem *mem = dev->dma_mem;
- unsigned long flags;
- int pos, err;
-
- size += device_addr & ~PAGE_MASK;
-
- if (!mem)
- return ERR_PTR(-EINVAL);
-
- spin_lock_irqsave(&mem->spinlock, flags);
- pos = PFN_DOWN(device_addr - dma_get_device_base(dev, mem));
- err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
- spin_unlock_irqrestore(&mem->spinlock, flags);
-
- if (err != 0)
- return ERR_PTR(err);
- return mem->virt_base + (pos << PAGE_SHIFT);
-}
-EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
-
static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
ssize_t size, dma_addr_t *dma_handle)
{
@@ -213,15 +188,7 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
return 0;
*ret = __dma_alloc_from_coherent(mem, size, dma_handle);
- if (*ret)
- return 1;
-
- /*
- * In the case where the allocation can not be satisfied from the
- * per-device area, try to fall back to generic memory if the
- * constraints allow it.
- */
- return mem->flags & DMA_MEMORY_EXCLUSIVE;
+ return 1;
}
void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle)
@@ -350,8 +317,7 @@ static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
if (!mem) {
ret = dma_init_coherent_memory(rmem->base, rmem->base,
- rmem->size,
- DMA_MEMORY_EXCLUSIVE, &mem);
+ rmem->size, &mem);
if (ret) {
pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n",
&rmem->base, (unsigned long)rmem->size / SZ_1M);
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 23cf5361bcf1..a218e43cc382 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -134,17 +134,6 @@ static u32 nr_total_entries;
/* number of preallocated entries requested by kernel cmdline */
static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
-/* debugfs dentry's for the stuff above */
-static struct dentry *dma_debug_dent __read_mostly;
-static struct dentry *global_disable_dent __read_mostly;
-static struct dentry *error_count_dent __read_mostly;
-static struct dentry *show_all_errors_dent __read_mostly;
-static struct dentry *show_num_errors_dent __read_mostly;
-static struct dentry *num_free_entries_dent __read_mostly;
-static struct dentry *min_free_entries_dent __read_mostly;
-static struct dentry *nr_total_entries_dent __read_mostly;
-static struct dentry *filter_dent __read_mostly;
-
/* per-driver filter related state */
#define NAME_MAX_LEN 64
@@ -717,7 +706,7 @@ static struct dma_debug_entry *dma_entry_alloc(void)
#ifdef CONFIG_STACKTRACE
entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
entry->stacktrace.entries = entry->st_entries;
- entry->stacktrace.skip = 2;
+ entry->stacktrace.skip = 1;
save_stack_trace(&entry->stacktrace);
#endif
@@ -840,66 +829,46 @@ static const struct file_operations filter_fops = {
.llseek = default_llseek,
};
-static int dma_debug_fs_init(void)
+static int dump_show(struct seq_file *seq, void *v)
{
- dma_debug_dent = debugfs_create_dir("dma-api", NULL);
- if (!dma_debug_dent) {
- pr_err("can not create debugfs directory\n");
- return -ENOMEM;
- }
+ int idx;
- global_disable_dent = debugfs_create_bool("disabled", 0444,
- dma_debug_dent,
- &global_disable);
- if (!global_disable_dent)
- goto out_err;
-
- error_count_dent = debugfs_create_u32("error_count", 0444,
- dma_debug_dent, &error_count);
- if (!error_count_dent)
- goto out_err;
-
- show_all_errors_dent = debugfs_create_u32("all_errors", 0644,
- dma_debug_dent,
- &show_all_errors);
- if (!show_all_errors_dent)
- goto out_err;
-
- show_num_errors_dent = debugfs_create_u32("num_errors", 0644,
- dma_debug_dent,
- &show_num_errors);
- if (!show_num_errors_dent)
- goto out_err;
-
- num_free_entries_dent = debugfs_create_u32("num_free_entries", 0444,
- dma_debug_dent,
- &num_free_entries);
- if (!num_free_entries_dent)
- goto out_err;
-
- min_free_entries_dent = debugfs_create_u32("min_free_entries", 0444,
- dma_debug_dent,
- &min_free_entries);
- if (!min_free_entries_dent)
- goto out_err;
-
- nr_total_entries_dent = debugfs_create_u32("nr_total_entries", 0444,
- dma_debug_dent,
- &nr_total_entries);
- if (!nr_total_entries_dent)
- goto out_err;
-
- filter_dent = debugfs_create_file("driver_filter", 0644,
- dma_debug_dent, NULL, &filter_fops);
- if (!filter_dent)
- goto out_err;
+ for (idx = 0; idx < HASH_SIZE; idx++) {
+ struct hash_bucket *bucket = &dma_entry_hash[idx];
+ struct dma_debug_entry *entry;
+ unsigned long flags;
+ spin_lock_irqsave(&bucket->lock, flags);
+ list_for_each_entry(entry, &bucket->list, list) {
+ seq_printf(seq,
+ "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx %s %s\n",
+ dev_name(entry->dev),
+ dev_driver_string(entry->dev),
+ type2name[entry->type], idx,
+ phys_addr(entry), entry->pfn,
+ entry->dev_addr, entry->size,
+ dir2name[entry->direction],
+ maperr2str[entry->map_err_type]);
+ }
+ spin_unlock_irqrestore(&bucket->lock, flags);
+ }
return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dump);
-out_err:
- debugfs_remove_recursive(dma_debug_dent);
-
- return -ENOMEM;
+static void dma_debug_fs_init(void)
+{
+ struct dentry *dentry = debugfs_create_dir("dma-api", NULL);
+
+ debugfs_create_bool("disabled", 0444, dentry, &global_disable);
+ debugfs_create_u32("error_count", 0444, dentry, &error_count);
+ debugfs_create_u32("all_errors", 0644, dentry, &show_all_errors);
+ debugfs_create_u32("num_errors", 0644, dentry, &show_num_errors);
+ debugfs_create_u32("num_free_entries", 0444, dentry, &num_free_entries);
+ debugfs_create_u32("min_free_entries", 0444, dentry, &min_free_entries);
+ debugfs_create_u32("nr_total_entries", 0444, dentry, &nr_total_entries);
+ debugfs_create_file("driver_filter", 0644, dentry, NULL, &filter_fops);
+ debugfs_create_file("dump", 0444, dentry, NULL, &dump_fops);
}
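
The rewrite relies on two debugfs conventions: the debugfs_create_*() helpers are written so their results need not be checked (a failure only means the file is missing), and DEFINE_SHOW_ATTRIBUTE() generates the file_operations boilerplate around a single seq_file show routine. A condensed sketch of the same pattern for a hypothetical "example" directory:

/* Illustrative sketch: minimal debugfs wiring in the style used above. */
static u32 example_counter;

static int example_stats_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "counter=%u\n", example_counter);
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(example_stats);

static void example_debugfs_init(void)
{
	struct dentry *dir = debugfs_create_dir("example", NULL);

	debugfs_create_u32("counter", 0444, dir, &example_counter);
	debugfs_create_file("stats", 0444, dir, NULL, &example_stats_fops);
}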
static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry)
@@ -985,12 +954,7 @@ static int dma_debug_init(void)
spin_lock_init(&dma_entry_hash[i].lock);
}
- if (dma_debug_fs_init() != 0) {
- pr_err("error creating debugfs entries - disabling\n");
- global_disable = true;
-
- return 0;
- }
+ dma_debug_fs_init();
nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES);
for (i = 0; i < nr_pages; ++i)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 355d16acee6d..fcdb23e8d2fc 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -132,8 +132,7 @@ again:
goto again;
}
- if (IS_ENABLED(CONFIG_ZONE_DMA) &&
- phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) {
+ if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) {
gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
goto again;
}
@@ -356,6 +355,20 @@ out_unmap:
}
EXPORT_SYMBOL(dma_direct_map_sg);
+dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ dma_addr_t dma_addr = paddr;
+
+ if (unlikely(!dma_direct_possible(dev, dma_addr, size))) {
+ report_addr(dev, dma_addr, size);
+ return DMA_MAPPING_ERROR;
+ }
+
+ return dma_addr;
+}
+EXPORT_SYMBOL(dma_direct_map_resource);
+
/*
* Because 32-bit DMA masks are so common we expect every architecture to be
* able to satisfy them - either by not supporting more physical memory, or by
@@ -380,3 +393,14 @@ int dma_direct_supported(struct device *dev, u64 mask)
*/
return mask >= __phys_to_dma(dev, min_mask);
}
+
+size_t dma_direct_max_mapping_size(struct device *dev)
+{
+ size_t size = SIZE_MAX;
+
+ /* If SWIOTLB is active, use its maximum mapping size */
+ if (is_swiotlb_active())
+ size = swiotlb_max_mapping_size(dev);
+
+ return size;
+}
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index a11006b6d8e8..c000906348c9 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -207,7 +207,6 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
}
EXPORT_SYMBOL(dma_mmap_attrs);
-#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK
static u64 dma_default_get_required_mask(struct device *dev)
{
u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT);
@@ -238,7 +237,6 @@ u64 dma_get_required_mask(struct device *dev)
return dma_default_get_required_mask(dev);
}
EXPORT_SYMBOL_GPL(dma_get_required_mask);
-#endif
#ifndef arch_dma_alloc_attrs
#define arch_dma_alloc_attrs(dev) (true)
@@ -318,18 +316,23 @@ int dma_supported(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_supported);
-#ifndef HAVE_ARCH_DMA_SET_MASK
+#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
+void arch_dma_set_mask(struct device *dev, u64 mask);
+#else
+#define arch_dma_set_mask(dev, mask) do { } while (0)
+#endif
+
int dma_set_mask(struct device *dev, u64 mask)
{
if (!dev->dma_mask || !dma_supported(dev, mask))
return -EIO;
+ arch_dma_set_mask(dev, mask);
dma_check_mask(dev, mask);
*dev->dma_mask = mask;
return 0;
}
EXPORT_SYMBOL(dma_set_mask);
-#endif
#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
int dma_set_coherent_mask(struct device *dev, u64 mask)
@@ -357,3 +360,17 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
ops->cache_sync(dev, vaddr, size, dir);
}
EXPORT_SYMBOL(dma_cache_sync);
+
+size_t dma_max_mapping_size(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ size_t size = SIZE_MAX;
+
+ if (dma_is_direct(ops))
+ size = dma_direct_max_mapping_size(dev);
+ else if (ops && ops->max_mapping_size)
+ size = ops->max_mapping_size(dev);
+
+ return size;
+}
+EXPORT_SYMBOL_GPL(dma_max_mapping_size);
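
dma_max_mapping_size() lets a driver discover the largest contiguous mapping the DMA layer can service, which with SWIOTLB bouncing is a bounded segment rather than SIZE_MAX, so the driver can cap its per-request size instead of failing at map time. A hedged sketch of how a block driver might consume it; the queue handling is a placeholder:

/* Illustrative sketch: clamp a request queue to the DMA layer's limit. */
static void example_cap_queue(struct request_queue *q, struct device *dev)
{
	size_t max_bytes = dma_max_mapping_size(dev);

	/* 512-byte sectors; avoid overflowing the unsigned int argument. */
	blk_queue_max_hw_sectors(q, min_t(size_t, max_bytes >> 9, UINT_MAX));
}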
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 1fb6fd68b9c7..53012db1e53c 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -34,6 +34,9 @@
#include <linux/scatterlist.h>
#include <linux/mem_encrypt.h>
#include <linux/set_memory.h>
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+#endif
#include <asm/io.h>
#include <asm/dma.h>
@@ -73,6 +76,11 @@ phys_addr_t io_tlb_start, io_tlb_end;
static unsigned long io_tlb_nslabs;
/*
+ * The number of used IO TLB blocks
+ */
+static unsigned long io_tlb_used;
+
+/*
* This is a free list describing the number of free entries available from
* each index
*/
@@ -191,6 +199,7 @@ void __init swiotlb_update_mem_attributes(void)
int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
{
unsigned long i, bytes;
+ size_t alloc_size;
bytes = nslabs << IO_TLB_SHIFT;
@@ -203,12 +212,18 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
* to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
* between io_tlb_start and io_tlb_end.
*/
- io_tlb_list = memblock_alloc(
- PAGE_ALIGN(io_tlb_nslabs * sizeof(int)),
- PAGE_SIZE);
- io_tlb_orig_addr = memblock_alloc(
- PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)),
- PAGE_SIZE);
+ alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(int));
+ io_tlb_list = memblock_alloc(alloc_size, PAGE_SIZE);
+ if (!io_tlb_list)
+ panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+
+ alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t));
+ io_tlb_orig_addr = memblock_alloc(alloc_size, PAGE_SIZE);
+ if (!io_tlb_orig_addr)
+ panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+
for (i = 0; i < io_tlb_nslabs; i++) {
io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
@@ -241,7 +256,7 @@ swiotlb_init(int verbose)
bytes = io_tlb_nslabs << IO_TLB_SHIFT;
/* Get IO TLB memory from the low pages */
- vstart = memblock_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE);
+ vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE);
if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
return;
@@ -385,7 +400,7 @@ void __init swiotlb_exit(void)
}
/*
- * Bounce: copy the swiotlb buffer back to the original dma location
+ * Bounce: copy the swiotlb buffer from or back to the original dma location
*/
static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir)
@@ -475,6 +490,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
* request and allocate a buffer from that IO TLB pool.
*/
spin_lock_irqsave(&io_tlb_lock, flags);
+
+ if (unlikely(nslots > io_tlb_nslabs - io_tlb_used))
+ goto not_found;
+
index = ALIGN(io_tlb_index, stride);
if (index >= io_tlb_nslabs)
index = 0;
@@ -524,6 +543,7 @@ not_found:
dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size);
return DMA_MAPPING_ERROR;
found:
+ io_tlb_used += nslots;
spin_unlock_irqrestore(&io_tlb_lock, flags);
/*
@@ -584,6 +604,8 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
*/
for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
io_tlb_list[i] = ++count;
+
+ io_tlb_used -= nslots;
}
spin_unlock_irqrestore(&io_tlb_lock, flags);
}
@@ -651,14 +673,49 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
return true;
}
-/*
- * Return whether the given device DMA address mask can be supported
- * properly. For example, if your device can only drive the low 24-bits
- * during bus mastering, then you would pass 0x00ffffff as the mask to
- * this function.
- */
-int
-swiotlb_dma_supported(struct device *hwdev, u64 mask)
+size_t swiotlb_max_mapping_size(struct device *dev)
{
- return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
+ return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE;
}
+
+bool is_swiotlb_active(void)
+{
+ /*
+ * When SWIOTLB is initialized, even if io_tlb_start points to physical
+ * address zero, io_tlb_end surely doesn't.
+ */
+ return io_tlb_end != 0;
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+static int __init swiotlb_create_debugfs(void)
+{
+ struct dentry *d_swiotlb_usage;
+ struct dentry *ent;
+
+ d_swiotlb_usage = debugfs_create_dir("swiotlb", NULL);
+
+ if (!d_swiotlb_usage)
+ return -ENOMEM;
+
+ ent = debugfs_create_ulong("io_tlb_nslabs", 0400,
+ d_swiotlb_usage, &io_tlb_nslabs);
+ if (!ent)
+ goto fail;
+
+ ent = debugfs_create_ulong("io_tlb_used", 0400,
+ d_swiotlb_usage, &io_tlb_used);
+ if (!ent)
+ goto fail;
+
+ return 0;
+
+fail:
+ debugfs_remove_recursive(d_swiotlb_usage);
+ return -ENOMEM;
+}
+
+late_initcall(swiotlb_create_debugfs);
+
+#endif
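
The new counters are plain debugfs ulongs, so bounce-buffer pressure can be sampled from user space once debugfs is mounted. A minimal sketch, assuming debugfs at /sys/kernel/debug and root privileges:

#include <stdio.h>

static unsigned long read_ulong(const char *path)
{
	unsigned long val = 0;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%lu", &val) != 1)
			val = 0;
		fclose(f);
	}
	return val;
}

int main(void)
{
	printf("swiotlb: %lu of %lu slabs in use\n",
	       read_ulong("/sys/kernel/debug/swiotlb/io_tlb_used"),
	       read_ulong("/sys/kernel/debug/swiotlb/io_tlb_nslabs"));
	return 0;
}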
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 24a77c34e9ad..c2b41a263166 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Performance events callchain code, extracted from core.c:
*
@@ -5,8 +6,6 @@
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
*/
#include <linux/perf_event.h>
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 26d6edab051a..534e01e7bc36 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Performance events core code:
*
@@ -5,8 +6,6 @@
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
*/
#include <linux/fs.h>
@@ -385,6 +384,8 @@ static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
+static atomic_t nr_ksymbol_events __read_mostly;
+static atomic_t nr_bpf_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -1171,7 +1172,7 @@ static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
static void get_ctx(struct perf_event_context *ctx)
{
- WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
+ refcount_inc(&ctx->refcount);
}
static void free_ctx(struct rcu_head *head)
@@ -1185,7 +1186,7 @@ static void free_ctx(struct rcu_head *head)
static void put_ctx(struct perf_event_context *ctx)
{
- if (atomic_dec_and_test(&ctx->refcount)) {
+ if (refcount_dec_and_test(&ctx->refcount)) {
if (ctx->parent_ctx)
put_ctx(ctx->parent_ctx);
if (ctx->task && ctx->task != TASK_TOMBSTONE)
@@ -1254,6 +1255,7 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
+ * perf_addr_filters_head::lock
*
* cpu_hotplug_lock
* pmus_lock
@@ -1267,7 +1269,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
again:
rcu_read_lock();
ctx = READ_ONCE(event->ctx);
- if (!atomic_inc_not_zero(&ctx->refcount)) {
+ if (!refcount_inc_not_zero(&ctx->refcount)) {
rcu_read_unlock();
goto again;
}
@@ -1400,7 +1402,7 @@ retry:
}
if (ctx->task == TASK_TOMBSTONE ||
- !atomic_inc_not_zero(&ctx->refcount)) {
+ !refcount_inc_not_zero(&ctx->refcount)) {
raw_spin_unlock(&ctx->lock);
ctx = NULL;
} else {
@@ -2007,8 +2009,8 @@ event_sched_out(struct perf_event *event,
event->pmu->del(event, 0);
event->oncpu = -1;
- if (event->pending_disable) {
- event->pending_disable = 0;
+ if (READ_ONCE(event->pending_disable) >= 0) {
+ WRITE_ONCE(event->pending_disable, -1);
state = PERF_EVENT_STATE_OFF;
}
perf_event_set_state(event, state);
@@ -2196,7 +2198,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
void perf_event_disable_inatomic(struct perf_event *event)
{
- event->pending_disable = 1;
+ WRITE_ONCE(event->pending_disable, smp_processor_id());
+ /* can fail, see perf_pending_event_disable() */
irq_work_queue(&event->pending);
}
@@ -2797,7 +2800,7 @@ static int perf_event_stop(struct perf_event *event, int restart)
*
* (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
* we update the addresses of corresponding vmas in
- * event::addr_filters_offs array and bump the event::addr_filters_gen;
+ * event::addr_filter_ranges array and bump the event::addr_filters_gen;
* (p2) when an event is scheduled in (pmu::add), it calls
* perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
* if the generation has changed since the previous call.
@@ -4056,7 +4059,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
INIT_LIST_HEAD(&ctx->event_list);
INIT_LIST_HEAD(&ctx->pinned_active);
INIT_LIST_HEAD(&ctx->flexible_active);
- atomic_set(&ctx->refcount, 1);
+ refcount_set(&ctx->refcount, 1);
}
static struct perf_event_context *
@@ -4235,8 +4238,9 @@ static bool is_sb_event(struct perf_event *event)
if (attr->mmap || attr->mmap_data || attr->mmap2 ||
attr->comm || attr->comm_exec ||
- attr->task ||
- attr->context_switch)
+ attr->task || attr->ksymbol ||
+ attr->context_switch ||
+ attr->bpf_event)
return true;
return false;
}
@@ -4305,6 +4309,10 @@ static void unaccount_event(struct perf_event *event)
dec = true;
if (has_branch_stack(event))
dec = true;
+ if (event->attr.ksymbol)
+ atomic_dec(&nr_ksymbol_events);
+ if (event->attr.bpf_event)
+ atomic_dec(&nr_bpf_events);
if (dec) {
if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -4440,7 +4448,7 @@ static void _free_event(struct perf_event *event)
perf_event_free_bpf_prog(event);
perf_addr_filters_splice(event, NULL);
- kfree(event->addr_filters_offs);
+ kfree(event->addr_filter_ranges);
if (event->destroy)
event->destroy(event);
@@ -5396,7 +5404,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event)
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
- if (!atomic_inc_not_zero(&rb->refcount))
+ if (!refcount_inc_not_zero(&rb->refcount))
rb = NULL;
}
rcu_read_unlock();
@@ -5406,7 +5414,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event)
void ring_buffer_put(struct ring_buffer *rb)
{
- if (!atomic_dec_and_test(&rb->refcount))
+ if (!refcount_dec_and_test(&rb->refcount))
return;
WARN_ON_ONCE(!list_empty(&rb->event_list));
@@ -5467,11 +5475,11 @@ static void perf_mmap_close(struct vm_area_struct *vma)
/* now it's safe to free the pages */
atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
- vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+ atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
/* this has to be the last one */
rb_free_aux(rb);
- WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
+ WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
mutex_unlock(&event->mmap_mutex);
}
@@ -5540,7 +5548,7 @@ again:
*/
atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
- vma->vm_mm->pinned_vm -= mmap_locked;
+ atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
free_uid(mmap_user);
out_put:
@@ -5688,7 +5696,7 @@ accounting:
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
- locked = vma->vm_mm->pinned_vm + extra;
+ locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
!capable(CAP_IPC_LOCK)) {
@@ -5729,7 +5737,7 @@ accounting:
unlock:
if (!ret) {
atomic_long_add(user_extra, &user->locked_vm);
- vma->vm_mm->pinned_vm += extra;
+ atomic64_add(extra, &vma->vm_mm->pinned_vm);
atomic_inc(&event->mmap_count);
} else if (rb) {
@@ -5803,10 +5811,45 @@ void perf_event_wakeup(struct perf_event *event)
}
}
+static void perf_pending_event_disable(struct perf_event *event)
+{
+ int cpu = READ_ONCE(event->pending_disable);
+
+ if (cpu < 0)
+ return;
+
+ if (cpu == smp_processor_id()) {
+ WRITE_ONCE(event->pending_disable, -1);
+ perf_event_disable_local(event);
+ return;
+ }
+
+ /*
+ * CPU-A CPU-B
+ *
+ * perf_event_disable_inatomic()
+ * @pending_disable = CPU-A;
+ * irq_work_queue();
+ *
+ * sched-out
+ * @pending_disable = -1;
+ *
+ * sched-in
+ * perf_event_disable_inatomic()
+ * @pending_disable = CPU-B;
+ * irq_work_queue(); // FAILS
+ *
+ * irq_work_run()
+ * perf_pending_event()
+ *
+ * But the event runs on CPU-B and wants disabling there.
+ */
+ irq_work_queue_on(&event->pending, cpu);
+}
+
static void perf_pending_event(struct irq_work *entry)
{
- struct perf_event *event = container_of(entry,
- struct perf_event, pending);
+ struct perf_event *event = container_of(entry, struct perf_event, pending);
int rctx;
rctx = perf_swevent_get_recursion_context();
@@ -5815,10 +5858,7 @@ static void perf_pending_event(struct irq_work *entry)
* and we won't recurse 'further'.
*/
- if (event->pending_disable) {
- event->pending_disable = 0;
- perf_event_disable_local(event);
- }
+ perf_pending_event_disable(event);
if (event->pending_wakeup) {
event->pending_wakeup = 0;
@@ -6497,7 +6537,7 @@ void perf_prepare_sample(struct perf_event_header *header,
data->phys_addr = perf_virt_to_phys(data->addr);
}
-static __always_inline void
+static __always_inline int
__perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs,
@@ -6507,13 +6547,15 @@ __perf_event_output(struct perf_event *event,
{
struct perf_output_handle handle;
struct perf_event_header header;
+ int err;
/* protect the callchain buffers */
rcu_read_lock();
perf_prepare_sample(&header, data, event, regs);
- if (output_begin(&handle, event, header.size))
+ err = output_begin(&handle, event, header.size);
+ if (err)
goto exit;
perf_output_sample(&handle, &header, data, event);
@@ -6522,6 +6564,7 @@ __perf_event_output(struct perf_event *event,
exit:
rcu_read_unlock();
+ return err;
}
void
@@ -6540,12 +6583,12 @@ perf_event_output_backward(struct perf_event *event,
__perf_event_output(event, data, regs, perf_output_begin_backward);
}
-void
+int
perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
- __perf_event_output(event, data, regs, perf_output_begin);
+ return __perf_event_output(event, data, regs, perf_output_begin);
}
/*
@@ -6686,7 +6729,8 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
if (filter->path.dentry) {
- event->addr_filters_offs[count] = 0;
+ event->addr_filter_ranges[count].start = 0;
+ event->addr_filter_ranges[count].size = 0;
restart++;
}
@@ -7178,6 +7222,7 @@ static void perf_event_mmap_output(struct perf_event *event,
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
+ u32 type = mmap_event->event_id.header.type;
int ret;
if (!perf_event_mmap_match(event, data))
@@ -7221,6 +7266,7 @@ static void perf_event_mmap_output(struct perf_event *event,
perf_output_end(&handle);
out:
mmap_event->event_id.header.size = size;
+ mmap_event->event_id.header.type = type;
}
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -7366,28 +7412,47 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
return true;
}
+static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
+ struct vm_area_struct *vma,
+ struct perf_addr_filter_range *fr)
+{
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
+ unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+ struct file *file = vma->vm_file;
+
+ if (!perf_addr_filter_match(filter, file, off, vma_size))
+ return false;
+
+ if (filter->offset < off) {
+ fr->start = vma->vm_start;
+ fr->size = min(vma_size, filter->size - (off - filter->offset));
+ } else {
+ fr->start = vma->vm_start + filter->offset - off;
+ fr->size = min(vma->vm_end - fr->start, filter->size);
+ }
+
+ return true;
+}
+
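perf_addr_filter_vma_adjust() clamps a file-offset based filter to the part actually covered by a vma and reports the result as a start/size pair. The arithmetic is easier to see with concrete numbers; a small stand-alone sketch of the same two branches, with made-up values:

#include <stdio.h>

/* Mirror of the clamping above, using plain unsigned longs. */
static void adjust(unsigned long vm_start, unsigned long vm_end,
		   unsigned long vm_off, unsigned long f_off,
		   unsigned long f_size)
{
	unsigned long vma_size = vm_end - vm_start, start, size;

	if (f_off < vm_off) {
		start = vm_start;
		size = vma_size < f_size - (vm_off - f_off) ?
		       vma_size : f_size - (vm_off - f_off);
	} else {
		start = vm_start + f_off - vm_off;
		size = vm_end - start < f_size ? vm_end - start : f_size;
	}
	printf("range: 0x%lx + 0x%lx\n", start, size);
}

int main(void)
{
	/* vma 0x400000-0x404000 maps file offset 0; filter is 0x1000+0x2000. */
	adjust(0x400000, 0x404000, 0x0, 0x1000, 0x2000);
	return 0;
}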
static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
{
struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
struct vm_area_struct *vma = data;
- unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
- struct file *file = vma->vm_file;
struct perf_addr_filter *filter;
unsigned int restart = 0, count = 0;
+ unsigned long flags;
if (!has_addr_filter(event))
return;
- if (!file)
+ if (!vma->vm_file)
return;
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- if (perf_addr_filter_match(filter, file, off,
- vma->vm_end - vma->vm_start)) {
- event->addr_filters_offs[count] = vma->vm_start;
+ if (perf_addr_filter_vma_adjust(filter, vma,
+ &event->addr_filter_ranges[count]))
restart++;
- }
count++;
}
@@ -7658,6 +7723,207 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_output_end(&handle);
}
+/*
+ * ksymbol register/unregister tracking
+ */
+
+struct perf_ksymbol_event {
+ const char *name;
+ int name_len;
+ struct {
+ struct perf_event_header header;
+ u64 addr;
+ u32 len;
+ u16 ksym_type;
+ u16 flags;
+ } event_id;
+};
+
+static int perf_event_ksymbol_match(struct perf_event *event)
+{
+ return event->attr.ksymbol;
+}
+
+static void perf_event_ksymbol_output(struct perf_event *event, void *data)
+{
+ struct perf_ksymbol_event *ksymbol_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_ksymbol_match(event))
+ return;
+
+ perf_event_header__init_id(&ksymbol_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, event,
+ ksymbol_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, ksymbol_event->event_id);
+ __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
+ const char *sym)
+{
+ struct perf_ksymbol_event ksymbol_event;
+ char name[KSYM_NAME_LEN];
+ u16 flags = 0;
+ int name_len;
+
+ if (!atomic_read(&nr_ksymbol_events))
+ return;
+
+ if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
+ ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
+ goto err;
+
+ strlcpy(name, sym, KSYM_NAME_LEN);
+ name_len = strlen(name) + 1;
+ while (!IS_ALIGNED(name_len, sizeof(u64)))
+ name[name_len++] = '\0';
+ BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
+
+ if (unregister)
+ flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
+
+ ksymbol_event = (struct perf_ksymbol_event){
+ .name = name,
+ .name_len = name_len,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_KSYMBOL,
+ .size = sizeof(ksymbol_event.event_id) +
+ name_len,
+ },
+ .addr = addr,
+ .len = len,
+ .ksym_type = ksym_type,
+ .flags = flags,
+ },
+ };
+
+ perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
+ return;
+err:
+ WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
+}
+
+/*
+ * bpf program load/unload tracking
+ */
+
+struct perf_bpf_event {
+ struct bpf_prog *prog;
+ struct {
+ struct perf_event_header header;
+ u16 type;
+ u16 flags;
+ u32 id;
+ u8 tag[BPF_TAG_SIZE];
+ } event_id;
+};
+
+static int perf_event_bpf_match(struct perf_event *event)
+{
+ return event->attr.bpf_event;
+}
+
+static void perf_event_bpf_output(struct perf_event *event, void *data)
+{
+ struct perf_bpf_event *bpf_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_bpf_match(event))
+ return;
+
+ perf_event_header__init_id(&bpf_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, event,
+ bpf_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, bpf_event->event_id);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
+ enum perf_bpf_event_type type)
+{
+ bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
+ char sym[KSYM_NAME_LEN];
+ int i;
+
+ if (prog->aux->func_cnt == 0) {
+ bpf_get_prog_name(prog, sym);
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
+ (u64)(unsigned long)prog->bpf_func,
+ prog->jited_len, unregister, sym);
+ } else {
+ for (i = 0; i < prog->aux->func_cnt; i++) {
+ struct bpf_prog *subprog = prog->aux->func[i];
+
+ bpf_get_prog_name(subprog, sym);
+ perf_event_ksymbol(
+ PERF_RECORD_KSYMBOL_TYPE_BPF,
+ (u64)(unsigned long)subprog->bpf_func,
+ subprog->jited_len, unregister, sym);
+ }
+ }
+}
+
+void perf_event_bpf_event(struct bpf_prog *prog,
+ enum perf_bpf_event_type type,
+ u16 flags)
+{
+ struct perf_bpf_event bpf_event;
+
+ if (type <= PERF_BPF_EVENT_UNKNOWN ||
+ type >= PERF_BPF_EVENT_MAX)
+ return;
+
+ switch (type) {
+ case PERF_BPF_EVENT_PROG_LOAD:
+ case PERF_BPF_EVENT_PROG_UNLOAD:
+ if (atomic_read(&nr_ksymbol_events))
+ perf_event_bpf_emit_ksymbols(prog, type);
+ break;
+ default:
+ break;
+ }
+
+ if (!atomic_read(&nr_bpf_events))
+ return;
+
+ bpf_event = (struct perf_bpf_event){
+ .prog = prog,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_BPF_EVENT,
+ .size = sizeof(bpf_event.event_id),
+ },
+ .type = type,
+ .flags = flags,
+ .id = prog->aux->id,
+ },
+ };
+
+ BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
+
+ memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
+ perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
+}
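The expected call sites live elsewhere in the series (presumably the BPF syscall paths); reduced to a sketch, the load and unload notifications amount to:

	/* On successful program load: */
	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);

	/* On the final program unload: */
	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);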
+
void perf_event_itrace_started(struct perf_event *event)
{
event->attach_state |= PERF_ATTACH_ITRACE;
@@ -8776,26 +9042,19 @@ static void perf_addr_filters_splice(struct perf_event *event,
* @filter; if so, adjust filter's address range.
* Called with mm::mmap_sem down for reading.
*/
-static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
- struct mm_struct *mm)
+static void perf_addr_filter_apply(struct perf_addr_filter *filter,
+ struct mm_struct *mm,
+ struct perf_addr_filter_range *fr)
{
struct vm_area_struct *vma;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- struct file *file = vma->vm_file;
- unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
- unsigned long vma_size = vma->vm_end - vma->vm_start;
-
- if (!file)
- continue;
-
- if (!perf_addr_filter_match(filter, file, off, vma_size))
+ if (!vma->vm_file)
continue;
- return vma->vm_start;
+ if (perf_addr_filter_vma_adjust(filter, vma, fr))
+ return;
}
-
- return 0;
}
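The per-vma work now happens in perf_addr_filter_vma_adjust(), which is expected to clip the filter's file-offset window to the vma and report the resulting address range. A rough sketch of that logic, assuming the filter carries an offset and size; this is illustrative only, not the exact helper from the series:

	static bool example_vma_adjust(struct perf_addr_filter *filter,
				       struct vm_area_struct *vma,
				       struct perf_addr_filter_range *fr)
	{
		unsigned long vma_size = vma->vm_end - vma->vm_start;
		unsigned long off = vma->vm_pgoff << PAGE_SHIFT;

		if (!perf_addr_filter_match(filter, vma->vm_file, off, vma_size))
			return false;

		if (filter->offset < off) {
			fr->start = vma->vm_start;
			fr->size  = min(vma_size, filter->size - (off - filter->offset));
		} else {
			fr->start = vma->vm_start + filter->offset - off;
			fr->size  = min(vma->vm_end - fr->start, filter->size);
		}
		return true;
	}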
/*
@@ -8829,15 +9088,15 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- event->addr_filters_offs[count] = 0;
+ event->addr_filter_ranges[count].start = 0;
+ event->addr_filter_ranges[count].size = 0;
/*
* Adjust base offset if the filter is associated to a binary
* that needs to be mapped:
*/
if (filter->path.dentry)
- event->addr_filters_offs[count] =
- perf_addr_filter_apply(filter, mm);
+ perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
count++;
}
@@ -8951,6 +9210,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
case IF_SRC_KERNELADDR:
case IF_SRC_KERNEL:
kernel = 1;
+ /* fall through */
case IF_SRC_FILEADDR:
case IF_SRC_FILE:
@@ -9788,6 +10048,15 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (ctx)
perf_event_ctx_unlock(event->group_leader, ctx);
+ if (!ret) {
+ if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
+ event_has_any_exclude_flag(event)) {
+ if (event->destroy)
+ event->destroy(event);
+ ret = -EINVAL;
+ }
+ }
+
if (ret)
module_put(pmu->module);
@@ -9916,6 +10185,10 @@ static void account_event(struct perf_event *event)
inc = true;
if (is_cgroup_event(event))
inc = true;
+ if (event->attr.ksymbol)
+ atomic_inc(&nr_ksymbol_events);
+ if (event->attr.bpf_event)
+ atomic_inc(&nr_bpf_events);
if (inc) {
/*
@@ -9996,6 +10269,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
init_waitqueue_head(&event->waitq);
+ event->pending_disable = -1;
init_irq_work(&event->pending, perf_pending_event);
mutex_init(&event->mmap_mutex);
@@ -10098,14 +10372,28 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
goto err_pmu;
if (has_addr_filter(event)) {
- event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
- sizeof(unsigned long),
- GFP_KERNEL);
- if (!event->addr_filters_offs) {
+ event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
+ sizeof(struct perf_addr_filter_range),
+ GFP_KERNEL);
+ if (!event->addr_filter_ranges) {
err = -ENOMEM;
goto err_per_task;
}
+ /*
+ * Clone the parent's vma offsets: they are valid until exec()
+ * even if the mm is not shared with the parent.
+ */
+ if (event->parent) {
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+
+ raw_spin_lock_irq(&ifh->lock);
+ memcpy(event->addr_filter_ranges,
+ event->parent->addr_filter_ranges,
+ pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
+ raw_spin_unlock_irq(&ifh->lock);
+ }
+
/* force hw sync on the address filters */
event->addr_filters_gen = 1;
}
@@ -10124,7 +10412,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
return event;
err_addr_filters:
- kfree(event->addr_filters_offs);
+ kfree(event->addr_filter_ranges);
err_per_task:
exclusive_event_destroy(event);
@@ -10407,7 +10695,7 @@ __perf_event_ctx_lock_double(struct perf_event *group_leader,
again:
rcu_read_lock();
gctx = READ_ONCE(group_leader->ctx);
- if (!atomic_inc_not_zero(&gctx->refcount)) {
+ if (!refcount_inc_not_zero(&gctx->refcount)) {
rcu_read_unlock();
goto again;
}
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 5befb338a18d..c5cd852fe86b 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -1,18 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright (C) 2007 Alan Stern
* Copyright (C) IBM Corporation, 2009
* Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 6dc725a7e7bc..79c47076700a 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -4,13 +4,14 @@
#include <linux/hardirq.h>
#include <linux/uaccess.h>
+#include <linux/refcount.h>
/* Buffer handling */
#define RING_BUFFER_WRITABLE 0x01
struct ring_buffer {
- atomic_t refcount;
+ refcount_t refcount;
struct rcu_head rcu_head;
#ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work;
@@ -48,7 +49,7 @@ struct ring_buffer {
atomic_t aux_mmap_count;
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
- atomic_t aux_refcount;
+ refcount_t aux_refcount;
void **aux_pages;
void *aux_priv;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 5ab4fe3b1dcc..2545ac08cc77 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Performance events ring-buffer code:
*
@@ -5,8 +6,6 @@
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
*/
#include <linux/perf_event.h>
@@ -285,7 +284,7 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
else
rb->overwrite = 1;
- atomic_set(&rb->refcount, 1);
+ refcount_set(&rb->refcount, 1);
INIT_LIST_HEAD(&rb->event_list);
spin_lock_init(&rb->event_lock);
@@ -358,7 +357,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (!atomic_read(&rb->aux_mmap_count))
goto err;
- if (!atomic_inc_not_zero(&rb->aux_refcount))
+ if (!refcount_inc_not_zero(&rb->aux_refcount))
goto err;
/*
@@ -393,7 +392,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* store that will be enabled on successful return
*/
if (!handle->size) { /* A, matches D */
- event->pending_disable = 1;
+ event->pending_disable = smp_processor_id();
perf_output_wakeup(handle);
local_set(&rb->aux_nest, 0);
goto err_put;
@@ -481,7 +480,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
if (wakeup) {
if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
- handle->event->pending_disable = 1;
+ handle->event->pending_disable = smp_processor_id();
perf_output_wakeup(handle);
}
@@ -599,29 +598,27 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
{
bool overwrite = !(flags & RING_BUFFER_WRITABLE);
int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
- int ret = -ENOMEM, max_order = 0;
+ int ret = -ENOMEM, max_order;
if (!has_aux(event))
return -EOPNOTSUPP;
- if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
- /*
- * We need to start with the max_order that fits in nr_pages,
- * not the other way around, hence ilog2() and not get_order.
- */
- max_order = ilog2(nr_pages);
+ /*
+ * We need to start with the max_order that fits in nr_pages,
+ * not the other way around, hence ilog2() and not get_order.
+ */
+ max_order = ilog2(nr_pages);
- /*
- * PMU requests more than one contiguous chunks of memory
- * for SW double buffering
- */
- if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
- !overwrite) {
- if (!max_order)
- return -EINVAL;
+ /*
+ * PMU requests more than one contiguous chunk of memory
+ * for SW double buffering
+ */
+ if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
+ !overwrite) {
+ if (!max_order)
+ return -EINVAL;
- max_order--;
- }
+ max_order--;
}
rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
@@ -658,7 +655,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
goto out;
}
- rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
+ rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
overwrite);
if (!rb->aux_priv)
goto out;
@@ -671,7 +668,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
* we keep a refcount here to make sure either of the two can
* reference them safely.
*/
- atomic_set(&rb->aux_refcount, 1);
+ refcount_set(&rb->aux_refcount, 1);
rb->aux_overwrite = overwrite;
rb->aux_watermark = watermark;
@@ -690,7 +687,7 @@ out:
void rb_free_aux(struct ring_buffer *rb)
{
- if (atomic_dec_and_test(&rb->aux_refcount))
+ if (refcount_dec_and_test(&rb->aux_refcount))
__rb_free_aux(rb);
}
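This file, like several others in the patch, converts reference counters from atomic_t to refcount_t. The conversion is mechanical; a generic before/after sketch (the object and free path are hypothetical):

	struct obj {
		refcount_t ref;				/* was: atomic_t ref */
	};

	refcount_set(&obj->ref, 1);			/* was: atomic_set()          */
	refcount_inc(&obj->ref);			/* was: atomic_inc()          */
	if (refcount_inc_not_zero(&obj->ref))		/* was: atomic_inc_not_zero() */
		use(obj);
	if (refcount_dec_and_test(&obj->ref))		/* was: atomic_dec_and_test() */
		kfree(obj);

The practical difference is that refcount_t saturates and warns on over/underflow instead of silently wrapping.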
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 8aef47ee7bfa..c5cde87329c7 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* User-space Probes (UProbes)
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright (C) IBM Corporation, 2008-2012
* Authors:
* Srikar Dronamraju
@@ -66,7 +53,7 @@ static struct percpu_rw_semaphore dup_mmap_sem;
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
- atomic_t ref;
+ refcount_t ref;
struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
struct list_head pending_list;
@@ -560,13 +547,13 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
static struct uprobe *get_uprobe(struct uprobe *uprobe)
{
- atomic_inc(&uprobe->ref);
+ refcount_inc(&uprobe->ref);
return uprobe;
}
static void put_uprobe(struct uprobe *uprobe)
{
- if (atomic_dec_and_test(&uprobe->ref)) {
+ if (refcount_dec_and_test(&uprobe->ref)) {
/*
* If application munmap(exec_vma) before uprobe_unregister()
* gets called, we don't get a chance to remove uprobe from
@@ -657,7 +644,7 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
rb_link_node(&uprobe->rb_node, parent, p);
rb_insert_color(&uprobe->rb_node, &uprobes_tree);
/* get access + creation ref */
- atomic_set(&uprobe->ref, 2);
+ refcount_set(&uprobe->ref, 2);
return u;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 2639a30a8aa5..2166c2d92ddc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -219,6 +219,7 @@ repeat:
}
write_unlock_irq(&tasklist_lock);
+ cgroup_release(p);
release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct);
diff --git a/kernel/fork.c b/kernel/fork.c
index b69248e6f0e0..9dcd18aa210b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -77,7 +77,6 @@
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
-#include <linux/sched/mm.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
@@ -429,7 +428,7 @@ static void release_task_stack(struct task_struct *tsk)
#ifdef CONFIG_THREAD_INFO_IN_TASK
void put_task_stack(struct task_struct *tsk)
{
- if (atomic_dec_and_test(&tsk->stack_refcount))
+ if (refcount_dec_and_test(&tsk->stack_refcount))
release_task_stack(tsk);
}
#endif
@@ -447,7 +446,7 @@ void free_task(struct task_struct *tsk)
* If the task had a separate stack allocation, it should be gone
* by now.
*/
- WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
+ WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
#endif
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
@@ -710,14 +709,14 @@ static inline void free_signal_struct(struct signal_struct *sig)
static inline void put_signal_struct(struct signal_struct *sig)
{
- if (atomic_dec_and_test(&sig->sigcnt))
+ if (refcount_dec_and_test(&sig->sigcnt))
free_signal_struct(sig);
}
void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
- WARN_ON(atomic_read(&tsk->usage));
+ WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
cgroup_free(tsk);
@@ -867,7 +866,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
- atomic_set(&tsk->stack_refcount, 1);
+ refcount_set(&tsk->stack_refcount, 1);
#endif
if (err)
@@ -896,7 +895,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
* One for us, one for whoever does the "release_task()" (usually
* parent)
*/
- atomic_set(&tsk->usage, 2);
+ refcount_set(&tsk->usage, 2);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
@@ -981,7 +980,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
- mm->pinned_vm = 0;
+ atomic64_set(&mm->pinned_vm, 0);
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
spin_lock_init(&mm->arg_lock);
@@ -1463,7 +1462,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
struct sighand_struct *sig;
if (clone_flags & CLONE_SIGHAND) {
- atomic_inc(&current->sighand->count);
+ refcount_inc(&current->sighand->count);
return 0;
}
sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
@@ -1471,7 +1470,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
if (!sig)
return -ENOMEM;
- atomic_set(&sig->count, 1);
+ refcount_set(&sig->count, 1);
spin_lock_irq(&current->sighand->siglock);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
spin_unlock_irq(&current->sighand->siglock);
@@ -1480,7 +1479,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
void __cleanup_sighand(struct sighand_struct *sighand)
{
- if (atomic_dec_and_test(&sighand->count)) {
+ if (refcount_dec_and_test(&sighand->count)) {
signalfd_cleanup(sighand);
/*
* sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
@@ -1527,7 +1526,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->nr_threads = 1;
atomic_set(&sig->live, 1);
- atomic_set(&sig->sigcnt, 1);
+ refcount_set(&sig->sigcnt, 1);
/* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
@@ -2082,7 +2081,7 @@ static __latent_entropy struct task_struct *copy_process(
} else {
current->signal->nr_threads++;
atomic_inc(&current->signal->live);
- atomic_inc(&current->signal->sigcnt);
+ refcount_inc(&current->signal->sigcnt);
task_join_group_stop(p);
list_add_tail_rcu(&p->thread_group,
&p->group_leader->thread_group);
@@ -2439,7 +2438,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
return -EINVAL;
}
if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
- if (atomic_read(&current->sighand->count) > 1)
+ if (refcount_read(&current->sighand->count) > 1)
return -EINVAL;
}
if (unshare_flags & CLONE_VM) {
diff --git a/kernel/futex.c b/kernel/futex.c
index a0514e01c3eb..9e40cf7be606 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -68,6 +68,7 @@
#include <linux/freezer.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
+#include <linux/refcount.h>
#include <asm/futex.h>
@@ -212,7 +213,7 @@ struct futex_pi_state {
struct rt_mutex pi_mutex;
struct task_struct *owner;
- atomic_t refcount;
+ refcount_t refcount;
union futex_key key;
} __randomize_layout;
@@ -321,12 +322,8 @@ static int __init fail_futex_debugfs(void)
if (IS_ERR(dir))
return PTR_ERR(dir);
- if (!debugfs_create_bool("ignore-private", mode, dir,
- &fail_futex.ignore_private)) {
- debugfs_remove_recursive(dir);
- return -ENOMEM;
- }
-
+ debugfs_create_bool("ignore-private", mode, dir,
+ &fail_futex.ignore_private);
return 0;
}
@@ -803,7 +800,7 @@ static int refill_pi_state_cache(void)
INIT_LIST_HEAD(&pi_state->list);
/* pi_mutex gets initialized later */
pi_state->owner = NULL;
- atomic_set(&pi_state->refcount, 1);
+ refcount_set(&pi_state->refcount, 1);
pi_state->key = FUTEX_KEY_INIT;
current->pi_state_cache = pi_state;
@@ -823,7 +820,7 @@ static struct futex_pi_state *alloc_pi_state(void)
static void get_pi_state(struct futex_pi_state *pi_state)
{
- WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
+ WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}
/*
@@ -835,7 +832,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
if (!pi_state)
return;
- if (!atomic_dec_and_test(&pi_state->refcount))
+ if (!refcount_dec_and_test(&pi_state->refcount))
return;
/*
@@ -865,7 +862,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* refcount is at 0 - put it back to 1.
*/
pi_state->owner = NULL;
- atomic_set(&pi_state->refcount, 1);
+ refcount_set(&pi_state->refcount, 1);
current->pi_state_cache = pi_state;
}
}
@@ -908,7 +905,7 @@ void exit_pi_state_list(struct task_struct *curr)
* In that case; drop the locks to let put_pi_state() make
* progress and retry the loop.
*/
- if (!atomic_inc_not_zero(&pi_state->refcount)) {
+ if (!refcount_inc_not_zero(&pi_state->refcount)) {
raw_spin_unlock_irq(&curr->pi_lock);
cpu_relax();
raw_spin_lock_irq(&curr->pi_lock);
@@ -1064,7 +1061,7 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
* and futex_wait_requeue_pi() as it cannot go to 0 and consequently
* free pi_state before we can take a reference ourselves.
*/
- WARN_ON(!atomic_read(&pi_state->refcount));
+ WARN_ON(!refcount_read(&pi_state->refcount));
/*
* Now that we have a pi_state, we can acquire wait_lock
@@ -1467,8 +1464,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
* Queue the task for later wakeup for after we've released
* the hb->lock. wake_q_add() grabs reference to p.
*/
- wake_q_add(wake_q, p);
- put_task_struct(p);
+ wake_q_add_safe(wake_q, p);
}
/*
@@ -3440,6 +3436,10 @@ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int p
{
u32 uval, uninitialized_var(nval), mval;
+ /* Futex address must be 32bit aligned */
+ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
+ return -1;
+
retry:
if (get_user(uval, uaddr))
return -1;
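The new check rejects robust-list entries whose futex word is not naturally aligned before any user access is attempted; in isolation the test is simply:

	/* Illustrative helper: a u32 futex word must be 4-byte aligned. */
	static bool futex_uaddr_aligned(const u32 __user *uaddr)
	{
		return ((unsigned long)uaddr % sizeof(*uaddr)) == 0;
	}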
@@ -3823,7 +3823,7 @@ err_unlock:
#endif /* CONFIG_COMPAT */
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
u32, val3)
{
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index 1e32e66c9563..2dddecbdbe6e 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -245,8 +245,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info)
/* Duplicate gcov_info. */
active = num_counter_active(info);
- dup = kzalloc(sizeof(struct gcov_info) +
- sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
+ dup = kzalloc(struct_size(dup, counts, active), GFP_KERNEL);
if (!dup)
return NULL;
dup->version = info->version;
@@ -364,8 +363,7 @@ struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
{
struct gcov_iterator *iter;
- iter = kzalloc(sizeof(struct gcov_iterator) +
- num_counter_active(info) * sizeof(struct type_info),
+ iter = kzalloc(struct_size(iter, type_info, num_counter_active(info)),
GFP_KERNEL);
if (iter)
iter->info = info;
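struct_size() assumes the structure ends in a trailing array; for a hypothetical structure it computes the same size as the open-coded arithmetic it replaces, with overflow checking added:

	struct example {
		unsigned int n;
		struct gcov_ctr_info counts[];		/* trailing array */
	};
	struct example *p;

	/* Equivalent to sizeof(*p) + active * sizeof(p->counts[0]),
	 * but saturating to SIZE_MAX if the arithmetic would overflow. */
	p = kzalloc(struct_size(p, counts, active), GFP_KERNEL);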
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 4a9191617076..f108a95882c6 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -19,6 +19,7 @@
#include <linux/utsname.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
+#include <linux/sched/sysctl.h>
#include <trace/events/sched.h>
@@ -126,7 +127,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
if (sysctl_hung_task_warnings > 0)
sysctl_hung_task_warnings--;
pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
- t->comm, t->pid, timeout);
+ t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
pr_err(" %s %s %.*s\n",
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 45b68b4ea48b..f18cd5aa33e8 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -9,7 +9,7 @@
#include <linux/cpu.h>
static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
- int cpus_per_vec)
+ unsigned int cpus_per_vec)
{
const struct cpumask *siblmsk;
int cpu, sibl;
@@ -95,15 +95,17 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
}
static int __irq_build_affinity_masks(const struct irq_affinity *affd,
- int startvec, int numvecs, int firstvec,
+ unsigned int startvec,
+ unsigned int numvecs,
+ unsigned int firstvec,
cpumask_var_t *node_to_cpumask,
const struct cpumask *cpu_mask,
struct cpumask *nmsk,
struct irq_affinity_desc *masks)
{
- int n, nodes, cpus_per_vec, extra_vecs, done = 0;
- int last_affv = firstvec + numvecs;
- int curvec = startvec;
+ unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+ unsigned int last_affv = firstvec + numvecs;
+ unsigned int curvec = startvec;
nodemask_t nodemsk = NODE_MASK_NONE;
if (!cpumask_weight(cpu_mask))
@@ -117,18 +119,16 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
*/
if (numvecs <= nodes) {
for_each_node_mask(n, nodemsk) {
- cpumask_or(&masks[curvec].mask,
- &masks[curvec].mask,
- node_to_cpumask[n]);
+ cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
+ node_to_cpumask[n]);
if (++curvec == last_affv)
curvec = firstvec;
}
- done = numvecs;
- goto out;
+ return numvecs;
}
for_each_node_mask(n, nodemsk) {
- int ncpus, v, vecs_to_assign, vecs_per_node;
+ unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
/* Spread the vectors per node */
vecs_per_node = (numvecs - (curvec - firstvec)) / nodes;
@@ -163,8 +163,6 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
curvec = firstvec;
--nodes;
}
-
-out:
return done;
}
@@ -174,19 +172,24 @@ out:
* 2) spread other possible CPUs on these vectors
*/
static int irq_build_affinity_masks(const struct irq_affinity *affd,
- int startvec, int numvecs, int firstvec,
- cpumask_var_t *node_to_cpumask,
+ unsigned int startvec, unsigned int numvecs,
+ unsigned int firstvec,
struct irq_affinity_desc *masks)
{
- int curvec = startvec, nr_present, nr_others;
- int ret = -ENOMEM;
+ unsigned int curvec = startvec, nr_present, nr_others;
+ cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
+ int ret = -ENOMEM;
if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
return ret;
if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
- goto fail;
+ goto fail_nmsk;
+
+ node_to_cpumask = alloc_node_to_cpumask();
+ if (!node_to_cpumask)
+ goto fail_npresmsk;
ret = 0;
/* Stabilize the cpumasks */
@@ -217,13 +220,22 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
if (nr_present < numvecs)
WARN_ON(nr_present + nr_others < numvecs);
+ free_node_to_cpumask(node_to_cpumask);
+
+ fail_npresmsk:
free_cpumask_var(npresmsk);
- fail:
+ fail_nmsk:
free_cpumask_var(nmsk);
return ret;
}
+static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
+{
+ affd->nr_sets = 1;
+ affd->set_size[0] = affvecs;
+}
+
/**
* irq_create_affinity_masks - Create affinity masks for multiqueue spreading
* @nvecs: The total number of vectors
@@ -232,50 +244,62 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
* Returns the irq_affinity_desc pointer or NULL if allocation failed.
*/
struct irq_affinity_desc *
-irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
{
- int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
- int curvec, usedvecs;
- cpumask_var_t *node_to_cpumask;
+ unsigned int affvecs, curvec, usedvecs, i;
struct irq_affinity_desc *masks = NULL;
- int i, nr_sets;
/*
- * If there aren't any vectors left after applying the pre/post
- * vectors don't bother with assigning affinity.
+ * Determine the number of vectors which need interrupt affinities
+ * assigned. If the pre/post request exhausts the available vectors
+ * then nothing to do here except for invoking the calc_sets()
+ * callback so the device driver can adjust to the situation. If there
+ * is only a single vector, then managing the queue is pointless as
+ * well.
*/
- if (nvecs == affd->pre_vectors + affd->post_vectors)
+ if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors)
+ affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
+ else
+ affvecs = 0;
+
+ /*
+ * Simple invocations do not provide a calc_sets() callback. Install
+ * the generic one.
+ */
+ if (!affd->calc_sets)
+ affd->calc_sets = default_calc_sets;
+
+ /* Recalculate the sets */
+ affd->calc_sets(affd, affvecs);
+
+ if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
return NULL;
- node_to_cpumask = alloc_node_to_cpumask();
- if (!node_to_cpumask)
+ /* Nothing to assign? */
+ if (!affvecs)
return NULL;
masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
if (!masks)
- goto outnodemsk;
+ return NULL;
/* Fill out vectors at the beginning that don't need affinity */
for (curvec = 0; curvec < affd->pre_vectors; curvec++)
cpumask_copy(&masks[curvec].mask, irq_default_affinity);
+
/*
* Spread on present CPUs starting from affd->pre_vectors. If we
* have multiple sets, build each sets affinity mask separately.
*/
- nr_sets = affd->nr_sets;
- if (!nr_sets)
- nr_sets = 1;
-
- for (i = 0, usedvecs = 0; i < nr_sets; i++) {
- int this_vecs = affd->sets ? affd->sets[i] : affvecs;
+ for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
+ unsigned int this_vecs = affd->set_size[i];
int ret;
ret = irq_build_affinity_masks(affd, curvec, this_vecs,
- curvec, node_to_cpumask, masks);
+ curvec, masks);
if (ret) {
kfree(masks);
- masks = NULL;
- goto outnodemsk;
+ return NULL;
}
curvec += this_vecs;
usedvecs += this_vecs;
@@ -293,8 +317,6 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
masks[i].is_managed = 1;
-outnodemsk:
- free_node_to_cpumask(node_to_cpumask);
return masks;
}
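With the set_size[]/calc_sets interface, a driver that wants its managed vectors split into multiple queue sets supplies a callback. A hypothetical two-set driver (all names illustrative) would do something like:

	static void mydrv_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
	{
		affd->nr_sets = 2;
		affd->set_size[0] = affvecs / 2;		/* e.g. read queues  */
		affd->set_size[1] = affvecs - affvecs / 2;	/* e.g. write queues */
	}

	struct irq_affinity affd = {
		.pre_vectors = 1,			/* non-managed admin vector */
		.calc_sets   = mydrv_calc_sets,
	};

	masks = irq_create_affinity_masks(nvecs, &affd);

Drivers without special needs leave calc_sets NULL and get default_calc_sets(), i.e. a single set covering all affinity-managed vectors.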
@@ -304,25 +326,22 @@ outnodemsk:
* @maxvec: The maximum number of vectors available
* @affd: Description of the affinity requirements
*/
-int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
+unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
+ const struct irq_affinity *affd)
{
- int resv = affd->pre_vectors + affd->post_vectors;
- int vecs = maxvec - resv;
- int set_vecs;
+ unsigned int resv = affd->pre_vectors + affd->post_vectors;
+ unsigned int set_vecs;
if (resv > minvec)
return 0;
- if (affd->nr_sets) {
- int i;
-
- for (i = 0, set_vecs = 0; i < affd->nr_sets; i++)
- set_vecs += affd->sets[i];
+ if (affd->calc_sets) {
+ set_vecs = maxvec - resv;
} else {
get_online_cpus();
set_vecs = cpumask_weight(cpu_possible_mask);
put_online_cpus();
}
- return resv + min(set_vecs, vecs);
+ return resv + min(set_vecs, maxvec - resv);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 34e969069488..51128bea3846 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -730,6 +730,37 @@ out:
EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
/**
+ * handle_fasteoi_nmi - irq handler for NMI interrupt lines
+ * @desc: the interrupt description structure for this irq
+ *
+ * A simple NMI-safe handler, considering the restrictions
+ * from request_nmi.
+ *
+ * Only a single callback will be issued to the chip: an ->eoi()
+ * call when the interrupt has been serviced. This enables support
+ * for modern forms of interrupt handlers, which handle the flow
+ * details in hardware, transparently.
+ */
+void handle_fasteoi_nmi(struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ struct irqaction *action = desc->action;
+ unsigned int irq = irq_desc_get_irq(desc);
+ irqreturn_t res;
+
+ trace_irq_handler_entry(irq, action);
+ /*
+ * NMIs cannot be shared, there is only one action.
+ */
+ res = action->handler(irq, action->dev_id);
+ trace_irq_handler_exit(irq, action, res);
+
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
+}
+EXPORT_SYMBOL_GPL(handle_fasteoi_nmi);
+
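An irqchip opts into this handler by advertising IRQCHIP_SUPPORTS_NMI and switching the line's flow handler from its NMI setup callback; a sketch with hypothetical chip callbacks:

	static int my_chip_nmi_setup(struct irq_data *d)
	{
		/* plus whatever hardware routing makes the line an NMI */
		irq_set_handler_locked(d, handle_fasteoi_nmi);
		return 0;
	}

	static void my_chip_nmi_teardown(struct irq_data *d)
	{
		irq_set_handler_locked(d, handle_fasteoi_irq);
	}

	static struct irq_chip my_chip = {
		.name			= "my-nmi-chip",
		.irq_mask		= my_chip_mask,
		.irq_unmask		= my_chip_unmask,
		.irq_eoi		= my_chip_eoi,
		.irq_nmi_setup		= my_chip_nmi_setup,
		.irq_nmi_teardown	= my_chip_nmi_teardown,
		.flags			= IRQCHIP_SUPPORTS_NMI,
	};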
+/**
* handle_edge_irq - edge type IRQ handler
* @desc: the interrupt description structure for this irq
*
@@ -855,7 +886,11 @@ void handle_percpu_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
- kstat_incr_irqs_this_cpu(desc);
+ /*
+ * PER CPU interrupts are not serialized. Do not touch
+ * desc->tot_count.
+ */
+ __kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
@@ -884,7 +919,11 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
unsigned int irq = irq_desc_get_irq(desc);
irqreturn_t res;
- kstat_incr_irqs_this_cpu(desc);
+ /*
+ * PER CPU interrupts are not serialized. Do not touch
+ * desc->tot_count.
+ */
+ __kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
@@ -908,6 +947,29 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
chip->irq_eoi(&desc->irq_data);
}
+/**
+ * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu
+ * dev ids
+ * @desc: the interrupt description structure for this irq
+ *
+ * Similar to handle_fasteoi_nmi, but handling the dev_id cookie
+ * as a percpu pointer.
+ */
+void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ struct irqaction *action = desc->action;
+ unsigned int irq = irq_desc_get_irq(desc);
+ irqreturn_t res;
+
+ trace_irq_handler_entry(irq, action);
+ res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
+ trace_irq_handler_exit(irq, action, res);
+
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
+}
+
static void
__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
int is_chained, const char *name)
@@ -1278,6 +1340,17 @@ void irq_chip_mask_parent(struct irq_data *data)
EXPORT_SYMBOL_GPL(irq_chip_mask_parent);
/**
+ * irq_chip_mask_ack_parent - Mask and acknowledge the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_mask_ack_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_mask_ack(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_mask_ack_parent);
+
+/**
* irq_chip_unmask_parent - Unmask the parent interrupt
* @data: Pointer to interrupt specific data
*/
@@ -1376,11 +1449,16 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
{
data = data->parent_data;
+
+ if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE)
+ return 0;
+
if (data->chip->irq_set_wake)
return data->chip->irq_set_wake(data, on);
return -ENOSYS;
}
+EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent);
#endif
/**
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 6f636136cccc..516c00a5e867 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -56,6 +56,7 @@ static const struct irq_bit_descr irqchip_flags[] = {
BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE),
BIT_MASK_DESCR(IRQCHIP_EOI_THREADED),
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI),
+ BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),
};
static void
@@ -140,6 +141,7 @@ static const struct irq_bit_descr irqdesc_istates[] = {
BIT_MASK_DESCR(IRQS_WAITING),
BIT_MASK_DESCR(IRQS_PENDING),
BIT_MASK_DESCR(IRQS_SUSPENDED),
+ BIT_MASK_DESCR(IRQS_NMI),
};
@@ -203,8 +205,8 @@ static ssize_t irq_debug_write(struct file *file, const char __user *user_buf,
chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
- if (irq_settings_is_level(desc)) {
- /* Can't do level, sorry */
+ if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) {
+ /* Can't do level nor NMIs, sorry */
err = -EINVAL;
} else {
desc->istate |= IRQS_PENDING;
@@ -256,8 +258,6 @@ static int __init irq_debugfs_init(void)
int irq;
root_dir = debugfs_create_dir("irq", NULL);
- if (!root_dir)
- return -ENOMEM;
irq_domain_debugfs_init(root_dir);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 5d5378ea0afe..f808c6a97dcc 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -84,8 +84,6 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
* @dev: device to request interrupt for
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs
- * @thread_fn: function to be called in a threaded interrupt context. NULL
- * for devices which handle everything in @handler
* @irqflags: Interrupt type flags
* @devname: An ascii name for the claiming device, dev_name(dev) if NULL
* @dev_id: A cookie passed back to the handler function
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 38554bc35375..6df5ddfdb0f8 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -166,7 +166,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
__irq_wake_thread(desc, action);
- /* Fall through to add to randomness */
+ /* Fall through - to add to randomness */
case IRQ_HANDLED:
*flags |= action->flags;
break;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ca6afa267070..70c3053bc1f6 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -49,6 +49,7 @@ enum {
* IRQS_WAITING - irq is waiting
* IRQS_PENDING - irq is pending and replayed later
* IRQS_SUSPENDED - irq is suspended
+ * IRQS_NMI - irq line is used to deliver NMIs
*/
enum {
IRQS_AUTODETECT = 0x00000001,
@@ -60,6 +61,7 @@ enum {
IRQS_PENDING = 0x00000200,
IRQS_SUSPENDED = 0x00000800,
IRQS_TIMINGS = 0x00001000,
+ IRQS_NMI = 0x00002000,
};
#include "debug.h"
@@ -242,12 +244,18 @@ static inline void irq_state_set_masked(struct irq_desc *desc)
#undef __irqd_to_state
-static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
+static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
__this_cpu_inc(*desc->kstat_irqs);
__this_cpu_inc(kstat.irqs_sum);
}
+static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
+{
+ __kstat_incr_irqs_this_cpu(desc);
+ desc->tot_count++;
+}
+
static inline int irq_desc_get_node(struct irq_desc *desc)
{
return irq_common_data_get_node(&desc->irq_common_data);
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 98a20e1594ce..b992f88c5613 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -25,10 +25,22 @@ static void irq_sim_irqunmask(struct irq_data *data)
irq_ctx->enabled = true;
}
+static int irq_sim_set_type(struct irq_data *data, unsigned int type)
+{
+ /* We only support rising and falling edge trigger types. */
+ if (type & ~IRQ_TYPE_EDGE_BOTH)
+ return -EINVAL;
+
+ irqd_set_trigger_type(data, type);
+
+ return 0;
+}
+
static struct irq_chip irq_sim_irqchip = {
.name = "irq_sim",
.irq_mask = irq_sim_irqmask,
.irq_unmask = irq_sim_irqunmask,
+ .irq_set_type = irq_sim_set_type,
};
static void irq_sim_handle_irq(struct irq_work *work)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index ef8ad36cadcf..9f8a709337cf 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -119,6 +119,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
desc->depth = 1;
desc->irq_count = 0;
desc->irqs_unhandled = 0;
+ desc->tot_count = 0;
desc->name = NULL;
desc->owner = owner;
for_each_possible_cpu(cpu)
@@ -557,6 +558,7 @@ int __init early_irq_init(void)
alloc_masks(&desc[i], node);
raw_spin_lock_init(&desc[i].lock);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
+ mutex_init(&desc[i].request_mutex);
desc_set_defaults(i, &desc[i], node, NULL, NULL);
}
return arch_early_irq_init();
@@ -669,6 +671,41 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
set_irq_regs(old_regs);
return ret;
}
+
+#ifdef CONFIG_IRQ_DOMAIN
+/**
+ * handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ * @regs: Register file coming from the low-level handling code
+ *
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ */
+int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
+ struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ unsigned int irq;
+ int ret = 0;
+
+ nmi_enter();
+
+ irq = irq_find_mapping(domain, hwirq);
+
+ /*
+ * ack_bad_irq is not NMI-safe, just report
+ * an invalid interrupt.
+ */
+ if (likely(irq))
+ generic_handle_irq(irq);
+ else
+ ret = -EINVAL;
+
+ nmi_exit();
+ set_irq_regs(old_regs);
+ return ret;
+}
+#endif
#endif
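The caller is expected to be the architecture's low-level NMI entry, once it has acknowledged the NMI and recovered the hardware interrupt number; roughly (all names hypothetical):

	asmlinkage void my_arch_handle_nmi(struct pt_regs *regs)
	{
		unsigned int hwirq = my_chip_ack_nmi();	/* read pending NMI from hw */

		handle_domain_nmi(my_root_domain, hwirq, regs);
	}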
/* Dynamic interrupt handling */
@@ -919,11 +956,15 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
unsigned int kstat_irqs(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- int cpu;
unsigned int sum = 0;
+ int cpu;
if (!desc || !desc->kstat_irqs)
return 0;
+ if (!irq_settings_is_per_cpu_devid(desc) &&
+ !irq_settings_is_per_cpu(desc))
+ return desc->tot_count;
+
for_each_possible_cpu(cpu)
sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
return sum;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8b0be4bd6565..9ed29e4a7dbf 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -458,6 +458,20 @@ void irq_set_default_host(struct irq_domain *domain)
}
EXPORT_SYMBOL_GPL(irq_set_default_host);
+/**
+ * irq_get_default_host() - Retrieve the "default" irq domain
+ *
+ * Returns: the default domain, if any.
+ *
+ * Modern code should never use this. This should only be used on
+ * systems that cannot implement a firmware->fwnode mapping (which
+ * both DT and ACPI provide).
+ */
+struct irq_domain *irq_get_default_host(void)
+{
+ return irq_default_domain;
+}
+
static void irq_domain_clear_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
@@ -729,16 +743,17 @@ static int irq_domain_translate(struct irq_domain *d,
return 0;
}
-static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data,
+static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
+ unsigned int count,
struct irq_fwspec *fwspec)
{
int i;
- fwspec->fwnode = irq_data->np ? &irq_data->np->fwnode : NULL;
- fwspec->param_count = irq_data->args_count;
+ fwspec->fwnode = np ? &np->fwnode : NULL;
+ fwspec->param_count = count;
- for (i = 0; i < irq_data->args_count; i++)
- fwspec->param[i] = irq_data->args[i];
+ for (i = 0; i < count; i++)
+ fwspec->param[i] = args[i];
}
unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
@@ -836,7 +851,9 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
{
struct irq_fwspec fwspec;
- of_phandle_args_to_fwspec(irq_data, &fwspec);
+ of_phandle_args_to_fwspec(irq_data->np, irq_data->args,
+ irq_data->args_count, &fwspec);
+
return irq_create_fwspec_mapping(&fwspec);
}
EXPORT_SYMBOL_GPL(irq_create_of_mapping);
@@ -928,11 +945,10 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
const u32 *intspec, unsigned int intsize,
irq_hw_number_t *out_hwirq, unsigned int *out_type)
{
- if (WARN_ON(intsize < 2))
- return -EINVAL;
- *out_hwirq = intspec[0];
- *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
- return 0;
+ struct irq_fwspec fwspec;
+
+ of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec);
+ return irq_domain_translate_twocell(d, &fwspec, out_hwirq, out_type);
}
EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
@@ -968,6 +984,27 @@ const struct irq_domain_ops irq_domain_simple_ops = {
};
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
+/**
+ * irq_domain_translate_twocell() - Generic translate for direct two cell
+ * bindings
+ *
+ * Device Tree IRQ specifier translation function which works with two cell
+ * bindings where the cell values map directly to the hwirq number
+ * and linux irq flags.
+ */
+int irq_domain_translate_twocell(struct irq_domain *d,
+ struct irq_fwspec *fwspec,
+ unsigned long *out_hwirq,
+ unsigned int *out_type)
+{
+ if (WARN_ON(fwspec->param_count < 2))
+ return -EINVAL;
+ *out_hwirq = fwspec->param[0];
+ *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_translate_twocell);
+
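For a two-cell devicetree specifier such as interrupts = <34 IRQ_TYPE_LEVEL_HIGH>;, the translation is simply *out_hwirq = 34 and *out_type = IRQ_TYPE_LEVEL_HIGH. Exporting the helper lets hierarchical domains point their .translate op at it directly; a sketch (the alloc callback is hypothetical):

	static const struct irq_domain_ops my_domain_ops = {
		.translate	= irq_domain_translate_twocell,
		.alloc		= my_domain_alloc,
		.free		= irq_domain_free_irqs_common,
	};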
int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
int node, const struct irq_affinity_desc *affinity)
{
@@ -1749,8 +1786,6 @@ void __init irq_domain_debugfs_init(struct dentry *root)
struct irq_domain *d;
domain_dir = debugfs_create_dir("domains", root);
- if (!domain_dir)
- return;
debugfs_create_file("default", 0444, domain_dir, NULL,
&irq_domain_debug_fops);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 84b54a17b95d..1401afa0d58a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -196,6 +196,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
cpumask_copy(desc->irq_common_data.affinity, mask);
+ /* fall through */
case IRQ_SET_MASK_OK_NOCOPY:
irq_validate_effective_affinity(data);
irq_set_thread_affinity(desc);
@@ -341,7 +342,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
/* The release function is promised process context */
might_sleep();
- if (!desc)
+ if (!desc || desc->istate & IRQS_NMI)
return -EINVAL;
/* Complete initialisation of *notify */
@@ -553,6 +554,21 @@ bool disable_hardirq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(disable_hardirq);
+/**
+ * disable_nmi_nosync - disable an nmi without waiting
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Disables and enables are
+ * nested.
+ * The interrupt to disable must have been requested through request_nmi.
+ * Unlike disable_nmi(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
+ */
+void disable_nmi_nosync(unsigned int irq)
+{
+ disable_irq_nosync(irq);
+}
+
void __enable_irq(struct irq_desc *desc)
{
switch (desc->depth) {
@@ -609,6 +625,20 @@ out:
}
EXPORT_SYMBOL(enable_irq);
+/**
+ * enable_nmi - enable handling of an nmi
+ * @irq: Interrupt to enable
+ *
+ * The interrupt to enable must have been requested through request_nmi.
+ * Undoes the effect of one call to disable_nmi(). If this
+ * matches the last disable, processing of interrupts on this
+ * IRQ line is re-enabled.
+ */
+void enable_nmi(unsigned int irq)
+{
+ enable_irq(irq);
+}
+
static int set_irq_wake_real(unsigned int irq, unsigned int on)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -644,6 +674,12 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
if (!desc)
return -EINVAL;
+ /* Don't use NMIs as wake up interrupts please */
+ if (desc->istate & IRQS_NMI) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
/* wakeup-capable irqs can be shared between drivers that
* don't need to have the same sleep mode behaviors.
*/
@@ -666,6 +702,8 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
}
}
+
+out_unlock:
irq_put_desc_busunlock(desc, flags);
return ret;
}
@@ -726,6 +764,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
case IRQ_SET_MASK_OK_DONE:
irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
irqd_set(&desc->irq_data, flags);
+ /* fall through */
case IRQ_SET_MASK_OK_NOCOPY:
flags = irqd_get_trigger_type(&desc->irq_data);
@@ -1128,6 +1167,39 @@ static void irq_release_resources(struct irq_desc *desc)
c->irq_release_resources(d);
}
+static bool irq_supports_nmi(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ /* Only IRQs directly managed by the root irqchip can be set as NMI */
+ if (d->parent_data)
+ return false;
+#endif
+ /* Don't support NMIs for chips behind a slow bus */
+ if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock)
+ return false;
+
+ return d->chip->flags & IRQCHIP_SUPPORTS_NMI;
+}
+
+static int irq_nmi_setup(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+ struct irq_chip *c = d->chip;
+
+ return c->irq_nmi_setup ? c->irq_nmi_setup(d) : -EINVAL;
+}
+
+static void irq_nmi_teardown(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+ struct irq_chip *c = d->chip;
+
+ if (c->irq_nmi_teardown)
+ c->irq_nmi_teardown(d);
+}
+
static int
setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
{
@@ -1302,9 +1374,17 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* fields must have IRQF_SHARED set and the bits which
* set the trigger type must match. Also all must
* agree on ONESHOT.
+ * Interrupt lines used for NMIs cannot be shared.
*/
unsigned int oldtype;
+ if (desc->istate & IRQS_NMI) {
+ pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n",
+ new->name, irq, desc->irq_data.chip->name);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
/*
* If nobody did set the configuration before, inherit
* the one provided by the requester.
@@ -1756,6 +1836,59 @@ const void *free_irq(unsigned int irq, void *dev_id)
}
EXPORT_SYMBOL(free_irq);
+/* This function must be called with desc->lock held */
+static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc)
+{
+ const char *devname = NULL;
+
+ desc->istate &= ~IRQS_NMI;
+
+ if (!WARN_ON(desc->action == NULL)) {
+ irq_pm_remove_action(desc, desc->action);
+ devname = desc->action->name;
+ unregister_handler_proc(irq, desc->action);
+
+ kfree(desc->action);
+ desc->action = NULL;
+ }
+
+ irq_settings_clr_disable_unlazy(desc);
+ irq_shutdown(desc);
+
+ irq_release_resources(desc);
+
+ irq_chip_pm_put(&desc->irq_data);
+ module_put(desc->owner);
+
+ return devname;
+}
+
+const void *free_nmi(unsigned int irq, void *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+ const void *devname;
+
+ if (!desc || WARN_ON(!(desc->istate & IRQS_NMI)))
+ return NULL;
+
+ if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+ return NULL;
+
+ /* NMI still enabled */
+ if (WARN_ON(desc->depth == 0))
+ disable_nmi_nosync(irq);
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ irq_nmi_teardown(desc);
+ devname = __cleanup_nmi(irq, desc);
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return devname;
+}
+
/**
* request_threaded_irq - allocate an interrupt line
* @irq: Interrupt line to allocate
@@ -1925,6 +2058,101 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
}
EXPORT_SYMBOL_GPL(request_any_context_irq);
+/**
+ * request_nmi - allocate an interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Threaded handler for threaded interrupts.
+ * @irqflags: Interrupt type flags
+ * @name: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the
+ * interrupt line and IRQ handling. It sets up the IRQ line
+ * to be handled as an NMI.
+ *
+ * An interrupt line delivering NMIs cannot be shared and IRQ handling
+ * cannot be threaded.
+ *
+ * Interrupt lines requested for NMI delivery must produce per-CPU
+ * interrupts and must have auto-enabling disabled.
+ *
+ * Dev_id must be globally unique. Normally the address of the
+ * device data structure is used as the cookie. Since the handler
+ * receives this value it makes sense to use it.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, the function
+ * will fail and return a negative value.
+ */
+int request_nmi(unsigned int irq, irq_handler_t handler,
+ unsigned long irqflags, const char *name, void *dev_id)
+{
+ struct irqaction *action;
+ struct irq_desc *desc;
+ unsigned long flags;
+ int retval;
+
+ if (irq == IRQ_NOTCONNECTED)
+ return -ENOTCONN;
+
+ /* NMIs cannot be shared, nor used for polling */
+ if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL))
+ return -EINVAL;
+
+ if (!(irqflags & IRQF_PERCPU))
+ return -EINVAL;
+
+ if (!handler)
+ return -EINVAL;
+
+ desc = irq_to_desc(irq);
+
+ if (!desc || irq_settings_can_autoenable(desc) ||
+ !irq_settings_can_request(desc) ||
+ WARN_ON(irq_settings_is_per_cpu_devid(desc)) ||
+ !irq_supports_nmi(desc))
+ return -EINVAL;
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING;
+ action->name = name;
+ action->dev_id = dev_id;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ goto err_out;
+
+ retval = __setup_irq(irq, desc, action);
+ if (retval)
+ goto err_irq_setup;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ /* Setup NMI state */
+ desc->istate |= IRQS_NMI;
+ retval = irq_nmi_setup(desc);
+ if (retval) {
+ __cleanup_nmi(irq, desc);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ return -EINVAL;
+ }
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+
+err_irq_setup:
+ irq_chip_pm_put(&desc->irq_data);
+err_out:
+ kfree(action);
+
+ return retval;
+}
+
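Hypothetical driver usage, assuming a non-shared line that is marked IRQ_NOAUTOEN and sits directly on an IRQCHIP_SUPPORTS_NMI root chip (handler and cookie names are illustrative):

	static irqreturn_t my_nmi_handler(int irq, void *dev_id)
	{
		/* Runs in NMI context: no sleeping, no regular spinlocks. */
		return IRQ_HANDLED;
	}

	err = request_nmi(irq, my_nmi_handler, IRQF_PERCPU, "my-nmi", &my_dev);
	if (!err)
		enable_nmi(irq);	/* the line is requested with autoenable off */

	/* teardown */
	disable_nmi_nosync(irq);
	free_nmi(irq, &my_dev);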
void enable_percpu_irq(unsigned int irq, unsigned int type)
{
unsigned int cpu = smp_processor_id();
@@ -1959,6 +2187,11 @@ out:
}
EXPORT_SYMBOL_GPL(enable_percpu_irq);
+void enable_percpu_nmi(unsigned int irq, unsigned int type)
+{
+ enable_percpu_irq(irq, type);
+}
+
/**
* irq_percpu_is_enabled - Check whether the per cpu irq is enabled
* @irq: Linux irq number to check for
@@ -1998,6 +2231,11 @@ void disable_percpu_irq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(disable_percpu_irq);
+void disable_percpu_nmi(unsigned int irq)
+{
+ disable_percpu_irq(irq);
+}
+
/*
* Internal function to unregister a percpu irqaction.
*/
@@ -2029,6 +2267,8 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
/* Found it - now remove it from the list of entries: */
desc->action = NULL;
+ desc->istate &= ~IRQS_NMI;
+
raw_spin_unlock_irqrestore(&desc->lock, flags);
unregister_handler_proc(irq, action);
@@ -2082,6 +2322,19 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
}
EXPORT_SYMBOL_GPL(free_percpu_irq);
+void free_percpu_nmi(unsigned int irq, void __percpu *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || !irq_settings_is_per_cpu_devid(desc))
+ return;
+
+ if (WARN_ON(!(desc->istate & IRQS_NMI)))
+ return;
+
+ kfree(__free_percpu_irq(irq, dev_id));
+}
+
/**
* setup_percpu_irq - setup a per-cpu interrupt
* @irq: Interrupt line to setup
@@ -2172,6 +2425,158 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
EXPORT_SYMBOL_GPL(__request_percpu_irq);
/**
+ * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * @name: An ascii name for the claiming device
+ * @dev_id: A percpu cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources for a per-CPU NMI. Per-CPU NMIs
+ * have to be set up on each CPU by calling prepare_percpu_nmi() before
+ * being enabled on that same CPU via enable_percpu_nmi().
+ *
+ * Dev_id must be globally unique. It is a per-cpu variable, and
+ * the handler gets called with the interrupted CPU's instance of
+ * that variable.
+ *
+ * Interrupt lines requested for NMI delivery should have auto-enabling
+ * disabled.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, the function
+ * will fail, returning a negative value.
+ */
+int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
+ const char *name, void __percpu *dev_id)
+{
+ struct irqaction *action;
+ struct irq_desc *desc;
+ unsigned long flags;
+ int retval;
+
+ if (!handler)
+ return -EINVAL;
+
+ desc = irq_to_desc(irq);
+
+ if (!desc || !irq_settings_can_request(desc) ||
+ !irq_settings_is_per_cpu_devid(desc) ||
+ irq_settings_can_autoenable(desc) ||
+ !irq_supports_nmi(desc))
+ return -EINVAL;
+
+ /* The line cannot already be NMI */
+ if (desc->istate & IRQS_NMI)
+ return -EINVAL;
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD
+ | IRQF_NOBALANCING;
+ action->name = name;
+ action->percpu_dev_id = dev_id;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ goto err_out;
+
+ retval = __setup_irq(irq, desc, action);
+ if (retval)
+ goto err_irq_setup;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ desc->istate |= IRQS_NMI;
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+
+err_irq_setup:
+ irq_chip_pm_put(&desc->irq_data);
+err_out:
+ kfree(action);
+
+ return retval;
+}
+
+/**
+ * prepare_percpu_nmi - performs CPU local setup for NMI delivery
+ * @irq: Interrupt line to prepare for NMI delivery
+ *
+ * This call prepares an interrupt line to deliver NMI on the current CPU,
+ * before that interrupt line gets enabled with enable_percpu_nmi().
+ *
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, the function
+ * will fail, returning a negative value.
+ */
+int prepare_percpu_nmi(unsigned int irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc;
+ int ret = 0;
+
+ WARN_ON(preemptible());
+
+ desc = irq_get_desc_lock(irq, &flags,
+ IRQ_GET_DESC_CHECK_PERCPU);
+ if (!desc)
+ return -EINVAL;
+
+ if (WARN(!(desc->istate & IRQS_NMI),
+ KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n",
+ irq)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = irq_nmi_setup(desc);
+ if (ret) {
+ pr_err("Failed to setup NMI delivery: irq %u\n", irq);
+ goto out;
+ }
+
+out:
+ irq_put_desc_unlock(desc, flags);
+ return ret;
+}
+
+/**
+ * teardown_percpu_nmi - undoes NMI setup of IRQ line
+ * @irq: Interrupt line from which CPU local NMI configuration should be
+ * removed
+ *
+ * This call undoes the setup done by prepare_percpu_nmi().
+ *
+ * IRQ line should not be enabled for the current CPU.
+ *
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
+ */
+void teardown_percpu_nmi(unsigned int irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc;
+
+ WARN_ON(preemptible());
+
+ desc = irq_get_desc_lock(irq, &flags,
+ IRQ_GET_DESC_CHECK_PERCPU);
+ if (!desc)
+ return;
+
+ if (WARN_ON(!(desc->istate & IRQS_NMI)))
+ goto out;
+
+ irq_nmi_teardown(desc);
+out:
+ irq_put_desc_unlock(desc, flags);
+}
+
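Putting the per-CPU pieces together, a hypothetical user (for example a PMU driver using pseudo-NMIs) would follow this lifecycle; the handler, cookie and hotplug placement are assumptions for the sketch:

	err = request_percpu_nmi(irq, my_percpu_nmi_handler, "my-pmu", &my_percpu_dev);

	/* On each CPU, typically from a hotplug online callback: */
	preempt_disable();
	if (!prepare_percpu_nmi(irq))
		enable_percpu_nmi(irq, IRQ_TYPE_NONE);
	preempt_enable();

	/* Mirror image on CPU offline: */
	preempt_disable();
	disable_percpu_nmi(irq);
	teardown_percpu_nmi(irq);
	preempt_enable();

	/* Finally, after all CPUs have torn down their local state: */
	free_percpu_nmi(irq, &my_percpu_dev);

The hard requirements are only those documented above: a per-cpu devid line with auto-enable disabled, and non-preemptible context for the prepare/teardown calls.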
+/**
* irq_get_irqchip_state - returns the irqchip state of a interrupt.
* @irq: Interrupt line that is forwarded to a VM
* @which: One of IRQCHIP_STATE_* the caller wants to know about
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index f3a04994e063..14934afa9e68 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -494,7 +494,7 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
static int get_ksymbol_bpf(struct kallsym_iter *iter)
{
- iter->module_name[0] = '\0';
+ strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN);
iter->exported = 0;
return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
&iter->value, &iter->type,
diff --git a/kernel/kcov.c b/kernel/kcov.c
index c2277dbdbfb1..2ee38727844a 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -20,6 +20,7 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/kcov.h>
+#include <linux/refcount.h>
#include <asm/setup.h>
/* Number of 64-bit words written per one comparison: */
@@ -44,7 +45,7 @@ struct kcov {
* - opened file descriptor
* - task with enabled coverage (we can't unwire it from another task)
*/
- atomic_t refcount;
+ refcount_t refcount;
/* The lock protects mode, size, area and t. */
spinlock_t lock;
enum kcov_mode mode;
@@ -228,12 +229,12 @@ EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
static void kcov_get(struct kcov *kcov)
{
- atomic_inc(&kcov->refcount);
+ refcount_inc(&kcov->refcount);
}
static void kcov_put(struct kcov *kcov)
{
- if (atomic_dec_and_test(&kcov->refcount)) {
+ if (refcount_dec_and_test(&kcov->refcount)) {
vfree(kcov->area);
kfree(kcov);
}
@@ -312,7 +313,7 @@ static int kcov_open(struct inode *inode, struct file *filep)
if (!kcov)
return -ENOMEM;
kcov->mode = KCOV_MODE_DISABLED;
- atomic_set(&kcov->refcount, 1);
+ refcount_set(&kcov->refcount, 1);
spin_lock_init(&kcov->lock);
filep->private_data = kcov;
return nonseekable_open(inode, filep);
@@ -444,10 +445,8 @@ static int __init kcov_init(void)
* there is no need to protect it against removal races. The
* use of debugfs_create_file_unsafe() is actually safe here.
*/
- if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) {
- pr_err("failed to create kcov in debugfs\n");
- return -ENOMEM;
- }
+ debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops);
+
return 0;
}
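The kcov hunk above is a mechanical atomic_t to refcount_t conversion; refcount_t is designed to saturate and warn on overflow/underflow rather than silently wrap. A generic sketch of the same lifecycle pattern (illustrative only, not kcov code):

#include <linux/refcount.h>
#include <linux/slab.h>

struct session {
	refcount_t refcount;
	/* ... payload ... */
};

static struct session *session_create(void)
{
	struct session *s = kzalloc(sizeof(*s), GFP_KERNEL);

	if (s)
		refcount_set(&s->refcount, 1);	/* initial reference */
	return s;
}

static void session_get(struct session *s)
{
	refcount_inc(&s->refcount);		/* WARNs rather than wrapping */
}

static void session_put(struct session *s)
{
	if (refcount_dec_and_test(&s->refcount))
		kfree(s);
}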
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index f4ddfdd2d07e..c83e54727131 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1396,7 +1396,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr)
addr < (unsigned long)__kprobes_text_end;
}
-bool within_kprobe_blacklist(unsigned long addr)
+static bool __within_kprobe_blacklist(unsigned long addr)
{
struct kprobe_blacklist_entry *ent;
@@ -1410,7 +1410,26 @@ bool within_kprobe_blacklist(unsigned long addr)
if (addr >= ent->start_addr && addr < ent->end_addr)
return true;
}
+ return false;
+}
+bool within_kprobe_blacklist(unsigned long addr)
+{
+ char symname[KSYM_NAME_LEN], *p;
+
+ if (__within_kprobe_blacklist(addr))
+ return true;
+
+ /* Check if the address belongs to a suffixed symbol (e.g. foo.isra.0) */
+ if (!lookup_symbol_name(addr, symname)) {
+ p = strchr(symname, '.');
+ if (!p)
+ return false;
+ *p = '\0';
+ addr = (unsigned long)kprobe_lookup_name(symname, 0);
+ if (addr)
+ return __within_kprobe_blacklist(addr);
+ }
return false;
}
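The fallback added above matters because GCC renames optimized clones with suffixes such as .constprop.0, .part.0 or .isra.0, so a blacklisted function may only be visible under a suffixed name. A stand-alone illustration of the suffix stripping (hypothetical helper, plain userspace C for brevity):

#include <stdio.h>
#include <string.h>

/* Strip an optimization suffix: "finish_task_switch.isra.0" -> "finish_task_switch". */
static void strip_symbol_suffix(char *symname)
{
	char *p = strchr(symname, '.');

	if (p)
		*p = '\0';
}

int main(void)
{
	char sym[] = "finish_task_switch.isra.0";

	strip_symbol_suffix(sym);
	printf("%s\n", sym);	/* prints "finish_task_switch" */
	return 0;
}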
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 087d18d771b5..5942eeafb9ac 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,6 +20,7 @@
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
+#include <linux/numa.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -101,6 +102,12 @@ bool kthread_should_stop(void)
}
EXPORT_SYMBOL(kthread_should_stop);
+bool __kthread_should_park(struct task_struct *k)
+{
+ return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
+}
+EXPORT_SYMBOL_GPL(__kthread_should_park);
+
/**
* kthread_should_park - should this kthread park now?
*
@@ -114,7 +121,7 @@ EXPORT_SYMBOL(kthread_should_stop);
*/
bool kthread_should_park(void)
{
- return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
+ return __kthread_should_park(current);
}
EXPORT_SYMBOL_GPL(kthread_should_park);
@@ -599,7 +606,7 @@ void __kthread_init_worker(struct kthread_worker *worker,
struct lock_class_key *key)
{
memset(worker, 0, sizeof(struct kthread_worker));
- spin_lock_init(&worker->lock);
+ raw_spin_lock_init(&worker->lock);
lockdep_set_class_and_name(&worker->lock, key, name);
INIT_LIST_HEAD(&worker->work_list);
INIT_LIST_HEAD(&worker->delayed_work_list);
@@ -641,21 +648,21 @@ repeat:
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
- spin_lock_irq(&worker->lock);
+ raw_spin_lock_irq(&worker->lock);
worker->task = NULL;
- spin_unlock_irq(&worker->lock);
+ raw_spin_unlock_irq(&worker->lock);
return 0;
}
work = NULL;
- spin_lock_irq(&worker->lock);
+ raw_spin_lock_irq(&worker->lock);
if (!list_empty(&worker->work_list)) {
work = list_first_entry(&worker->work_list,
struct kthread_work, node);
list_del_init(&work->node);
}
worker->current_work = work;
- spin_unlock_irq(&worker->lock);
+ raw_spin_unlock_irq(&worker->lock);
if (work) {
__set_current_state(TASK_RUNNING);
@@ -675,7 +682,7 @@ __kthread_create_worker(int cpu, unsigned int flags,
{
struct kthread_worker *worker;
struct task_struct *task;
- int node = -1;
+ int node = NUMA_NO_NODE;
worker = kzalloc(sizeof(*worker), GFP_KERNEL);
if (!worker)
@@ -812,12 +819,12 @@ bool kthread_queue_work(struct kthread_worker *worker,
bool ret = false;
unsigned long flags;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
if (!queuing_blocked(worker, work)) {
kthread_insert_work(worker, work, &worker->work_list);
ret = true;
}
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kthread_queue_work);
@@ -835,6 +842,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
struct kthread_work *work = &dwork->work;
struct kthread_worker *worker = work->worker;
+ unsigned long flags;
/*
* This might happen when a pending work is reinitialized.
@@ -843,7 +851,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
if (WARN_ON_ONCE(!worker))
return;
- spin_lock(&worker->lock);
+ raw_spin_lock_irqsave(&worker->lock, flags);
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
@@ -852,7 +860,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
list_del_init(&work->node);
kthread_insert_work(worker, work, &worker->work_list);
- spin_unlock(&worker->lock);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
}
EXPORT_SYMBOL(kthread_delayed_work_timer_fn);
@@ -908,14 +916,14 @@ bool kthread_queue_delayed_work(struct kthread_worker *worker,
unsigned long flags;
bool ret = false;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
if (!queuing_blocked(worker, work)) {
__kthread_queue_delayed_work(worker, dwork, delay);
ret = true;
}
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);
@@ -951,7 +959,7 @@ void kthread_flush_work(struct kthread_work *work)
if (!worker)
return;
- spin_lock_irq(&worker->lock);
+ raw_spin_lock_irq(&worker->lock);
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
@@ -963,7 +971,7 @@ void kthread_flush_work(struct kthread_work *work)
else
noop = true;
- spin_unlock_irq(&worker->lock);
+ raw_spin_unlock_irq(&worker->lock);
if (!noop)
wait_for_completion(&fwork.done);
@@ -996,9 +1004,9 @@ static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork,
* any queuing is blocked by setting the canceling counter.
*/
work->canceling++;
- spin_unlock_irqrestore(&worker->lock, *flags);
+ raw_spin_unlock_irqrestore(&worker->lock, *flags);
del_timer_sync(&dwork->timer);
- spin_lock_irqsave(&worker->lock, *flags);
+ raw_spin_lock_irqsave(&worker->lock, *flags);
work->canceling--;
}
@@ -1045,7 +1053,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,
unsigned long flags;
int ret = false;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
/* Do not bother with canceling when never queued. */
if (!work->worker)
@@ -1062,7 +1070,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,
fast_queue:
__kthread_queue_delayed_work(worker, dwork, delay);
out:
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);
@@ -1076,7 +1084,7 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
if (!worker)
goto out;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
@@ -1090,13 +1098,13 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
* In the meantime, block any queuing by setting the canceling counter.
*/
work->canceling++;
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
kthread_flush_work(work);
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
work->canceling--;
out_fast:
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
out:
return ret;
}
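The raw_spinlock_t conversion above leaves the kthread_worker API itself unchanged. For reference, a minimal, illustrative user of that API (all helpers are the existing ones from <linux/kthread.h>):

#include <linux/err.h>
#include <linux/kthread.h>

static struct kthread_worker *demo_worker;
static struct kthread_work demo_work;

static void demo_work_fn(struct kthread_work *work)
{
	/* Runs in the dedicated kthread; may sleep. */
}

static int demo_start(void)
{
	demo_worker = kthread_create_worker(0, "demo_worker");
	if (IS_ERR(demo_worker))
		return PTR_ERR(demo_worker);

	kthread_init_work(&demo_work, demo_work_fn);
	kthread_queue_work(demo_worker, &demo_work);	/* takes worker->lock */
	kthread_flush_work(&demo_work);
	kthread_destroy_worker(demo_worker);
	return 0;
}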
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 5b77a7314e01..eb0ee10a1981 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -45,7 +45,12 @@
*/
DEFINE_MUTEX(klp_mutex);
-static LIST_HEAD(klp_patches);
+/*
+ * Actively used patches: enabled or in transition. Note that replaced
+ * or disabled patches are not listed even though the related kernel
+ * module can still be loaded.
+ */
+LIST_HEAD(klp_patches);
static struct kobject *klp_root_kobj;
@@ -82,20 +87,43 @@ static void klp_find_object_module(struct klp_object *obj)
mutex_unlock(&module_mutex);
}
-static bool klp_is_patch_registered(struct klp_patch *patch)
+static bool klp_initialized(void)
+{
+ return !!klp_root_kobj;
+}
+
+static struct klp_func *klp_find_func(struct klp_object *obj,
+ struct klp_func *old_func)
{
- struct klp_patch *mypatch;
+ struct klp_func *func;
- list_for_each_entry(mypatch, &klp_patches, list)
- if (mypatch == patch)
- return true;
+ klp_for_each_func(obj, func) {
+ if ((strcmp(old_func->old_name, func->old_name) == 0) &&
+ (old_func->old_sympos == func->old_sympos)) {
+ return func;
+ }
+ }
- return false;
+ return NULL;
}
-static bool klp_initialized(void)
+static struct klp_object *klp_find_object(struct klp_patch *patch,
+ struct klp_object *old_obj)
{
- return !!klp_root_kobj;
+ struct klp_object *obj;
+
+ klp_for_each_object(patch, obj) {
+ if (klp_is_module(old_obj)) {
+ if (klp_is_module(obj) &&
+ strcmp(old_obj->name, obj->name) == 0) {
+ return obj;
+ }
+ } else if (!klp_is_module(obj)) {
+ return obj;
+ }
+ }
+
+ return NULL;
}
struct klp_find_arg {
@@ -278,170 +306,6 @@ static int klp_write_object_relocations(struct module *pmod,
return ret;
}
-static int __klp_disable_patch(struct klp_patch *patch)
-{
- struct klp_object *obj;
-
- if (WARN_ON(!patch->enabled))
- return -EINVAL;
-
- if (klp_transition_patch)
- return -EBUSY;
-
- /* enforce stacking: only the last enabled patch can be disabled */
- if (!list_is_last(&patch->list, &klp_patches) &&
- list_next_entry(patch, list)->enabled)
- return -EBUSY;
-
- klp_init_transition(patch, KLP_UNPATCHED);
-
- klp_for_each_object(patch, obj)
- if (obj->patched)
- klp_pre_unpatch_callback(obj);
-
- /*
- * Enforce the order of the func->transition writes in
- * klp_init_transition() and the TIF_PATCH_PENDING writes in
- * klp_start_transition(). In the rare case where klp_ftrace_handler()
- * is called shortly after klp_update_patch_state() switches the task,
- * this ensures the handler sees that func->transition is set.
- */
- smp_wmb();
-
- klp_start_transition();
- klp_try_complete_transition();
- patch->enabled = false;
-
- return 0;
-}
-
-/**
- * klp_disable_patch() - disables a registered patch
- * @patch: The registered, enabled patch to be disabled
- *
- * Unregisters the patched functions from ftrace.
- *
- * Return: 0 on success, otherwise error
- */
-int klp_disable_patch(struct klp_patch *patch)
-{
- int ret;
-
- mutex_lock(&klp_mutex);
-
- if (!klp_is_patch_registered(patch)) {
- ret = -EINVAL;
- goto err;
- }
-
- if (!patch->enabled) {
- ret = -EINVAL;
- goto err;
- }
-
- ret = __klp_disable_patch(patch);
-
-err:
- mutex_unlock(&klp_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(klp_disable_patch);
-
-static int __klp_enable_patch(struct klp_patch *patch)
-{
- struct klp_object *obj;
- int ret;
-
- if (klp_transition_patch)
- return -EBUSY;
-
- if (WARN_ON(patch->enabled))
- return -EINVAL;
-
- /* enforce stacking: only the first disabled patch can be enabled */
- if (patch->list.prev != &klp_patches &&
- !list_prev_entry(patch, list)->enabled)
- return -EBUSY;
-
- /*
- * A reference is taken on the patch module to prevent it from being
- * unloaded.
- */
- if (!try_module_get(patch->mod))
- return -ENODEV;
-
- pr_notice("enabling patch '%s'\n", patch->mod->name);
-
- klp_init_transition(patch, KLP_PATCHED);
-
- /*
- * Enforce the order of the func->transition writes in
- * klp_init_transition() and the ops->func_stack writes in
- * klp_patch_object(), so that klp_ftrace_handler() will see the
- * func->transition updates before the handler is registered and the
- * new funcs become visible to the handler.
- */
- smp_wmb();
-
- klp_for_each_object(patch, obj) {
- if (!klp_is_object_loaded(obj))
- continue;
-
- ret = klp_pre_patch_callback(obj);
- if (ret) {
- pr_warn("pre-patch callback failed for object '%s'\n",
- klp_is_module(obj) ? obj->name : "vmlinux");
- goto err;
- }
-
- ret = klp_patch_object(obj);
- if (ret) {
- pr_warn("failed to patch object '%s'\n",
- klp_is_module(obj) ? obj->name : "vmlinux");
- goto err;
- }
- }
-
- klp_start_transition();
- klp_try_complete_transition();
- patch->enabled = true;
-
- return 0;
-err:
- pr_warn("failed to enable patch '%s'\n", patch->mod->name);
-
- klp_cancel_transition();
- return ret;
-}
-
-/**
- * klp_enable_patch() - enables a registered patch
- * @patch: The registered, disabled patch to be enabled
- *
- * Performs the needed symbol lookups and code relocations,
- * then registers the patched functions with ftrace.
- *
- * Return: 0 on success, otherwise error
- */
-int klp_enable_patch(struct klp_patch *patch)
-{
- int ret;
-
- mutex_lock(&klp_mutex);
-
- if (!klp_is_patch_registered(patch)) {
- ret = -EINVAL;
- goto err;
- }
-
- ret = __klp_enable_patch(patch);
-
-err:
- mutex_unlock(&klp_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(klp_enable_patch);
-
/*
* Sysfs Interface
*
@@ -449,11 +313,11 @@ EXPORT_SYMBOL_GPL(klp_enable_patch);
* /sys/kernel/livepatch/<patch>
* /sys/kernel/livepatch/<patch>/enabled
* /sys/kernel/livepatch/<patch>/transition
- * /sys/kernel/livepatch/<patch>/signal
* /sys/kernel/livepatch/<patch>/force
* /sys/kernel/livepatch/<patch>/<object>
* /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
*/
+static int __klp_disable_patch(struct klp_patch *patch);
static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
@@ -470,40 +334,32 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
mutex_lock(&klp_mutex);
- if (!klp_is_patch_registered(patch)) {
- /*
- * Module with the patch could either disappear meanwhile or is
- * not properly initialized yet.
- */
- ret = -EINVAL;
- goto err;
- }
-
if (patch->enabled == enabled) {
/* already in requested state */
ret = -EINVAL;
- goto err;
+ goto out;
}
- if (patch == klp_transition_patch) {
+ /*
+ * Allow a pending transition to be reversed in either direction. This
+ * might be necessary to complete a stuck transition without resorting
+ * to forcing it, which would compromise system integrity.
+ *
+ * Do not allow re-enabling a disabled patch.
+ */
+ if (patch == klp_transition_patch)
klp_reverse_transition();
- } else if (enabled) {
- ret = __klp_enable_patch(patch);
- if (ret)
- goto err;
- } else {
+ else if (!enabled)
ret = __klp_disable_patch(patch);
- if (ret)
- goto err;
- }
+ else
+ ret = -EINVAL;
+out:
mutex_unlock(&klp_mutex);
+ if (ret)
+ return ret;
return count;
-
-err:
- mutex_unlock(&klp_mutex);
- return ret;
}
static ssize_t enabled_show(struct kobject *kobj,
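With the rework above, writing "0" to the documented /sys/kernel/livepatch/<patch>/enabled attribute ends up in __klp_disable_patch(), while writing "1" can only reverse a transition that is still pending; re-enabling a disabled patch is rejected. A small userspace sketch (the patch name is a placeholder):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int livepatch_set_enabled(const char *patch, int enabled)
{
	char path[256];
	int fd, ok;

	snprintf(path, sizeof(path), "/sys/kernel/livepatch/%s/enabled", patch);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ok = write(fd, enabled ? "1" : "0", 1) == 1;
	close(fd);
	return ok ? 0 : -1;
}

int main(void)
{
	return livepatch_set_enabled("livepatch_sample", 0);
}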
@@ -525,35 +381,6 @@ static ssize_t transition_show(struct kobject *kobj,
patch == klp_transition_patch);
}
-static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- struct klp_patch *patch;
- int ret;
- bool val;
-
- ret = kstrtobool(buf, &val);
- if (ret)
- return ret;
-
- if (!val)
- return count;
-
- mutex_lock(&klp_mutex);
-
- patch = container_of(kobj, struct klp_patch, kobj);
- if (patch != klp_transition_patch) {
- mutex_unlock(&klp_mutex);
- return -EINVAL;
- }
-
- klp_send_signals();
-
- mutex_unlock(&klp_mutex);
-
- return count;
-}
-
static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
@@ -585,16 +412,129 @@ static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition);
-static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal);
static struct kobj_attribute force_kobj_attr = __ATTR_WO(force);
static struct attribute *klp_patch_attrs[] = {
&enabled_kobj_attr.attr,
&transition_kobj_attr.attr,
- &signal_kobj_attr.attr,
&force_kobj_attr.attr,
NULL
};
+static void klp_free_object_dynamic(struct klp_object *obj)
+{
+ kfree(obj->name);
+ kfree(obj);
+}
+
+static struct klp_object *klp_alloc_object_dynamic(const char *name)
+{
+ struct klp_object *obj;
+
+ obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj)
+ return NULL;
+
+ if (name) {
+ obj->name = kstrdup(name, GFP_KERNEL);
+ if (!obj->name) {
+ kfree(obj);
+ return NULL;
+ }
+ }
+
+ INIT_LIST_HEAD(&obj->func_list);
+ obj->dynamic = true;
+
+ return obj;
+}
+
+static void klp_free_func_nop(struct klp_func *func)
+{
+ kfree(func->old_name);
+ kfree(func);
+}
+
+static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func,
+ struct klp_object *obj)
+{
+ struct klp_func *func;
+
+ func = kzalloc(sizeof(*func), GFP_KERNEL);
+ if (!func)
+ return NULL;
+
+ if (old_func->old_name) {
+ func->old_name = kstrdup(old_func->old_name, GFP_KERNEL);
+ if (!func->old_name) {
+ kfree(func);
+ return NULL;
+ }
+ }
+
+ /*
+ * func->new_func is the same as func->old_func. These addresses are
+ * set when the object is loaded, see klp_init_object_loaded().
+ */
+ func->old_sympos = old_func->old_sympos;
+ func->nop = true;
+
+ return func;
+}
+
+static int klp_add_object_nops(struct klp_patch *patch,
+ struct klp_object *old_obj)
+{
+ struct klp_object *obj;
+ struct klp_func *func, *old_func;
+
+ obj = klp_find_object(patch, old_obj);
+
+ if (!obj) {
+ obj = klp_alloc_object_dynamic(old_obj->name);
+ if (!obj)
+ return -ENOMEM;
+
+ list_add_tail(&obj->node, &patch->obj_list);
+ }
+
+ klp_for_each_func(old_obj, old_func) {
+ func = klp_find_func(obj, old_func);
+ if (func)
+ continue;
+
+ func = klp_alloc_func_nop(old_func, obj);
+ if (!func)
+ return -ENOMEM;
+
+ list_add_tail(&func->node, &obj->func_list);
+ }
+
+ return 0;
+}
+
+/*
+ * Add 'nop' functions which simply return to the caller to run
+ * the original function. The 'nop' functions are added to a
+ * patch to facilitate a 'replace' mode.
+ */
+static int klp_add_nops(struct klp_patch *patch)
+{
+ struct klp_patch *old_patch;
+ struct klp_object *old_obj;
+
+ klp_for_each_patch(old_patch) {
+ klp_for_each_object(old_patch, old_obj) {
+ int err;
+
+ err = klp_add_object_nops(patch, old_obj);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
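The effect of klp_add_nops() on a cumulative replacement patch, shown schematically with hypothetical function names:

/*
 * Hypothetical example: an older patch replaced foo() and bar(); the new
 * cumulative patch (patch->replace == true) only lists foo().
 *
 *   new patch
 *     object "vmlinux"
 *       func "foo" -> livepatch_foo()   (listed by the patch author)
 *       func "bar" -> nop               (added by klp_add_object_nops())
 *
 * When the transition completes, the nop lets bar() run its original
 * code again, and the older patch can be discarded atomically.
 */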
static void klp_kobj_release_patch(struct kobject *kobj)
{
struct klp_patch *patch;
@@ -611,6 +551,12 @@ static struct kobj_type klp_ktype_patch = {
static void klp_kobj_release_object(struct kobject *kobj)
{
+ struct klp_object *obj;
+
+ obj = container_of(kobj, struct klp_object, kobj);
+
+ if (obj->dynamic)
+ klp_free_object_dynamic(obj);
}
static struct kobj_type klp_ktype_object = {
@@ -620,6 +566,12 @@ static struct kobj_type klp_ktype_object = {
static void klp_kobj_release_func(struct kobject *kobj)
{
+ struct klp_func *func;
+
+ func = container_of(kobj, struct klp_func, kobj);
+
+ if (func->nop)
+ klp_free_func_nop(func);
}
static struct kobj_type klp_ktype_func = {
@@ -627,17 +579,23 @@ static struct kobj_type klp_ktype_func = {
.sysfs_ops = &kobj_sysfs_ops,
};
-/*
- * Free all functions' kobjects in the array up to some limit. When limit is
- * NULL, all kobjects are freed.
- */
-static void klp_free_funcs_limited(struct klp_object *obj,
- struct klp_func *limit)
+static void __klp_free_funcs(struct klp_object *obj, bool nops_only)
{
- struct klp_func *func;
+ struct klp_func *func, *tmp_func;
+
+ klp_for_each_func_safe(obj, func, tmp_func) {
+ if (nops_only && !func->nop)
+ continue;
+
+ list_del(&func->node);
- for (func = obj->funcs; func->old_name && func != limit; func++)
- kobject_put(&func->kobj);
+ /* Might be called from klp_init_patch() error path. */
+ if (func->kobj_added) {
+ kobject_put(&func->kobj);
+ } else if (func->nop) {
+ klp_free_func_nop(func);
+ }
+ }
}
/* Clean up when a patched object is unloaded */
@@ -647,35 +605,111 @@ static void klp_free_object_loaded(struct klp_object *obj)
obj->mod = NULL;
- klp_for_each_func(obj, func)
- func->old_addr = 0;
+ klp_for_each_func(obj, func) {
+ func->old_func = NULL;
+
+ if (func->nop)
+ func->new_func = NULL;
+ }
}
-/*
- * Free all objects' kobjects in the array up to some limit. When limit is
- * NULL, all kobjects are freed.
- */
-static void klp_free_objects_limited(struct klp_patch *patch,
- struct klp_object *limit)
+static void __klp_free_objects(struct klp_patch *patch, bool nops_only)
{
- struct klp_object *obj;
+ struct klp_object *obj, *tmp_obj;
+
+ klp_for_each_object_safe(patch, obj, tmp_obj) {
+ __klp_free_funcs(obj, nops_only);
+
+ if (nops_only && !obj->dynamic)
+ continue;
+
+ list_del(&obj->node);
- for (obj = patch->objs; obj->funcs && obj != limit; obj++) {
- klp_free_funcs_limited(obj, NULL);
- kobject_put(&obj->kobj);
+ /* Might be called from klp_init_patch() error path. */
+ if (obj->kobj_added) {
+ kobject_put(&obj->kobj);
+ } else if (obj->dynamic) {
+ klp_free_object_dynamic(obj);
+ }
}
}
-static void klp_free_patch(struct klp_patch *patch)
+static void klp_free_objects(struct klp_patch *patch)
+{
+ __klp_free_objects(patch, false);
+}
+
+static void klp_free_objects_dynamic(struct klp_patch *patch)
+{
+ __klp_free_objects(patch, true);
+}
+
+/*
+ * This function implements the free operations that can be called safely
+ * under klp_mutex.
+ *
+ * The operation must be completed by calling klp_free_patch_finish()
+ * outside klp_mutex.
+ */
+void klp_free_patch_start(struct klp_patch *patch)
{
- klp_free_objects_limited(patch, NULL);
if (!list_empty(&patch->list))
list_del(&patch->list);
+
+ klp_free_objects(patch);
+}
+
+/*
+ * This function implements the free part that must be called outside
+ * klp_mutex.
+ *
+ * It must be called after klp_free_patch_start(). And it has to be
+ * the last function accessing the livepatch structures when the patch
+ * gets disabled.
+ */
+static void klp_free_patch_finish(struct klp_patch *patch)
+{
+ /*
+ * Avoid deadlock with enabled_store() sysfs callback by
+ * calling this outside klp_mutex. It is safe because
+ * this is called when the patch gets disabled and it
+ * cannot get enabled again.
+ */
+ if (patch->kobj_added) {
+ kobject_put(&patch->kobj);
+ wait_for_completion(&patch->finish);
+ }
+
+ /* Put the module after the last access to struct klp_patch. */
+ if (!patch->forced)
+ module_put(patch->mod);
+}
+
+/*
+ * The livepatch might be freed via the sysfs interface created by the patch.
+ * This work item allows waiting until the interface is destroyed in a
+ * separate context.
+ */
+static void klp_free_patch_work_fn(struct work_struct *work)
+{
+ struct klp_patch *patch =
+ container_of(work, struct klp_patch, free_work);
+
+ klp_free_patch_finish(patch);
}
static int klp_init_func(struct klp_object *obj, struct klp_func *func)
{
- if (!func->old_name || !func->new_func)
+ int ret;
+
+ if (!func->old_name)
+ return -EINVAL;
+
+ /*
+ * NOPs get the address later. The patched module must be loaded,
+ * see klp_init_object_loaded().
+ */
+ if (!func->new_func && !func->nop)
return -EINVAL;
if (strlen(func->old_name) >= KSYM_NAME_LEN)
@@ -690,9 +724,13 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
* object. If the user selects 0 for old_sympos, then 1 will be used
* since a unique symbol will be the first occurrence.
*/
- return kobject_init_and_add(&func->kobj, &klp_ktype_func,
- &obj->kobj, "%s,%lu", func->old_name,
- func->old_sympos ? func->old_sympos : 1);
+ ret = kobject_init_and_add(&func->kobj, &klp_ktype_func,
+ &obj->kobj, "%s,%lu", func->old_name,
+ func->old_sympos ? func->old_sympos : 1);
+ if (!ret)
+ func->kobj_added = true;
+
+ return ret;
}
/* Arches may override this to finish any remaining arch-specific tasks */
@@ -721,11 +759,11 @@ static int klp_init_object_loaded(struct klp_patch *patch,
klp_for_each_func(obj, func) {
ret = klp_find_object_symbol(obj->name, func->old_name,
func->old_sympos,
- &func->old_addr);
+ (unsigned long *)&func->old_func);
if (ret)
return ret;
- ret = kallsyms_lookup_size_offset(func->old_addr,
+ ret = kallsyms_lookup_size_offset((unsigned long)func->old_func,
&func->old_size, NULL);
if (!ret) {
pr_err("kallsyms size lookup failed for '%s'\n",
@@ -733,6 +771,9 @@ static int klp_init_object_loaded(struct klp_patch *patch,
return -ENOENT;
}
+ if (func->nop)
+ func->new_func = func->old_func;
+
ret = kallsyms_lookup_size_offset((unsigned long)func->new_func,
&func->new_size, NULL);
if (!ret) {
@@ -751,9 +792,6 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
int ret;
const char *name;
- if (!obj->funcs)
- return -EINVAL;
-
if (klp_is_module(obj) && strlen(obj->name) >= MODULE_NAME_LEN)
return -EINVAL;
@@ -767,122 +805,191 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
&patch->kobj, "%s", name);
if (ret)
return ret;
+ obj->kobj_added = true;
klp_for_each_func(obj, func) {
ret = klp_init_func(obj, func);
if (ret)
- goto free;
+ return ret;
}
- if (klp_is_object_loaded(obj)) {
+ if (klp_is_object_loaded(obj))
ret = klp_init_object_loaded(patch, obj);
- if (ret)
- goto free;
- }
- return 0;
-
-free:
- klp_free_funcs_limited(obj, func);
- kobject_put(&obj->kobj);
return ret;
}
-static int klp_init_patch(struct klp_patch *patch)
+static int klp_init_patch_early(struct klp_patch *patch)
{
struct klp_object *obj;
- int ret;
+ struct klp_func *func;
if (!patch->objs)
return -EINVAL;
- mutex_lock(&klp_mutex);
-
+ INIT_LIST_HEAD(&patch->list);
+ INIT_LIST_HEAD(&patch->obj_list);
+ patch->kobj_added = false;
patch->enabled = false;
+ patch->forced = false;
+ INIT_WORK(&patch->free_work, klp_free_patch_work_fn);
init_completion(&patch->finish);
+ klp_for_each_object_static(patch, obj) {
+ if (!obj->funcs)
+ return -EINVAL;
+
+ INIT_LIST_HEAD(&obj->func_list);
+ obj->kobj_added = false;
+ list_add_tail(&obj->node, &patch->obj_list);
+
+ klp_for_each_func_static(obj, func) {
+ func->kobj_added = false;
+ list_add_tail(&func->node, &obj->func_list);
+ }
+ }
+
+ if (!try_module_get(patch->mod))
+ return -ENODEV;
+
+ return 0;
+}
+
+static int klp_init_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
+ int ret;
+
ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
klp_root_kobj, "%s", patch->mod->name);
- if (ret) {
- mutex_unlock(&klp_mutex);
+ if (ret)
return ret;
+ patch->kobj_added = true;
+
+ if (patch->replace) {
+ ret = klp_add_nops(patch);
+ if (ret)
+ return ret;
}
klp_for_each_object(patch, obj) {
ret = klp_init_object(patch, obj);
if (ret)
- goto free;
+ return ret;
}
list_add_tail(&patch->list, &klp_patches);
- mutex_unlock(&klp_mutex);
-
return 0;
+}
-free:
- klp_free_objects_limited(patch, obj);
+static int __klp_disable_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
- mutex_unlock(&klp_mutex);
+ if (WARN_ON(!patch->enabled))
+ return -EINVAL;
- kobject_put(&patch->kobj);
- wait_for_completion(&patch->finish);
+ if (klp_transition_patch)
+ return -EBUSY;
- return ret;
+ klp_init_transition(patch, KLP_UNPATCHED);
+
+ klp_for_each_object(patch, obj)
+ if (obj->patched)
+ klp_pre_unpatch_callback(obj);
+
+ /*
+ * Enforce the order of the func->transition writes in
+ * klp_init_transition() and the TIF_PATCH_PENDING writes in
+ * klp_start_transition(). In the rare case where klp_ftrace_handler()
+ * is called shortly after klp_update_patch_state() switches the task,
+ * this ensures the handler sees that func->transition is set.
+ */
+ smp_wmb();
+
+ klp_start_transition();
+ patch->enabled = false;
+ klp_try_complete_transition();
+
+ return 0;
}
-/**
- * klp_unregister_patch() - unregisters a patch
- * @patch: Disabled patch to be unregistered
- *
- * Frees the data structures and removes the sysfs interface.
- *
- * Return: 0 on success, otherwise error
- */
-int klp_unregister_patch(struct klp_patch *patch)
+static int __klp_enable_patch(struct klp_patch *patch)
{
+ struct klp_object *obj;
int ret;
- mutex_lock(&klp_mutex);
+ if (klp_transition_patch)
+ return -EBUSY;
- if (!klp_is_patch_registered(patch)) {
- ret = -EINVAL;
- goto err;
- }
+ if (WARN_ON(patch->enabled))
+ return -EINVAL;
- if (patch->enabled) {
- ret = -EBUSY;
- goto err;
- }
+ if (!patch->kobj_added)
+ return -EINVAL;
- klp_free_patch(patch);
+ pr_notice("enabling patch '%s'\n", patch->mod->name);
- mutex_unlock(&klp_mutex);
+ klp_init_transition(patch, KLP_PATCHED);
+
+ /*
+ * Enforce the order of the func->transition writes in
+ * klp_init_transition() and the ops->func_stack writes in
+ * klp_patch_object(), so that klp_ftrace_handler() will see the
+ * func->transition updates before the handler is registered and the
+ * new funcs become visible to the handler.
+ */
+ smp_wmb();
+
+ klp_for_each_object(patch, obj) {
+ if (!klp_is_object_loaded(obj))
+ continue;
+
+ ret = klp_pre_patch_callback(obj);
+ if (ret) {
+ pr_warn("pre-patch callback failed for object '%s'\n",
+ klp_is_module(obj) ? obj->name : "vmlinux");
+ goto err;
+ }
+
+ ret = klp_patch_object(obj);
+ if (ret) {
+ pr_warn("failed to patch object '%s'\n",
+ klp_is_module(obj) ? obj->name : "vmlinux");
+ goto err;
+ }
+ }
- kobject_put(&patch->kobj);
- wait_for_completion(&patch->finish);
+ klp_start_transition();
+ patch->enabled = true;
+ klp_try_complete_transition();
return 0;
err:
- mutex_unlock(&klp_mutex);
+ pr_warn("failed to enable patch '%s'\n", patch->mod->name);
+
+ klp_cancel_transition();
return ret;
}
-EXPORT_SYMBOL_GPL(klp_unregister_patch);
/**
- * klp_register_patch() - registers a patch
- * @patch: Patch to be registered
+ * klp_enable_patch() - enable the livepatch
+ * @patch: patch to be enabled
*
- * Initializes the data structure associated with the patch and
- * creates the sysfs interface.
+ * Initializes the data structure associated with the patch, creates the sysfs
+ * interface, performs the needed symbol lookups and code relocations,
+ * registers the patched functions with ftrace.
*
- * There is no need to take the reference on the patch module here. It is done
- * later when the patch is enabled.
+ * This function is supposed to be called from the livepatch module_init()
+ * callback.
*
* Return: 0 on success, otherwise error
*/
-int klp_register_patch(struct klp_patch *patch)
+int klp_enable_patch(struct klp_patch *patch)
{
+ int ret;
+
if (!patch || !patch->mod)
return -EINVAL;
@@ -897,12 +1004,91 @@ int klp_register_patch(struct klp_patch *patch)
if (!klp_have_reliable_stack()) {
pr_err("This architecture doesn't have support for the livepatch consistency model.\n");
- return -ENOSYS;
+ return -EOPNOTSUPP;
+ }
+
+
+ mutex_lock(&klp_mutex);
+
+ ret = klp_init_patch_early(patch);
+ if (ret) {
+ mutex_unlock(&klp_mutex);
+ return ret;
}
- return klp_init_patch(patch);
+ ret = klp_init_patch(patch);
+ if (ret)
+ goto err;
+
+ ret = __klp_enable_patch(patch);
+ if (ret)
+ goto err;
+
+ mutex_unlock(&klp_mutex);
+
+ return 0;
+
+err:
+ klp_free_patch_start(patch);
+
+ mutex_unlock(&klp_mutex);
+
+ klp_free_patch_finish(patch);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(klp_enable_patch);
+
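With the registration API gone, a livepatch module now calls klp_enable_patch() directly from its module_init() callback. A minimal sketch modeled on samples/livepatch; the patched function and its replacement are placeholders:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>
#include <linux/livepatch.h>

/* Hypothetical replacement for a vmlinux function. */
static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", "this has been live patched");
	return 0;
}

static struct klp_func funcs[] = {
	{
		.old_name = "cmdline_proc_show",
		.new_func = livepatch_cmdline_proc_show,
	}, { }
};

static struct klp_object objs[] = {
	{
		/* name being NULL means vmlinux */
		.funcs = funcs,
	}, { }
};

static struct klp_patch patch = {
	.mod = THIS_MODULE,
	.objs = objs,
	/* .replace = true,  -- for a cumulative (atomic replace) patch */
};

static int livepatch_init(void)
{
	return klp_enable_patch(&patch);
}

static void livepatch_exit(void)
{
}

module_init(livepatch_init);
module_exit(livepatch_exit);
MODULE_LICENSE("GPL");
MODULE_INFO(livepatch, "Y");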
+/*
+ * This function removes replaced patches.
+ *
+ * We could be pretty aggressive here. It is called in the situation where
+ * these structures are no longer accessible. All functions are redirected
+ * by the klp_transition_patch. They use either a new code or they are in
+ * the original code because of the special nop function patches.
+ *
+ * The only exception is when the transition was forced. In this case,
+ * klp_ftrace_handler() might still see the replaced patch on the stack.
+ * Fortunately, it is carefully designed to work with removed functions
+ * thanks to RCU. We only have to keep the patches on the system. Also
+ * this is handled transparently by patch->module_put.
+ */
+void klp_discard_replaced_patches(struct klp_patch *new_patch)
+{
+ struct klp_patch *old_patch, *tmp_patch;
+
+ klp_for_each_patch_safe(old_patch, tmp_patch) {
+ if (old_patch == new_patch)
+ return;
+
+ old_patch->enabled = false;
+ klp_unpatch_objects(old_patch);
+ klp_free_patch_start(old_patch);
+ schedule_work(&old_patch->free_work);
+ }
+}
+
+/*
+ * This function removes the dynamically allocated 'nop' functions.
+ *
+ * We could be pretty aggressive. NOPs do not change the existing
+ * behavior except for adding unnecessary delay by the ftrace handler.
+ *
+ * It is safe even when the transition was forced. The ftrace handler
+ * will see a valid ops->func_stack entry thanks to RCU.
+ *
+ * We could even free the NOPs structures. They must be the last entry
+ * in ops->func_stack. Therefore unregister_ftrace_function() is called.
+ * It does the same as klp_synchronize_transition() to make sure that
+ * nobody is inside the ftrace handler once the operation finishes.
+ *
+ * IMPORTANT: It must be called right after removing the replaced patches!
+ */
+void klp_discard_nops(struct klp_patch *new_patch)
+{
+ klp_unpatch_objects_dynamic(klp_transition_patch);
+ klp_free_objects_dynamic(klp_transition_patch);
}
-EXPORT_SYMBOL_GPL(klp_register_patch);
/*
* Remove parts of patches that touch a given kernel module. The list of
@@ -915,7 +1101,7 @@ static void klp_cleanup_module_patches_limited(struct module *mod,
struct klp_patch *patch;
struct klp_object *obj;
- list_for_each_entry(patch, &klp_patches, list) {
+ klp_for_each_patch(patch) {
if (patch == limit)
break;
@@ -923,21 +1109,14 @@ static void klp_cleanup_module_patches_limited(struct module *mod,
if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
continue;
- /*
- * Only unpatch the module if the patch is enabled or
- * is in transition.
- */
- if (patch->enabled || patch == klp_transition_patch) {
+ if (patch != klp_transition_patch)
+ klp_pre_unpatch_callback(obj);
- if (patch != klp_transition_patch)
- klp_pre_unpatch_callback(obj);
+ pr_notice("reverting patch '%s' on unloading module '%s'\n",
+ patch->mod->name, obj->mod->name);
+ klp_unpatch_object(obj);
- pr_notice("reverting patch '%s' on unloading module '%s'\n",
- patch->mod->name, obj->mod->name);
- klp_unpatch_object(obj);
-
- klp_post_unpatch_callback(obj);
- }
+ klp_post_unpatch_callback(obj);
klp_free_object_loaded(obj);
break;
@@ -962,7 +1141,7 @@ int klp_module_coming(struct module *mod)
*/
mod->klp_alive = true;
- list_for_each_entry(patch, &klp_patches, list) {
+ klp_for_each_patch(patch) {
klp_for_each_object(patch, obj) {
if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
continue;
@@ -976,13 +1155,6 @@ int klp_module_coming(struct module *mod)
goto err;
}
- /*
- * Only patch the module if the patch is enabled or is
- * in transition.
- */
- if (!patch->enabled && patch != klp_transition_patch)
- break;
-
pr_notice("applying patch '%s' to loading module '%s'\n",
patch->mod->name, obj->mod->name);
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
index 48a83d4364cf..ec43a40b853f 100644
--- a/kernel/livepatch/core.h
+++ b/kernel/livepatch/core.h
@@ -5,6 +5,17 @@
#include <linux/livepatch.h>
extern struct mutex klp_mutex;
+extern struct list_head klp_patches;
+
+#define klp_for_each_patch_safe(patch, tmp_patch) \
+ list_for_each_entry_safe(patch, tmp_patch, &klp_patches, list)
+
+#define klp_for_each_patch(patch) \
+ list_for_each_entry(patch, &klp_patches, list)
+
+void klp_free_patch_start(struct klp_patch *patch);
+void klp_discard_replaced_patches(struct klp_patch *new_patch);
+void klp_discard_nops(struct klp_patch *new_patch);
static inline bool klp_is_object_loaded(struct klp_object *obj)
{
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 7702cb4064fc..99cb3ad05eb4 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -34,7 +34,7 @@
static LIST_HEAD(klp_ops);
-struct klp_ops *klp_find_ops(unsigned long old_addr)
+struct klp_ops *klp_find_ops(void *old_func)
{
struct klp_ops *ops;
struct klp_func *func;
@@ -42,7 +42,7 @@ struct klp_ops *klp_find_ops(unsigned long old_addr)
list_for_each_entry(ops, &klp_ops, node) {
func = list_first_entry(&ops->func_stack, struct klp_func,
stack_node);
- if (func->old_addr == old_addr)
+ if (func->old_func == old_func)
return ops;
}
@@ -118,7 +118,15 @@ static void notrace klp_ftrace_handler(unsigned long ip,
}
}
+ /*
+ * NOPs are used to replace existing patches with original code.
+ * Do nothing! Setting pc would cause an infinite loop.
+ */
+ if (func->nop)
+ goto unlock;
+
klp_arch_set_pc(regs, (unsigned long)func->new_func);
+
unlock:
preempt_enable_notrace();
}
@@ -142,17 +150,18 @@ static void klp_unpatch_func(struct klp_func *func)
if (WARN_ON(!func->patched))
return;
- if (WARN_ON(!func->old_addr))
+ if (WARN_ON(!func->old_func))
return;
- ops = klp_find_ops(func->old_addr);
+ ops = klp_find_ops(func->old_func);
if (WARN_ON(!ops))
return;
if (list_is_singular(&ops->func_stack)) {
unsigned long ftrace_loc;
- ftrace_loc = klp_get_ftrace_location(func->old_addr);
+ ftrace_loc =
+ klp_get_ftrace_location((unsigned long)func->old_func);
if (WARN_ON(!ftrace_loc))
return;
@@ -174,17 +183,18 @@ static int klp_patch_func(struct klp_func *func)
struct klp_ops *ops;
int ret;
- if (WARN_ON(!func->old_addr))
+ if (WARN_ON(!func->old_func))
return -EINVAL;
if (WARN_ON(func->patched))
return -EINVAL;
- ops = klp_find_ops(func->old_addr);
+ ops = klp_find_ops(func->old_func);
if (!ops) {
unsigned long ftrace_loc;
- ftrace_loc = klp_get_ftrace_location(func->old_addr);
+ ftrace_loc =
+ klp_get_ftrace_location((unsigned long)func->old_func);
if (!ftrace_loc) {
pr_err("failed to find location for function '%s'\n",
func->old_name);
@@ -236,15 +246,26 @@ err:
return ret;
}
-void klp_unpatch_object(struct klp_object *obj)
+static void __klp_unpatch_object(struct klp_object *obj, bool nops_only)
{
struct klp_func *func;
- klp_for_each_func(obj, func)
+ klp_for_each_func(obj, func) {
+ if (nops_only && !func->nop)
+ continue;
+
if (func->patched)
klp_unpatch_func(func);
+ }
- obj->patched = false;
+ if (obj->dynamic || !nops_only)
+ obj->patched = false;
+}
+
+
+void klp_unpatch_object(struct klp_object *obj)
+{
+ __klp_unpatch_object(obj, false);
}
int klp_patch_object(struct klp_object *obj)
@@ -267,11 +288,21 @@ int klp_patch_object(struct klp_object *obj)
return 0;
}
-void klp_unpatch_objects(struct klp_patch *patch)
+static void __klp_unpatch_objects(struct klp_patch *patch, bool nops_only)
{
struct klp_object *obj;
klp_for_each_object(patch, obj)
if (obj->patched)
- klp_unpatch_object(obj);
+ __klp_unpatch_object(obj, nops_only);
+}
+
+void klp_unpatch_objects(struct klp_patch *patch)
+{
+ __klp_unpatch_objects(patch, false);
+}
+
+void klp_unpatch_objects_dynamic(struct klp_patch *patch)
+{
+ __klp_unpatch_objects(patch, true);
}
diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h
index e72d8250d04b..d5f2fbe373e0 100644
--- a/kernel/livepatch/patch.h
+++ b/kernel/livepatch/patch.h
@@ -10,7 +10,7 @@
* struct klp_ops - structure for tracking registered ftrace ops structs
*
* A single ftrace_ops is shared between all enabled replacement functions
- * (klp_func structs) which have the same old_addr. This allows the switch
+ * (klp_func structs) which have the same old_func. This allows the switch
* between function versions to happen instantaneously by updating the klp_ops
* struct's func_stack list. The winner is the klp_func at the top of the
* func_stack (front of the list).
@@ -25,10 +25,11 @@ struct klp_ops {
struct ftrace_ops fops;
};
-struct klp_ops *klp_find_ops(unsigned long old_addr);
+struct klp_ops *klp_find_ops(void *old_func);
int klp_patch_object(struct klp_object *obj);
void klp_unpatch_object(struct klp_object *obj);
void klp_unpatch_objects(struct klp_patch *patch);
+void klp_unpatch_objects_dynamic(struct klp_patch *patch);
#endif /* _LIVEPATCH_PATCH_H */
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 304d5eb8a98c..9c89ae8b337a 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -29,11 +29,13 @@
#define MAX_STACK_ENTRIES 100
#define STACK_ERR_BUF_SIZE 128
+#define SIGNALS_TIMEOUT 15
+
struct klp_patch *klp_transition_patch;
static int klp_target_state = KLP_UNDEFINED;
-static bool klp_forced = false;
+static unsigned int klp_signals_cnt;
/*
* This work can be performed periodically to finish patching or unpatching any
@@ -87,6 +89,11 @@ static void klp_complete_transition(void)
klp_transition_patch->mod->name,
klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+ if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) {
+ klp_discard_replaced_patches(klp_transition_patch);
+ klp_discard_nops(klp_transition_patch);
+ }
+
if (klp_target_state == KLP_UNPATCHED) {
/*
* All tasks have transitioned to KLP_UNPATCHED so we can now
@@ -136,13 +143,6 @@ static void klp_complete_transition(void)
pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
- /*
- * klp_forced set implies unbounded increase of module's ref count if
- * the module is disabled/enabled in a loop.
- */
- if (!klp_forced && klp_target_state == KLP_UNPATCHED)
- module_put(klp_transition_patch->mod);
-
klp_target_state = KLP_UNDEFINED;
klp_transition_patch = NULL;
}
@@ -224,11 +224,11 @@ static int klp_check_stack_func(struct klp_func *func,
* Check for the to-be-patched function
* (the previous func).
*/
- ops = klp_find_ops(func->old_addr);
+ ops = klp_find_ops(func->old_func);
if (list_is_singular(&ops->func_stack)) {
/* original function */
- func_addr = func->old_addr;
+ func_addr = (unsigned long)func->old_func;
func_size = func->old_size;
} else {
/* previously patched function */
@@ -348,6 +348,47 @@ done:
}
/*
+ * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
+ * Kthreads with TIF_PATCH_PENDING set are woken up.
+ */
+static void klp_send_signals(void)
+{
+ struct task_struct *g, *task;
+
+ if (klp_signals_cnt == SIGNALS_TIMEOUT)
+ pr_notice("signaling remaining tasks\n");
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task) {
+ if (!klp_patch_pending(task))
+ continue;
+
+ /*
+ * There is a small race here. We could see TIF_PATCH_PENDING
+ * set and decide to wake up a kthread or send a fake signal.
+ * Meanwhile the task could migrate itself and the action
+ * would be meaningless. It is not serious though.
+ */
+ if (task->flags & PF_KTHREAD) {
+ /*
+ * Wake up a kthread which sleeps interruptibly and
+ * still has not been migrated.
+ */
+ wake_up_state(task, TASK_INTERRUPTIBLE);
+ } else {
+ /*
+ * Send fake signal to all non-kthread tasks which are
+ * still not migrated.
+ */
+ spin_lock_irq(&task->sighand->siglock);
+ signal_wake_up(task, 0);
+ spin_unlock_irq(&task->sighand->siglock);
+ }
+ }
+ read_unlock(&tasklist_lock);
+}
+
+/*
* Try to switch all remaining tasks to the target patch state by walking the
* stacks of sleeping tasks and looking for any to-be-patched or
* to-be-unpatched functions. If such functions are found, the task can't be
@@ -359,6 +400,7 @@ void klp_try_complete_transition(void)
{
unsigned int cpu;
struct task_struct *g, *task;
+ struct klp_patch *patch;
bool complete = true;
WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
@@ -396,6 +438,10 @@ void klp_try_complete_transition(void)
put_online_cpus();
if (!complete) {
+ if (klp_signals_cnt && !(klp_signals_cnt % SIGNALS_TIMEOUT))
+ klp_send_signals();
+ klp_signals_cnt++;
+
/*
* Some tasks weren't able to be switched over. Try again
* later and/or wait for other methods like kernel exit
@@ -407,7 +453,18 @@ void klp_try_complete_transition(void)
}
/* we're done, now cleanup the data structures */
+ patch = klp_transition_patch;
klp_complete_transition();
+
+ /*
+ * It would make more sense to free the patch in
+ * klp_complete_transition() but it is called also
+ * from klp_cancel_transition().
+ */
+ if (!patch->enabled) {
+ klp_free_patch_start(patch);
+ schedule_work(&patch->free_work);
+ }
}
/*
@@ -446,6 +503,8 @@ void klp_start_transition(void)
if (task->patch_state != klp_target_state)
set_tsk_thread_flag(task, TIF_PATCH_PENDING);
}
+
+ klp_signals_cnt = 0;
}
/*
@@ -569,47 +628,6 @@ void klp_copy_process(struct task_struct *child)
}
/*
- * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
- * Kthreads with TIF_PATCH_PENDING set are woken up. Only admin can request this
- * action currently.
- */
-void klp_send_signals(void)
-{
- struct task_struct *g, *task;
-
- pr_notice("signaling remaining tasks\n");
-
- read_lock(&tasklist_lock);
- for_each_process_thread(g, task) {
- if (!klp_patch_pending(task))
- continue;
-
- /*
- * There is a small race here. We could see TIF_PATCH_PENDING
- * set and decide to wake up a kthread or send a fake signal.
- * Meanwhile the task could migrate itself and the action
- * would be meaningless. It is not serious though.
- */
- if (task->flags & PF_KTHREAD) {
- /*
- * Wake up a kthread which sleeps interruptedly and
- * still has not been migrated.
- */
- wake_up_state(task, TASK_INTERRUPTIBLE);
- } else {
- /*
- * Send fake signal to all non-kthread tasks which are
- * still not migrated.
- */
- spin_lock_irq(&task->sighand->siglock);
- signal_wake_up(task, 0);
- spin_unlock_irq(&task->sighand->siglock);
- }
- }
- read_unlock(&tasklist_lock);
-}
-
-/*
* Drop TIF_PATCH_PENDING of all tasks on admin's request. This forces an
* existing transition to finish.
*
@@ -620,6 +638,7 @@ void klp_send_signals(void)
*/
void klp_force_transition(void)
{
+ struct klp_patch *patch;
struct task_struct *g, *task;
unsigned int cpu;
@@ -633,5 +652,6 @@ void klp_force_transition(void)
for_each_possible_cpu(cpu)
klp_update_patch_state(idle_task(cpu));
- klp_forced = true;
+ klp_for_each_patch(patch)
+ patch->forced = true;
}
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h
index f9d0bc016067..322db16233de 100644
--- a/kernel/livepatch/transition.h
+++ b/kernel/livepatch/transition.h
@@ -11,7 +11,6 @@ void klp_cancel_transition(void);
void klp_start_transition(void);
void klp_try_complete_transition(void);
void klp_reverse_transition(void);
-void klp_send_signals(void);
void klp_force_transition(void);
#endif /* _LIVEPATCH_TRANSITION_H */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 95932333a48b..e16766ff184b 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -45,11 +45,14 @@
#include <linux/hash.h>
#include <linux/ftrace.h>
#include <linux/stringify.h>
+#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/gfp.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/nmi.h>
+#include <linux/rcupdate.h>
+#include <linux/kprobes.h>
#include <asm/sections.h>
@@ -81,6 +84,7 @@ module_param(lock_stat, int, 0644);
* code to recurse back into the lockdep code...
*/
static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+static struct task_struct *lockdep_selftest_task_struct;
static int graph_lock(void)
{
@@ -130,13 +134,17 @@ static inline int debug_locks_off_graph_unlock(void)
unsigned long nr_list_entries;
static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
+static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);
/*
* All data structures here are protected by the global debug_lock.
*
- * Mutex key structs only get allocated, once during bootup, and never
- * get freed - this significantly simplifies the debugging code.
+ * nr_lock_classes is the number of elements of lock_classes[] that are
+ * in use.
*/
+#define KEYHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1)
+#define KEYHASH_SIZE (1UL << KEYHASH_BITS)
+static struct hlist_head lock_keys_hash[KEYHASH_SIZE];
unsigned long nr_lock_classes;
#ifndef CONFIG_DEBUG_LOCKDEP
static
@@ -277,11 +285,42 @@ static inline void lock_release_holdtime(struct held_lock *hlock)
#endif
/*
- * We keep a global list of all lock classes. The list only grows,
- * never shrinks. The list is only accessed with the lockdep
- * spinlock lock held.
+ * We keep a global list of all lock classes. The list is only accessed with
+ * the lockdep spinlock lock held. free_lock_classes is a list with free
+ * elements. These elements are linked together by the lock_entry member in
+ * struct lock_class.
*/
LIST_HEAD(all_lock_classes);
+static LIST_HEAD(free_lock_classes);
+
+/**
+ * struct pending_free - information about data structures about to be freed
+ * @zapped: Head of a list with struct lock_class elements.
+ * @lock_chains_being_freed: Bitmap that indicates which lock_chains[] elements
+ * are about to be freed.
+ */
+struct pending_free {
+ struct list_head zapped;
+ DECLARE_BITMAP(lock_chains_being_freed, MAX_LOCKDEP_CHAINS);
+};
+
+/**
+ * struct delayed_free - data structures used for delayed freeing
+ *
+ * A data structure for delayed freeing of data structures that may be
+ * accessed by RCU readers at the time these were freed.
+ *
+ * @rcu_head: Used to schedule an RCU callback for freeing data structures.
+ * @index: Index of @pf to which freed data structures are added.
+ * @scheduled: Whether or not an RCU callback has been scheduled.
+ * @pf: Array with information about data structures about to be freed.
+ */
+static struct delayed_free {
+ struct rcu_head rcu_head;
+ int index;
+ int scheduled;
+ struct pending_free pf[2];
+} delayed_free;
/*
* The lockdep classes are in a hash-table as well, for fast lookup:
@@ -331,6 +370,11 @@ void lockdep_on(void)
}
EXPORT_SYMBOL(lockdep_on);
+void lockdep_set_selftest_task(struct task_struct *task)
+{
+ lockdep_selftest_task_struct = task;
+}
+
/*
* Debugging switches:
*/
@@ -599,7 +643,7 @@ static int very_verbose(struct lock_class *class)
* Is this the address of a static object:
*/
#ifdef __KERNEL__
-static int static_obj(void *obj)
+static int static_obj(const void *obj)
{
unsigned long start = (unsigned long) &_stext,
end = (unsigned long) &_end,
@@ -716,6 +760,17 @@ static bool assign_lock_key(struct lockdep_map *lock)
{
unsigned long can_addr, addr = (unsigned long)lock;
+#ifdef __KERNEL__
+ /*
+ * lockdep_free_key_range() assumes that struct lock_class_key
+ * objects do not overlap. Since we use the address of lock
+ * objects as class key for static objects, check whether the
+ * size of lock_class_key objects does not exceed the size of
+ * the smallest lock object.
+ */
+ BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(raw_spinlock_t));
+#endif
+
if (__is_kernel_percpu_address(addr, &can_addr))
lock->key = (void *)can_addr;
else if (__is_module_percpu_address(addr, &can_addr))
@@ -735,6 +790,289 @@ static bool assign_lock_key(struct lockdep_map *lock)
return true;
}
+#ifdef CONFIG_DEBUG_LOCKDEP
+
+/* Check whether element @e occurs in list @h */
+static bool in_list(struct list_head *e, struct list_head *h)
+{
+ struct list_head *f;
+
+ list_for_each(f, h) {
+ if (e == f)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Check whether entry @e occurs in any of the locks_after or locks_before
+ * lists.
+ */
+static bool in_any_class_list(struct list_head *e)
+{
+ struct lock_class *class;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ class = &lock_classes[i];
+ if (in_list(e, &class->locks_after) ||
+ in_list(e, &class->locks_before))
+ return true;
+ }
+ return false;
+}
+
+static bool class_lock_list_valid(struct lock_class *c, struct list_head *h)
+{
+ struct lock_list *e;
+
+ list_for_each_entry(e, h, entry) {
+ if (e->links_to != c) {
+ printk(KERN_INFO "class %s: mismatch for lock entry %ld; class %s <> %s",
+ c->name ? : "(?)",
+ (unsigned long)(e - list_entries),
+ e->links_to && e->links_to->name ?
+ e->links_to->name : "(?)",
+ e->class && e->class->name ? e->class->name :
+ "(?)");
+ return false;
+ }
+ }
+ return true;
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
+#endif
+
+static bool check_lock_chain_key(struct lock_chain *chain)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ u64 chain_key = 0;
+ int i;
+
+ for (i = chain->base; i < chain->base + chain->depth; i++)
+ chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1);
+ /*
+ * The 'unsigned long long' casts avoid a compiler warning when
+ * building tools/lib/lockdep.
+ */
+ if (chain->chain_key != chain_key) {
+ printk(KERN_INFO "chain %lld: key %#llx <> %#llx\n",
+ (unsigned long long)(chain - lock_chains),
+ (unsigned long long)chain->chain_key,
+ (unsigned long long)chain_key);
+ return false;
+ }
+#endif
+ return true;
+}
+
+static bool in_any_zapped_class_list(struct lock_class *class)
+{
+ struct pending_free *pf;
+ int i;
+
+ for (i = 0, pf = delayed_free.pf; i < ARRAY_SIZE(delayed_free.pf); i++, pf++) {
+ if (in_list(&class->lock_entry, &pf->zapped))
+ return true;
+ }
+
+ return false;
+}
+
+static bool __check_data_structures(void)
+{
+ struct lock_class *class;
+ struct lock_chain *chain;
+ struct hlist_head *head;
+ struct lock_list *e;
+ int i;
+
+ /* Check whether all classes occur in a lock list. */
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ class = &lock_classes[i];
+ if (!in_list(&class->lock_entry, &all_lock_classes) &&
+ !in_list(&class->lock_entry, &free_lock_classes) &&
+ !in_any_zapped_class_list(class)) {
+ printk(KERN_INFO "class %px/%s is not in any class list\n",
+ class, class->name ? : "(?)");
+ return false;
+ }
+ }
+
+ /* Check whether all classes have valid lock lists. */
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ class = &lock_classes[i];
+ if (!class_lock_list_valid(class, &class->locks_before))
+ return false;
+ if (!class_lock_list_valid(class, &class->locks_after))
+ return false;
+ }
+
+ /* Check the chain_key of all lock chains. */
+ for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) {
+ head = chainhash_table + i;
+ hlist_for_each_entry_rcu(chain, head, entry) {
+ if (!check_lock_chain_key(chain))
+ return false;
+ }
+ }
+
+ /*
+ * Check whether all list entries that are in use occur in a class
+ * lock list.
+ */
+ for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) {
+ e = list_entries + i;
+ if (!in_any_class_list(&e->entry)) {
+ printk(KERN_INFO "list entry %d is not in any class list; class %s <> %s\n",
+ (unsigned int)(e - list_entries),
+ e->class->name ? : "(?)",
+ e->links_to->name ? : "(?)");
+ return false;
+ }
+ }
+
+ /*
+ * Check whether all list entries that are not in use do not occur in
+ * a class lock list.
+ */
+ for_each_clear_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) {
+ e = list_entries + i;
+ if (in_any_class_list(&e->entry)) {
+ printk(KERN_INFO "list entry %d occurs in a class list; class %s <> %s\n",
+ (unsigned int)(e - list_entries),
+ e->class && e->class->name ? e->class->name :
+ "(?)",
+ e->links_to && e->links_to->name ?
+ e->links_to->name : "(?)");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+int check_consistency = 0;
+module_param(check_consistency, int, 0644);
+
+static void check_data_structures(void)
+{
+ static bool once = false;
+
+ if (check_consistency && !once) {
+ if (!__check_data_structures()) {
+ once = true;
+ WARN_ON(once);
+ }
+ }
+}
+
+#else /* CONFIG_DEBUG_LOCKDEP */
+
+static inline void check_data_structures(void) { }
+
+#endif /* CONFIG_DEBUG_LOCKDEP */
+
+/*
+ * Initialize the lock_classes[] array elements, the free_lock_classes list
+ * and also the delayed_free structure.
+ */
+static void init_data_structures_once(void)
+{
+ static bool ds_initialized, rcu_head_initialized;
+ int i;
+
+ if (likely(rcu_head_initialized))
+ return;
+
+ if (system_state >= SYSTEM_SCHEDULING) {
+ init_rcu_head(&delayed_free.rcu_head);
+ rcu_head_initialized = true;
+ }
+
+ if (ds_initialized)
+ return;
+
+ ds_initialized = true;
+
+ INIT_LIST_HEAD(&delayed_free.pf[0].zapped);
+ INIT_LIST_HEAD(&delayed_free.pf[1].zapped);
+
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ list_add_tail(&lock_classes[i].lock_entry, &free_lock_classes);
+ INIT_LIST_HEAD(&lock_classes[i].locks_after);
+ INIT_LIST_HEAD(&lock_classes[i].locks_before);
+ }
+}
+
+static inline struct hlist_head *keyhashentry(const struct lock_class_key *key)
+{
+ unsigned long hash = hash_long((uintptr_t)key, KEYHASH_BITS);
+
+ return lock_keys_hash + hash;
+}
+
+/* Register a dynamically allocated key. */
+void lockdep_register_key(struct lock_class_key *key)
+{
+ struct hlist_head *hash_head;
+ struct lock_class_key *k;
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(static_obj(key)))
+ return;
+ hash_head = keyhashentry(key);
+
+ raw_local_irq_save(flags);
+ if (!graph_lock())
+ goto restore_irqs;
+ hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
+ if (WARN_ON_ONCE(k == key))
+ goto out_unlock;
+ }
+ hlist_add_head_rcu(&key->hash_entry, hash_head);
+out_unlock:
+ graph_unlock();
+restore_irqs:
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lockdep_register_key);
+
+/* Check whether a key has been registered as a dynamic key. */
+static bool is_dynamic_key(const struct lock_class_key *key)
+{
+ struct hlist_head *hash_head;
+ struct lock_class_key *k;
+ bool found = false;
+
+ if (WARN_ON_ONCE(static_obj(key)))
+ return false;
+
+ /*
+ * If lock debugging is disabled, lock_keys_hash[] may contain
+ * pointers to memory that has already been freed. Avoid triggering
+ * a use-after-free in that case by returning early.
+ */
+ if (!debug_locks)
+ return true;
+
+ hash_head = keyhashentry(key);
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
+ if (k == key) {
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return found;
+}
+
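To put lockdep_register_key() and is_dynamic_key() in context, here is a minimal usage sketch of the dynamic-key API this series introduces. It is not part of the patch; struct foo, alloc_foo() and free_foo() are invented names, and only lockdep_register_key(), lockdep_set_class() and lockdep_unregister_key() are taken from the code shown here.

#include <linux/lockdep.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	struct lock_class_key key;	/* key lives in dynamically allocated memory */
	spinlock_t lock;
};

static struct foo *alloc_foo(void)
{
	struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

	if (!f)
		return NULL;
	/* Register the key before any lock that uses it is acquired. */
	lockdep_register_key(&f->key);
	spin_lock_init(&f->lock);
	lockdep_set_class(&f->lock, &f->key);
	return f;
}

static void free_foo(struct foo *f)
{
	/* May sleep: lockdep_unregister_key() ends with synchronize_rcu(). */
	lockdep_unregister_key(&f->key);
	kfree(f);
}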
/*
* Register a lock's class in the hash-table, if the class is not present
* yet. Otherwise we look it up. We cache the result in the lock object
@@ -756,7 +1094,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
if (!lock->key) {
if (!assign_lock_key(lock))
return NULL;
- } else if (!static_obj(lock->key)) {
+ } else if (!static_obj(lock->key) && !is_dynamic_key(lock->key)) {
return NULL;
}
@@ -775,11 +1113,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
goto out_unlock_set;
}
- /*
- * Allocate a new key from the static array, and add it to
- * the hash:
- */
- if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
+ init_data_structures_once();
+
+ /* Allocate a new lock class and add it to the hash. */
+ class = list_first_entry_or_null(&free_lock_classes, typeof(*class),
+ lock_entry);
+ if (!class) {
if (!debug_locks_off_graph_unlock()) {
return NULL;
}
@@ -788,13 +1127,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
dump_stack();
return NULL;
}
- class = lock_classes + nr_lock_classes++;
+ nr_lock_classes++;
debug_atomic_inc(nr_unused_locks);
class->key = key;
class->name = lock->name;
class->subclass = subclass;
- INIT_LIST_HEAD(&class->locks_before);
- INIT_LIST_HEAD(&class->locks_after);
+ WARN_ON_ONCE(!list_empty(&class->locks_before));
+ WARN_ON_ONCE(!list_empty(&class->locks_after));
class->name_version = count_matching_names(class);
/*
* We use RCU's safe list-add method to make
@@ -802,9 +1141,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
*/
hlist_add_head_rcu(&class->hash_entry, hash_head);
/*
- * Add it to the global list of classes:
+ * Remove the class from the free list and add it to the global list
+ * of classes.
*/
- list_add_tail(&class->lock_entry, &all_lock_classes);
+ list_move_tail(&class->lock_entry, &all_lock_classes);
if (verbose(class)) {
graph_unlock();
@@ -845,7 +1185,10 @@ out_set_class_cache:
*/
static struct lock_list *alloc_list_entry(void)
{
- if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
+ int idx = find_first_zero_bit(list_entries_in_use,
+ ARRAY_SIZE(list_entries));
+
+ if (idx >= ARRAY_SIZE(list_entries)) {
if (!debug_locks_off_graph_unlock())
return NULL;
@@ -853,13 +1196,16 @@ static struct lock_list *alloc_list_entry(void)
dump_stack();
return NULL;
}
- return list_entries + nr_list_entries++;
+ nr_list_entries++;
+ __set_bit(idx, list_entries_in_use);
+ return list_entries + idx;
}
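As an aside, the effect of replacing the old "nr_list_entries++" linear allocator with the list_entries_in_use bitmap can be shown in a few lines of plain C. This userspace mock is only an illustration (NENTRIES, alloc_entry() and free_entry() are invented names), but it demonstrates why freed slots can now be handed out again, which the linear allocator could not do.

#include <stdio.h>

#define NENTRIES 8
static unsigned long in_use;		/* one bit per list_entries[] slot */

static int alloc_entry(void)
{
	for (int i = 0; i < NENTRIES; i++) {
		if (!(in_use & (1UL << i))) {
			in_use |= 1UL << i;	/* like __set_bit() above */
			return i;
		}
	}
	return -1;			/* table exhausted */
}

static void free_entry(int i)
{
	in_use &= ~(1UL << i);		/* like __clear_bit() in zap_class() */
}

int main(void)
{
	int a = alloc_entry(), b = alloc_entry();

	printf("%d %d\n", a, b);	/* 0 1 */
	free_entry(a);
	printf("%d\n", alloc_entry());	/* 0 again: the slot is reusable */
	return 0;
}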
/*
* Add a new dependency to the head of the list:
*/
-static int add_lock_to_list(struct lock_class *this, struct list_head *head,
+static int add_lock_to_list(struct lock_class *this,
+ struct lock_class *links_to, struct list_head *head,
unsigned long ip, int distance,
struct stack_trace *trace)
{
@@ -873,6 +1219,7 @@ static int add_lock_to_list(struct lock_class *this, struct list_head *head,
return 0;
entry->class = this;
+ entry->links_to = links_to;
entry->distance = distance;
entry->trace = *trace;
/*
@@ -955,7 +1302,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
unsigned long nr;
nr = lock - list_entries;
- WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
+ WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */
lock->parent = parent;
lock->class->dep_gen_id = lockdep_dependency_gen_id;
}
@@ -965,7 +1312,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock)
unsigned long nr;
nr = lock - list_entries;
- WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
+ WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */
return lock->class->dep_gen_id == lockdep_dependency_gen_id;
}
@@ -1624,29 +1971,18 @@ static const char *state_rnames[] = {
static inline const char *state_name(enum lock_usage_bit bit)
{
- return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2];
+ return (bit & LOCK_USAGE_READ_MASK) ? state_rnames[bit >> 2] : state_names[bit >> 2];
}
static int exclusive_bit(int new_bit)
{
- /*
- * USED_IN
- * USED_IN_READ
- * ENABLED
- * ENABLED_READ
- *
- * bit 0 - write/read
- * bit 1 - used_in/enabled
- * bit 2+ state
- */
-
- int state = new_bit & ~3;
- int dir = new_bit & 2;
+ int state = new_bit & LOCK_USAGE_STATE_MASK;
+ int dir = new_bit & LOCK_USAGE_DIR_MASK;
/*
* keep state, bit flip the direction and strip read.
*/
- return state | (dir ^ 2);
+ return state | (dir ^ LOCK_USAGE_DIR_MASK);
}
static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
@@ -1842,6 +2178,24 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
struct lock_list this;
int ret;
+ if (!hlock_class(prev)->key || !hlock_class(next)->key) {
+ /*
+ * The warning statements below may trigger a use-after-free
+ * of the class name. It is better to trigger a use-after-free
+ * and have the class name available most of the time than to not
+ * have the class name available at all.
+ */
+ WARN_ONCE(!debug_locks_silent && !hlock_class(prev)->key,
+ "Detected use-after-free of lock class %px/%s\n",
+ hlock_class(prev),
+ hlock_class(prev)->name);
+ WARN_ONCE(!debug_locks_silent && !hlock_class(next)->key,
+ "Detected use-after-free of lock class %px/%s\n",
+ hlock_class(next),
+ hlock_class(next)->name);
+ return 2;
+ }
+
/*
* Prove that the new <prev> -> <next> dependency would not
* create a circular dependency in the graph. (We do this by
@@ -1918,14 +2272,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* Ok, all validations passed, add the new lock
* to the previous lock's dependency list:
*/
- ret = add_lock_to_list(hlock_class(next),
+ ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
&hlock_class(prev)->locks_after,
next->acquire_ip, distance, trace);
if (!ret)
return 0;
- ret = add_lock_to_list(hlock_class(prev),
+ ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
&hlock_class(next)->locks_before,
next->acquire_ip, distance, trace);
if (!ret)
@@ -2018,8 +2372,8 @@ out_bug:
return 0;
}
-unsigned long nr_lock_chains;
struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
+static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS);
int nr_chain_hlocks;
static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
@@ -2153,6 +2507,33 @@ static int check_no_collision(struct task_struct *curr,
}
/*
+ * Given an index that is >= -1, return the index of the next lock chain.
+ * Return -2 if there is no next lock chain.
+ */
+long lockdep_next_lockchain(long i)
+{
+ i = find_next_bit(lock_chains_in_use, ARRAY_SIZE(lock_chains), i + 1);
+ return i < ARRAY_SIZE(lock_chains) ? i : -2;
+}
+
+unsigned long lock_chain_count(void)
+{
+ return bitmap_weight(lock_chains_in_use, ARRAY_SIZE(lock_chains));
+}
+
+/* Must be called with the graph lock held. */
+static struct lock_chain *alloc_lock_chain(void)
+{
+ int idx = find_first_zero_bit(lock_chains_in_use,
+ ARRAY_SIZE(lock_chains));
+
+ if (unlikely(idx >= ARRAY_SIZE(lock_chains)))
+ return NULL;
+ __set_bit(idx, lock_chains_in_use);
+ return lock_chains + idx;
+}
+
+/*
* Adds a dependency chain into chain hashtable. And must be called with
* graph_lock held.
*
@@ -2169,19 +2550,15 @@ static inline int add_chain_cache(struct task_struct *curr,
int i, j;
/*
- * Allocate a new chain entry from the static array, and add
- * it to the hash:
- */
-
- /*
- * We might need to take the graph lock, ensure we've got IRQs
+ * The caller must hold the graph lock; ensure we've got IRQs
 * disabled to make this an IRQ-safe lock. For recursion reasons,
 * lockdep won't complain about its own locking errors.
*/
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
- if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+ chain = alloc_lock_chain();
+ if (!chain) {
if (!debug_locks_off_graph_unlock())
return 0;
@@ -2189,7 +2566,6 @@ static inline int add_chain_cache(struct task_struct *curr,
dump_stack();
return 0;
}
- chain = lock_chains + nr_lock_chains++;
chain->chain_key = chain_key;
chain->irq_context = hlock->irq_context;
i = get_first_held_lock(curr, hlock);
@@ -2206,16 +2582,8 @@ static inline int add_chain_cache(struct task_struct *curr,
chain_hlocks[chain->base + j] = lock_id;
}
chain_hlocks[chain->base + j] = class - lock_classes;
- }
-
- if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS)
nr_chain_hlocks += chain->depth;
-
-#ifdef CONFIG_DEBUG_LOCKDEP
- /*
- * Important for check_no_collision().
- */
- if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ } else {
if (!debug_locks_off_graph_unlock())
return 0;
@@ -2223,7 +2591,6 @@ static inline int add_chain_cache(struct task_struct *curr,
dump_stack();
return 0;
}
-#endif
hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
@@ -2233,19 +2600,16 @@ static inline int add_chain_cache(struct task_struct *curr,
}
/*
- * Look up a dependency chain.
+ * Look up a dependency chain. Must be called with either the graph lock or
+ * the RCU read lock held.
*/
static inline struct lock_chain *lookup_chain_cache(u64 chain_key)
{
struct hlist_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
- /*
- * We can walk it lock-free, because entries only get added
- * to the hash:
- */
hlist_for_each_entry_rcu(chain, hash_head, entry) {
- if (chain->chain_key == chain_key) {
+ if (READ_ONCE(chain->chain_key) == chain_key) {
debug_atomic_inc(chain_lookup_hits);
return chain;
}
@@ -2662,8 +3026,8 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit)
{
int excl_bit = exclusive_bit(new_bit);
- int read = new_bit & 1;
- int dir = new_bit & 2;
+ int read = new_bit & LOCK_USAGE_READ_MASK;
+ int dir = new_bit & LOCK_USAGE_DIR_MASK;
/*
* mark USED_IN has to look forwards -- to ensure no dependency
@@ -2687,19 +3051,19 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
* states.
*/
if ((!read || !dir || STRICT_READ_CHECKS) &&
- !usage(curr, this, excl_bit, state_name(new_bit & ~1)))
+ !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK)))
return 0;
/*
* Check for read in write conflicts
*/
if (!read) {
- if (!valid_state(curr, this, new_bit, excl_bit + 1))
+ if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK))
return 0;
if (STRICT_READ_CHECKS &&
- !usage(curr, this, excl_bit + 1,
- state_name(new_bit + 1)))
+ !usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK,
+ state_name(new_bit + LOCK_USAGE_READ_MASK)))
return 0;
}
@@ -2709,35 +3073,28 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
return 1;
}
-enum mark_type {
-#define LOCKDEP_STATE(__STATE) __STATE,
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
-};
-
/*
* Mark all held locks with a usage bit:
*/
static int
-mark_held_locks(struct task_struct *curr, enum mark_type mark)
+mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit)
{
- enum lock_usage_bit usage_bit;
struct held_lock *hlock;
int i;
for (i = 0; i < curr->lockdep_depth; i++) {
+ enum lock_usage_bit hlock_bit = base_bit;
hlock = curr->held_locks + i;
- usage_bit = 2 + (mark << 2); /* ENABLED */
if (hlock->read)
- usage_bit += 1; /* READ */
+ hlock_bit += LOCK_USAGE_READ_MASK;
- BUG_ON(usage_bit >= LOCK_USAGE_STATES);
+ BUG_ON(hlock_bit >= LOCK_USAGE_STATES);
if (!hlock->check)
continue;
- if (!mark_lock(curr, hlock, usage_bit))
+ if (!mark_lock(curr, hlock, hlock_bit))
return 0;
}
@@ -2758,7 +3115,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
* We are going to turn hardirqs on, so set the
* usage bit for all held locks:
*/
- if (!mark_held_locks(curr, HARDIRQ))
+ if (!mark_held_locks(curr, LOCK_ENABLED_HARDIRQ))
return;
/*
* If we have softirqs enabled, then set the usage
@@ -2766,7 +3123,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
* this bit from being set before)
*/
if (curr->softirqs_enabled)
- if (!mark_held_locks(curr, SOFTIRQ))
+ if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ))
return;
curr->hardirq_enable_ip = ip;
@@ -2814,6 +3171,7 @@ void lockdep_hardirqs_on(unsigned long ip)
__trace_hardirqs_on_caller(ip);
current->lockdep_recursion = 0;
}
+NOKPROBE_SYMBOL(lockdep_hardirqs_on);
/*
* Hardirqs were disabled:
@@ -2843,6 +3201,7 @@ void lockdep_hardirqs_off(unsigned long ip)
} else
debug_atomic_inc(redundant_hardirqs_off);
}
+NOKPROBE_SYMBOL(lockdep_hardirqs_off);
/*
* Softirqs will be enabled:
@@ -2880,7 +3239,7 @@ void trace_softirqs_on(unsigned long ip)
* enabled too:
*/
if (curr->hardirqs_enabled)
- mark_held_locks(curr, SOFTIRQ);
+ mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
current->lockdep_recursion = 0;
}
@@ -3119,13 +3478,12 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
if (DEBUG_LOCKS_WARN_ON(!key))
return;
/*
- * Sanity check, the lock-class key must be persistent:
+ * Sanity check, the lock-class key must either have been allocated
+ * statically or must have been registered as a dynamic key.
*/
- if (!static_obj(key)) {
- printk("BUG: key %px not in .data!\n", key);
- /*
- * What it says above ^^^^^, I suggest you read it.
- */
+ if (!static_obj(key) && !is_dynamic_key(key)) {
+ if (debug_locks)
+ printk(KERN_ERR "BUG: key %px has not been registered!\n", key);
DEBUG_LOCKS_WARN_ON(1);
return;
}
@@ -3335,6 +3693,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (nest_lock && !__lock_is_held(nest_lock, -1))
return print_lock_nested_lock_not_held(curr, hlock, ip);
+ if (!debug_locks_silent) {
+ WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key);
+ WARN_ON_ONCE(!hlock_class(hlock)->key);
+ }
+
if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
return 0;
@@ -3497,6 +3860,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
unsigned int depth;
int i;
+ if (unlikely(!debug_locks))
+ return 0;
+
depth = curr->lockdep_depth;
/*
* This function is about (re)setting the class of a held lock,
@@ -3535,6 +3901,9 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
unsigned int depth;
int i;
+ if (unlikely(!debug_locks))
+ return 0;
+
depth = curr->lockdep_depth;
/*
* This function is about (re)setting the class of a held lock,
@@ -3650,7 +4019,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
return 0;
}
-static int __lock_is_held(const struct lockdep_map *lock, int read)
+static nokprobe_inline
+int __lock_is_held(const struct lockdep_map *lock, int read)
{
struct task_struct *curr = current;
int i;
@@ -3883,6 +4253,7 @@ int lock_is_held_type(const struct lockdep_map *lock, int read)
return ret;
}
EXPORT_SYMBOL_GPL(lock_is_held_type);
+NOKPROBE_SYMBOL(lock_is_held_type);
struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
{
@@ -4123,29 +4494,131 @@ void lockdep_reset(void)
raw_local_irq_restore(flags);
}
+/* Remove a class from a lock chain. Must be called with the graph lock held. */
+static void remove_class_from_lock_chain(struct pending_free *pf,
+ struct lock_chain *chain,
+ struct lock_class *class)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ struct lock_chain *new_chain;
+ u64 chain_key;
+ int i;
+
+ for (i = chain->base; i < chain->base + chain->depth; i++) {
+ if (chain_hlocks[i] != class - lock_classes)
+ continue;
+ /* The code below leaks one chain_hlocks[] entry. */
+ if (--chain->depth > 0) {
+ memmove(&chain_hlocks[i], &chain_hlocks[i + 1],
+ (chain->base + chain->depth - i) *
+ sizeof(chain_hlocks[0]));
+ }
+ /*
+ * Each lock class occurs at most once in a lock chain so once
+ * we find a match we can break out of this loop.
+ */
+ goto recalc;
+ }
+ /* Since the chain has not been modified, return. */
+ return;
+
+recalc:
+ chain_key = 0;
+ for (i = chain->base; i < chain->base + chain->depth; i++)
+ chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1);
+ if (chain->depth && chain->chain_key == chain_key)
+ return;
+ /* Overwrite the chain key for concurrent RCU readers. */
+ WRITE_ONCE(chain->chain_key, chain_key);
+ /*
+ * Note: calling hlist_del_rcu() from inside a
+ * hlist_for_each_entry_rcu() loop is safe.
+ */
+ hlist_del_rcu(&chain->entry);
+ __set_bit(chain - lock_chains, pf->lock_chains_being_freed);
+ if (chain->depth == 0)
+ return;
+ /*
+ * If the modified lock chain matches an existing lock chain, drop
+ * the modified lock chain.
+ */
+ if (lookup_chain_cache(chain_key))
+ return;
+ new_chain = alloc_lock_chain();
+ if (WARN_ON_ONCE(!new_chain)) {
+ debug_locks_off();
+ return;
+ }
+ *new_chain = *chain;
+ hlist_add_head_rcu(&new_chain->entry, chainhashentry(chain_key));
+#endif
+}
+
+/* Must be called with the graph lock held. */
+static void remove_class_from_lock_chains(struct pending_free *pf,
+ struct lock_class *class)
+{
+ struct lock_chain *chain;
+ struct hlist_head *head;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) {
+ head = chainhash_table + i;
+ hlist_for_each_entry_rcu(chain, head, entry) {
+ remove_class_from_lock_chain(pf, chain, class);
+ }
+ }
+}
+
/*
* Remove all references to a lock class. The caller must hold the graph lock.
*/
-static void zap_class(struct lock_class *class)
+static void zap_class(struct pending_free *pf, struct lock_class *class)
{
+ struct lock_list *entry;
int i;
+ WARN_ON_ONCE(!class->key);
+
/*
* Remove all dependencies this lock is
* involved in:
*/
- for (i = 0; i < nr_list_entries; i++) {
- if (list_entries[i].class == class)
- list_del_rcu(&list_entries[i].entry);
+ for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) {
+ entry = list_entries + i;
+ if (entry->class != class && entry->links_to != class)
+ continue;
+ __clear_bit(i, list_entries_in_use);
+ nr_list_entries--;
+ list_del_rcu(&entry->entry);
+ }
+ if (list_empty(&class->locks_after) &&
+ list_empty(&class->locks_before)) {
+ list_move_tail(&class->lock_entry, &pf->zapped);
+ hlist_del_rcu(&class->hash_entry);
+ WRITE_ONCE(class->key, NULL);
+ WRITE_ONCE(class->name, NULL);
+ nr_lock_classes--;
+ } else {
+ WARN_ONCE(true, "%s() failed for class %s\n", __func__,
+ class->name);
}
- /*
- * Unhash the class and remove it from the all_lock_classes list:
- */
- hlist_del_rcu(&class->hash_entry);
- list_del(&class->lock_entry);
- RCU_INIT_POINTER(class->key, NULL);
- RCU_INIT_POINTER(class->name, NULL);
+ remove_class_from_lock_chains(pf, class);
+}
+
+static void reinit_class(struct lock_class *class)
+{
+ void *const p = class;
+ const unsigned int offset = offsetof(struct lock_class, key);
+
+ WARN_ON_ONCE(!class->lock_entry.next);
+ WARN_ON_ONCE(!list_empty(&class->locks_after));
+ WARN_ON_ONCE(!list_empty(&class->locks_before));
+ memset(p + offset, 0, sizeof(*class) - offset);
+ WARN_ON_ONCE(!class->lock_entry.next);
+ WARN_ON_ONCE(!list_empty(&class->locks_after));
+ WARN_ON_ONCE(!list_empty(&class->locks_before));
}
static inline int within(const void *addr, void *start, unsigned long size)
@@ -4153,55 +4626,171 @@ static inline int within(const void *addr, void *start, unsigned long size)
return addr >= start && addr < start + size;
}
+static bool inside_selftest(void)
+{
+ return current == lockdep_selftest_task_struct;
+}
+
+/* The caller must hold the graph lock. */
+static struct pending_free *get_pending_free(void)
+{
+ return delayed_free.pf + delayed_free.index;
+}
+
+static void free_zapped_rcu(struct rcu_head *cb);
+
/*
- * Used in module.c to remove lock classes from memory that is going to be
- * freed; and possibly re-used by other modules.
- *
- * We will have had one sync_sched() before getting here, so we're guaranteed
- * nobody will look up these exact classes -- they're properly dead but still
- * allocated.
+ * Schedule an RCU callback if no RCU callback is pending. Must be called with
+ * the graph lock held.
*/
-void lockdep_free_key_range(void *start, unsigned long size)
+static void call_rcu_zapped(struct pending_free *pf)
+{
+ WARN_ON_ONCE(inside_selftest());
+
+ if (list_empty(&pf->zapped))
+ return;
+
+ if (delayed_free.scheduled)
+ return;
+
+ delayed_free.scheduled = true;
+
+ WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf);
+ delayed_free.index ^= 1;
+
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
+}
+
+/* The caller must hold the graph lock. May be called from RCU context. */
+static void __free_zapped_classes(struct pending_free *pf)
{
struct lock_class *class;
- struct hlist_head *head;
+
+ check_data_structures();
+
+ list_for_each_entry(class, &pf->zapped, lock_entry)
+ reinit_class(class);
+
+ list_splice_init(&pf->zapped, &free_lock_classes);
+
+#ifdef CONFIG_PROVE_LOCKING
+ bitmap_andnot(lock_chains_in_use, lock_chains_in_use,
+ pf->lock_chains_being_freed, ARRAY_SIZE(lock_chains));
+ bitmap_clear(pf->lock_chains_being_freed, 0, ARRAY_SIZE(lock_chains));
+#endif
+}
+
+static void free_zapped_rcu(struct rcu_head *ch)
+{
+ struct pending_free *pf;
unsigned long flags;
- int i;
- int locked;
+
+ if (WARN_ON_ONCE(ch != &delayed_free.rcu_head))
+ return;
raw_local_irq_save(flags);
- locked = graph_lock();
+ arch_spin_lock(&lockdep_lock);
+ current->lockdep_recursion = 1;
+
+ /* closed head */
+ pf = delayed_free.pf + (delayed_free.index ^ 1);
+ __free_zapped_classes(pf);
+ delayed_free.scheduled = false;
/*
- * Unhash all classes that were created by this module:
+ * If there's anything on the open list, close and start a new callback.
*/
+ call_rcu_zapped(delayed_free.pf + delayed_free.index);
+
+ current->lockdep_recursion = 0;
+ arch_spin_unlock(&lockdep_lock);
+ raw_local_irq_restore(flags);
+}
+
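The interplay between call_rcu_zapped() and free_zapped_rcu() above is easier to follow as a timeline; this is only an annotation of the code as written, not part of the patch:

    zap_class()        queues classes on pf[delayed_free.index] (the open slot)
    call_rcu_zapped()  flips delayed_free.index and schedules free_zapped_rcu()
                       for the slot that was just closed
    free_zapped_rcu()  returns the closed slot's classes and lock chains to the
                       free pools, then re-arms itself if the now-open slot
                       accumulated more work while the callback was pending

The "scheduled" flag keeps at most one RCU callback in flight, so a slot is never recycled before a grace period has elapsed for everything on it.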
+/*
+ * Remove all lock classes from the class hash table and from the
+ * all_lock_classes list whose key or name is in the address range [start,
+ * start + size). Move these lock classes to the pf->zapped list. Must
+ * be called with the graph lock held.
+ */
+static void __lockdep_free_key_range(struct pending_free *pf, void *start,
+ unsigned long size)
+{
+ struct lock_class *class;
+ struct hlist_head *head;
+ int i;
+
+ /* Unhash all classes that were created by a module. */
for (i = 0; i < CLASSHASH_SIZE; i++) {
head = classhash_table + i;
hlist_for_each_entry_rcu(class, head, hash_entry) {
- if (within(class->key, start, size))
- zap_class(class);
- else if (within(class->name, start, size))
- zap_class(class);
+ if (!within(class->key, start, size) &&
+ !within(class->name, start, size))
+ continue;
+ zap_class(pf, class);
}
}
+}
- if (locked)
- graph_unlock();
+/*
+ * Used in module.c to remove lock classes from memory that is going to be
+ * freed; and possibly re-used by other modules.
+ *
+ * We will have had one synchronize_rcu() before getting here, so we're
+ * guaranteed nobody will look up these exact classes -- they're properly dead
+ * but still allocated.
+ */
+static void lockdep_free_key_range_reg(void *start, unsigned long size)
+{
+ struct pending_free *pf;
+ unsigned long flags;
+
+ init_data_structures_once();
+
+ raw_local_irq_save(flags);
+ arch_spin_lock(&lockdep_lock);
+ current->lockdep_recursion = 1;
+ pf = get_pending_free();
+ __lockdep_free_key_range(pf, start, size);
+ call_rcu_zapped(pf);
+ current->lockdep_recursion = 0;
+ arch_spin_unlock(&lockdep_lock);
raw_local_irq_restore(flags);
/*
* Wait for any possible iterators from look_up_lock_class() to pass
* before continuing to free the memory they refer to.
- *
- * sync_sched() is sufficient because the read-side is IRQ disable.
*/
synchronize_rcu();
+}
- /*
- * XXX at this point we could return the resources to the pool;
- * instead we leak them. We would need to change to bitmap allocators
- * instead of the linear allocators we have now.
- */
+/*
+ * Free all lockdep keys in the range [start, start+size). Does not sleep.
+ * Ignores debug_locks. Must only be used by the lockdep selftests.
+ */
+static void lockdep_free_key_range_imm(void *start, unsigned long size)
+{
+ struct pending_free *pf = delayed_free.pf;
+ unsigned long flags;
+
+ init_data_structures_once();
+
+ raw_local_irq_save(flags);
+ arch_spin_lock(&lockdep_lock);
+ __lockdep_free_key_range(pf, start, size);
+ __free_zapped_classes(pf);
+ arch_spin_unlock(&lockdep_lock);
+ raw_local_irq_restore(flags);
+}
+
+void lockdep_free_key_range(void *start, unsigned long size)
+{
+ init_data_structures_once();
+
+ if (inside_selftest())
+ lockdep_free_key_range_imm(start, size);
+ else
+ lockdep_free_key_range_reg(start, size);
}
/*
@@ -4226,14 +4815,12 @@ static bool lock_class_cache_is_registered(struct lockdep_map *lock)
return false;
}
-void lockdep_reset_lock(struct lockdep_map *lock)
+/* The caller must hold the graph lock. Does not sleep. */
+static void __lockdep_reset_lock(struct pending_free *pf,
+ struct lockdep_map *lock)
{
struct lock_class *class;
- unsigned long flags;
- int j, locked;
-
- raw_local_irq_save(flags);
- locked = graph_lock();
+ int j;
/*
* Remove all classes this lock might have:
@@ -4244,27 +4831,103 @@ void lockdep_reset_lock(struct lockdep_map *lock)
*/
class = look_up_lock_class(lock, j);
if (class)
- zap_class(class);
+ zap_class(pf, class);
}
/*
* Debug check: in the end all mapped classes should
* be gone.
*/
- if (unlikely(lock_class_cache_is_registered(lock))) {
- if (debug_locks_off_graph_unlock()) {
- /*
- * We all just reset everything, how did it match?
- */
- WARN_ON(1);
+ if (WARN_ON_ONCE(lock_class_cache_is_registered(lock)))
+ debug_locks_off();
+}
+
+/*
+ * Remove all information lockdep has about a lock if debug_locks == 1. Free
+ * released data structures from RCU context.
+ */
+static void lockdep_reset_lock_reg(struct lockdep_map *lock)
+{
+ struct pending_free *pf;
+ unsigned long flags;
+ int locked;
+
+ raw_local_irq_save(flags);
+ locked = graph_lock();
+ if (!locked)
+ goto out_irq;
+
+ pf = get_pending_free();
+ __lockdep_reset_lock(pf, lock);
+ call_rcu_zapped(pf);
+
+ graph_unlock();
+out_irq:
+ raw_local_irq_restore(flags);
+}
+
+/*
+ * Reset a lock. Does not sleep. Ignores debug_locks. Must only be used by the
+ * lockdep selftests.
+ */
+static void lockdep_reset_lock_imm(struct lockdep_map *lock)
+{
+ struct pending_free *pf = delayed_free.pf;
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ arch_spin_lock(&lockdep_lock);
+ __lockdep_reset_lock(pf, lock);
+ __free_zapped_classes(pf);
+ arch_spin_unlock(&lockdep_lock);
+ raw_local_irq_restore(flags);
+}
+
+void lockdep_reset_lock(struct lockdep_map *lock)
+{
+ init_data_structures_once();
+
+ if (inside_selftest())
+ lockdep_reset_lock_imm(lock);
+ else
+ lockdep_reset_lock_reg(lock);
+}
+
+/* Unregister a dynamically allocated key. */
+void lockdep_unregister_key(struct lock_class_key *key)
+{
+ struct hlist_head *hash_head = keyhashentry(key);
+ struct lock_class_key *k;
+ struct pending_free *pf;
+ unsigned long flags;
+ bool found = false;
+
+ might_sleep();
+
+ if (WARN_ON_ONCE(static_obj(key)))
+ return;
+
+ raw_local_irq_save(flags);
+ arch_spin_lock(&lockdep_lock);
+ current->lockdep_recursion = 1;
+ pf = get_pending_free();
+ hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
+ if (k == key) {
+ hlist_del_rcu(&k->hash_entry);
+ found = true;
+ break;
}
- goto out_restore;
}
- if (locked)
- graph_unlock();
-
-out_restore:
+ WARN_ON_ONCE(!found);
+ __lockdep_free_key_range(pf, key, 1);
+ call_rcu_zapped(pf);
+ current->lockdep_recursion = 0;
+ arch_spin_unlock(&lockdep_lock);
raw_local_irq_restore(flags);
+
+ /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */
+ synchronize_rcu();
}
+EXPORT_SYMBOL_GPL(lockdep_unregister_key);
void __init lockdep_init(void)
{
@@ -4278,20 +4941,24 @@ void __init lockdep_init(void)
printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
- printk(" memory used by lock dependency info: %lu kB\n",
- (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS +
- sizeof(struct list_head) * CLASSHASH_SIZE +
- sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
- sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
- sizeof(struct list_head) * CHAINHASH_SIZE
+ printk(" memory used by lock dependency info: %zu kB\n",
+ (sizeof(lock_classes) +
+ sizeof(classhash_table) +
+ sizeof(list_entries) +
+ sizeof(list_entries_in_use) +
+ sizeof(chainhash_table) +
+ sizeof(delayed_free)
#ifdef CONFIG_PROVE_LOCKING
- + sizeof(struct circular_queue)
+ + sizeof(lock_cq)
+ + sizeof(lock_chains)
+ + sizeof(lock_chains_in_use)
+ + sizeof(chain_hlocks)
#endif
) / 1024
);
- printk(" per task-struct memory footprint: %lu bytes\n",
- sizeof(struct held_lock) * MAX_LOCK_DEPTH);
+ printk(" per task-struct memory footprint: %zu bytes\n",
+ sizeof(((struct task_struct *)NULL)->held_locks));
}
static void
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 88c847a41c8a..d4c197425f68 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -22,6 +22,10 @@ enum lock_usage_bit {
LOCK_USAGE_STATES
};
+#define LOCK_USAGE_READ_MASK 1
+#define LOCK_USAGE_DIR_MASK 2
+#define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK))
+
/*
* Usage-state bitmasks:
*/
@@ -96,7 +100,8 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
extern unsigned long nr_lock_classes;
extern unsigned long nr_list_entries;
-extern unsigned long nr_lock_chains;
+long lockdep_next_lockchain(long i);
+unsigned long lock_chain_count(void);
extern int nr_chain_hlocks;
extern unsigned long nr_stack_trace_entries;
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 3d31f9b0059e..9c49ec645d8b 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -104,18 +104,18 @@ static const struct seq_operations lockdep_ops = {
#ifdef CONFIG_PROVE_LOCKING
static void *lc_start(struct seq_file *m, loff_t *pos)
{
+ if (*pos < 0)
+ return NULL;
+
if (*pos == 0)
return SEQ_START_TOKEN;
- if (*pos - 1 < nr_lock_chains)
- return lock_chains + (*pos - 1);
-
- return NULL;
+ return lock_chains + (*pos - 1);
}
static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
{
- (*pos)++;
+ *pos = lockdep_next_lockchain(*pos - 1) + 1;
return lc_start(m, pos);
}
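Read together with lockdep_next_lockchain() introduced earlier in this diff, the new iteration works roughly like this (annotation only, not part of the patch):

    lc_start(*pos == 0)      -> SEQ_START_TOKEN
    lc_next()                -> *pos = lockdep_next_lockchain(-1) + 1, i.e. the
                                index of the first bit set in lock_chains_in_use,
                                plus one
    lc_start(*pos == i + 1)  -> lock_chains + i
    ...
    once the bitmap is exhausted, lockdep_next_lockchain() returns -2, *pos
    becomes -1, and the new "*pos < 0" check in lc_start() ends the sequence.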
@@ -268,7 +268,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#ifdef CONFIG_PROVE_LOCKING
seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
- nr_lock_chains, MAX_LOCKDEP_CHAINS);
+ lock_chain_count(), MAX_LOCKDEP_CHAINS);
seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n",
nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS);
#endif
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 7d0b0ed74404..ad40a2617063 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Module-based torture test facility for locking
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2014
*
- * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
* Davidlohr Bueso <dave@stgolabs.net>
* Based on kernel/rcu/torture.c.
*/
@@ -45,7 +32,7 @@
#include <linux/torture.h>
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
torture_param(int, nwriters_stress, -1,
"Number of write-locking stress-test threads");
@@ -970,7 +957,7 @@ static int __init lock_torture_init(void)
/* Prepare torture context. */
if (onoff_interval > 0) {
firsterr = torture_onoff_init(onoff_holdoff * HZ,
- onoff_interval * HZ);
+ onoff_interval * HZ, NULL);
if (firsterr)
goto unwind;
}
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 8a8c3c208c5e..5e9247dc2515 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -124,9 +124,6 @@ static inline __pure u32 encode_tail(int cpu, int idx)
{
u32 tail;
-#ifdef CONFIG_DEBUG_SPINLOCK
- BUG_ON(idx > 3);
-#endif
tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
@@ -412,12 +409,28 @@ pv_queue:
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
+ /*
+ * 4 nodes are allocated based on the assumption that there will
+ * not be nested NMIs taking spinlocks. That may not be true in
+ * some architectures even though the chance of needing more than
+ * 4 nodes will still be extremely unlikely. When that happens,
+ * we fall back to spinning on the lock directly without using
+ * any MCS node. This is not the most elegant solution, but is
+ * simple enough.
+ */
+ if (unlikely(idx >= MAX_NODES)) {
+ qstat_inc(qstat_lock_no_node, true);
+ while (!queued_spin_trylock(lock))
+ cpu_relax();
+ goto release;
+ }
+
node = grab_mcs_node(node, idx);
/*
* Keep counts of non-zero index values:
*/
- qstat_inc(qstat_lock_idx1 + idx - 1, idx);
+ qstat_inc(qstat_lock_use_node2 + idx - 1, idx);
/*
* Ensure that we increment the head node->count before initialising
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 42d3d8dc8f49..d73f85388d5c 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -30,6 +30,13 @@
* pv_wait_node - # of vCPU wait's at a non-head queue node
* lock_pending - # of locking operations via pending code
* lock_slowpath - # of locking operations via MCS lock queue
+ * lock_use_node2 - # of locking operations that use 2nd per-CPU node
+ * lock_use_node3 - # of locking operations that use 3rd per-CPU node
+ * lock_use_node4 - # of locking operations that use 4th per-CPU node
+ * lock_no_node - # of locking operations without using per-CPU node
+ *
+ * Subtracting lock_use_node[234] from lock_slowpath will give you
+ * lock_use_node1.
*
* Writing to the "reset_counters" file will reset all the above counter
* values.
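Worked example with illustrative numbers (not taken from a real run): if lock_slowpath reads 1000 and lock_use_node2/3/4 read 12, 3 and 1, then 1000 - 12 - 3 - 1 = 984 slowpath acquisitions needed only the first per-CPU MCS node.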
@@ -55,9 +62,10 @@ enum qlock_stats {
qstat_pv_wait_node,
qstat_lock_pending,
qstat_lock_slowpath,
- qstat_lock_idx1,
- qstat_lock_idx2,
- qstat_lock_idx3,
+ qstat_lock_use_node2,
+ qstat_lock_use_node3,
+ qstat_lock_use_node4,
+ qstat_lock_no_node,
qstat_num, /* Total number of statistical counters */
qstat_reset_cnts = qstat_num,
};
@@ -85,9 +93,10 @@ static const char * const qstat_names[qstat_num + 1] = {
[qstat_pv_wait_node] = "pv_wait_node",
[qstat_lock_pending] = "lock_pending",
[qstat_lock_slowpath] = "lock_slowpath",
- [qstat_lock_idx1] = "lock_index1",
- [qstat_lock_idx2] = "lock_index2",
- [qstat_lock_idx3] = "lock_index3",
+ [qstat_lock_use_node2] = "lock_use_node2",
+ [qstat_lock_use_node3] = "lock_use_node3",
+ [qstat_lock_use_node4] = "lock_use_node4",
+ [qstat_lock_no_node] = "lock_no_node",
[qstat_reset_cnts] = "reset_counters",
};
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 50d9af615dc4..fbe96341beee 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -211,9 +211,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
* Ensure issuing the wakeup (either by us or someone else)
* after setting the reader waiter to nil.
*/
- wake_q_add(wake_q, tsk);
- /* wake_q_add() already take the task ref */
- put_task_struct(tsk);
+ wake_q_add_safe(wake_q, tsk);
}
adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
diff --git a/kernel/module.c b/kernel/module.c
index 2ad1b5239910..0b9aa8ab89f0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2719,11 +2719,7 @@ static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsig
{
if (!debug)
return;
-#ifdef CONFIG_DYNAMIC_DEBUG
- if (ddebug_add_module(debug, num, mod->name))
- pr_err("dynamic debug error adding module: %s\n",
- debug->modname);
-#endif
+ ddebug_add_module(debug, num, mod->name);
}
static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
diff --git a/kernel/panic.c b/kernel/panic.c
index f121e6ba7e11..0ae0d7332f12 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -642,16 +642,14 @@ static int clear_warn_once_set(void *data, u64 val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops,
- NULL,
- clear_warn_once_set,
- "%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE(clear_warn_once_fops, NULL, clear_warn_once_set,
+ "%lld\n");
static __init int register_warn_debugfs(void)
{
/* Don't care about failure */
- debugfs_create_file("clear_warn_once", 0200, NULL,
- NULL, &clear_warn_once_fops);
+ debugfs_create_file_unsafe("clear_warn_once", 0200, NULL, NULL,
+ &clear_warn_once_fops);
return 0;
}
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index d9dc2c38764a..7d66ee68aaaf 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -10,6 +10,7 @@
#include <linux/cpu.h>
#include <linux/cpumask.h>
+#include <linux/debugfs.h>
#include <linux/energy_model.h>
#include <linux/sched/topology.h>
#include <linux/slab.h>
@@ -23,6 +24,60 @@ static DEFINE_PER_CPU(struct em_perf_domain *, em_data);
*/
static DEFINE_MUTEX(em_pd_mutex);
+#ifdef CONFIG_DEBUG_FS
+static struct dentry *rootdir;
+
+static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd)
+{
+ struct dentry *d;
+ char name[24];
+
+ snprintf(name, sizeof(name), "cs:%lu", cs->frequency);
+
+ /* Create per-cs directory */
+ d = debugfs_create_dir(name, pd);
+ debugfs_create_ulong("frequency", 0444, d, &cs->frequency);
+ debugfs_create_ulong("power", 0444, d, &cs->power);
+ debugfs_create_ulong("cost", 0444, d, &cs->cost);
+}
+
+static int em_debug_cpus_show(struct seq_file *s, void *unused)
+{
+ seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private)));
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
+
+static void em_debug_create_pd(struct em_perf_domain *pd, int cpu)
+{
+ struct dentry *d;
+ char name[8];
+ int i;
+
+ snprintf(name, sizeof(name), "pd%d", cpu);
+
+ /* Create the directory of the performance domain */
+ d = debugfs_create_dir(name, rootdir);
+
+ debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops);
+
+ /* Create a sub-directory for each capacity state */
+ for (i = 0; i < pd->nr_cap_states; i++)
+ em_debug_create_cs(&pd->table[i], d);
+}
+
+static int __init em_debug_init(void)
+{
+ /* Create /sys/kernel/debug/energy_model directory */
+ rootdir = debugfs_create_dir("energy_model", NULL);
+
+ return 0;
+}
+core_initcall(em_debug_init);
+#else /* CONFIG_DEBUG_FS */
+static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {}
+#endif
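For reference, the code above produces a debugfs layout along these lines (illustrative; the frequencies and the CPU mask are made-up values):

    /sys/kernel/debug/energy_model/
        pd0/
            cpus                 <- printed with %*pbl, e.g. "0-3"
            cs:533000/
                frequency  power  cost
            cs:999000/
                frequency  power  cost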
static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
struct em_data_callback *cb)
{
@@ -102,6 +157,8 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
pd->nr_cap_states = nr_states;
cpumask_copy(to_cpumask(pd->cpus), span);
+ em_debug_create_pd(pd, cpu);
+
return pd;
free_cs_table:
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index b7a82502857a..9d22131afc1e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -582,10 +582,8 @@ static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d)
qos->pm_qos_power_miscdev.name = qos->name;
qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
- if (d) {
- (void)debugfs_create_file(qos->name, S_IRUGO, d,
- (void *)qos, &pm_qos_debug_fops);
- }
+ debugfs_create_file(qos->name, S_IRUGO, d, (void *)qos,
+ &pm_qos_debug_fops);
return misc_register(&qos->pm_qos_power_miscdev);
}
@@ -685,8 +683,6 @@ static int __init pm_qos_power_init(void)
BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
d = debugfs_create_dir("pm_qos", NULL);
- if (IS_ERR_OR_NULL(d))
- d = NULL;
for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
ret = register_pm_qos_misc(pm_qos_array[i], d);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 640b2034edd6..f08a1e4ee1d4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -965,6 +965,9 @@ void __init __register_nosave_region(unsigned long start_pfn,
/* This allocation cannot fail */
region = memblock_alloc(sizeof(struct nosave_region),
SMP_CACHE_BYTES);
+ if (!region)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(struct nosave_region));
}
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
@@ -1215,14 +1218,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(!PageHighMem(page));
- if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) ||
- PageReserved(page))
+ if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
+ return NULL;
+
+ if (PageReserved(page) || PageOffline(page))
return NULL;
if (page_is_guard(page))
@@ -1277,8 +1282,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(PageHighMem(page));
@@ -1286,6 +1291,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
return NULL;
+ if (PageOffline(page))
+ return NULL;
+
if (PageReserved(page)
&& (!kernel_page_present(page) || pfn_is_nosave(pfn)))
return NULL;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index d3d170374ceb..02ca827b8fac 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -65,6 +65,7 @@ int console_printk[4] = {
CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
};
+EXPORT_SYMBOL_GPL(console_printk);
atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0);
EXPORT_SYMBOL(ignore_console_lock_warning);
@@ -344,7 +345,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
enum log_flags {
LOG_NEWLINE = 2, /* text ended with a newline */
- LOG_PREFIX = 4, /* text started with a prefix */
LOG_CONT = 8, /* text is a fragment of a continuation line */
};
@@ -356,6 +356,9 @@ struct printk_log {
u8 facility; /* syslog facility */
u8 flags:5; /* internal record flags */
u8 level:3; /* syslog level */
+#ifdef CONFIG_PRINTK_CALLER
+ u32 caller_id; /* thread id or processor id */
+#endif
}
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
__packed __aligned(4)
@@ -422,7 +425,11 @@ static u64 exclusive_console_stop_seq;
static u64 clear_seq;
static u32 clear_idx;
+#ifdef CONFIG_PRINTK_CALLER
+#define PREFIX_MAX 48
+#else
#define PREFIX_MAX 32
+#endif
#define LOG_LINE_MAX (1024 - PREFIX_MAX)
#define LOG_LEVEL(v) ((v) & 0x07)
@@ -577,7 +584,7 @@ static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
}
/* insert record into the buffer, discard old ones, update heads */
-static int log_store(int facility, int level,
+static int log_store(u32 caller_id, int facility, int level,
enum log_flags flags, u64 ts_nsec,
const char *dict, u16 dict_len,
const char *text, u16 text_len)
@@ -625,6 +632,9 @@ static int log_store(int facility, int level,
msg->ts_nsec = ts_nsec;
else
msg->ts_nsec = local_clock();
+#ifdef CONFIG_PRINTK_CALLER
+ msg->caller_id = caller_id;
+#endif
memset(log_dict(msg) + dict_len, 0, pad_len);
msg->len = size;
@@ -688,12 +698,21 @@ static ssize_t msg_print_ext_header(char *buf, size_t size,
struct printk_log *msg, u64 seq)
{
u64 ts_usec = msg->ts_nsec;
+ char caller[20];
+#ifdef CONFIG_PRINTK_CALLER
+ u32 id = msg->caller_id;
+
+ snprintf(caller, sizeof(caller), ",caller=%c%u",
+ id & 0x80000000 ? 'C' : 'T', id & ~0x80000000);
+#else
+ caller[0] = '\0';
+#endif
do_div(ts_usec, 1000);
- return scnprintf(buf, size, "%u,%llu,%llu,%c;",
- (msg->facility << 3) | msg->level, seq, ts_usec,
- msg->flags & LOG_CONT ? 'c' : '-');
+ return scnprintf(buf, size, "%u,%llu,%llu,%c%s;",
+ (msg->facility << 3) | msg->level, seq, ts_usec,
+ msg->flags & LOG_CONT ? 'c' : '-', caller);
}
static ssize_t msg_print_ext_body(char *buf, size_t size,
@@ -1038,6 +1057,9 @@ void log_buf_vmcoreinfo_setup(void)
VMCOREINFO_OFFSET(printk_log, len);
VMCOREINFO_OFFSET(printk_log, text_len);
VMCOREINFO_OFFSET(printk_log, dict_len);
+#ifdef CONFIG_PRINTK_CALLER
+ VMCOREINFO_OFFSET(printk_log, caller_id);
+#endif
}
#endif
@@ -1122,14 +1144,7 @@ void __init setup_log_buf(int early)
if (!new_log_buf_len)
return;
- if (early) {
- new_log_buf =
- memblock_alloc(new_log_buf_len, LOG_ALIGN);
- } else {
- new_log_buf = memblock_alloc_nopanic(new_log_buf_len,
- LOG_ALIGN);
- }
-
+ new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
if (unlikely(!new_log_buf)) {
pr_err("log_buf_len: %lu bytes not available\n",
new_log_buf_len);
@@ -1236,10 +1251,23 @@ static size_t print_time(u64 ts, char *buf)
{
unsigned long rem_nsec = do_div(ts, 1000000000);
- return sprintf(buf, "[%5lu.%06lu] ",
+ return sprintf(buf, "[%5lu.%06lu]",
(unsigned long)ts, rem_nsec / 1000);
}
+#ifdef CONFIG_PRINTK_CALLER
+static size_t print_caller(u32 id, char *buf)
+{
+ char caller[12];
+
+ snprintf(caller, sizeof(caller), "%c%u",
+ id & 0x80000000 ? 'C' : 'T', id & ~0x80000000);
+ return sprintf(buf, "[%6s]", caller);
+}
+#else
+#define print_caller(id, buf) 0
+#endif
+
static size_t print_prefix(const struct printk_log *msg, bool syslog,
bool time, char *buf)
{
@@ -1247,8 +1275,17 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog,
if (syslog)
len = print_syslog((msg->facility << 3) | msg->level, buf);
+
if (time)
len += print_time(msg->ts_nsec, buf + len);
+
+ len += print_caller(msg->caller_id, buf + len);
+
+ if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) {
+ buf[len++] = ' ';
+ buf[len] = '\0';
+ }
+
return len;
}
@@ -1752,6 +1789,12 @@ static inline void printk_delay(void)
}
}
+static inline u32 printk_caller_id(void)
+{
+ return in_task() ? task_pid_nr(current) :
+ 0x80000000 + raw_smp_processor_id();
+}
+
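Taken together with print_caller() and msg_print_ext_header() earlier in this diff, the caller id is a single u32 whose top bit distinguishes interrupt from task context. A minimal userspace sketch follows; make_caller_id() and show_caller() are invented names, and the kernel helpers in_task(), task_pid_nr() and raw_smp_processor_id() are mocked by plain parameters.

#include <stdint.h>
#include <stdio.h>

static uint32_t make_caller_id(int in_task, uint32_t pid_or_cpu)
{
	/* Mirrors printk_caller_id(): tasks use their pid, interrupt
	 * context uses 0x80000000 + CPU number. */
	return in_task ? pid_or_cpu : 0x80000000u + pid_or_cpu;
}

static void show_caller(uint32_t id)
{
	char caller[12];

	/* Same formatting as print_caller() and the ",caller=" ext field. */
	snprintf(caller, sizeof(caller), "%c%u",
		 id & 0x80000000u ? 'C' : 'T', id & ~0x80000000u);
	printf("[%6s]\n", caller);
}

int main(void)
{
	show_caller(make_caller_id(1, 1234));	/* "[ T1234]": task with pid 1234 */
	show_caller(make_caller_id(0, 2));	/* "[    C2]": interrupt on CPU 2 */
	return 0;
}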
/*
* Continuation lines are buffered, and not committed to the record buffer
* until the line is complete, or a race forces it. The line fragments
@@ -1761,7 +1804,7 @@ static inline void printk_delay(void)
static struct cont {
char buf[LOG_LINE_MAX];
size_t len; /* length == 0 means unused buffer */
- struct task_struct *owner; /* task of first print*/
+ u32 caller_id; /* printk_caller_id() of first print */
u64 ts_nsec; /* time of first print */
u8 level; /* log level of first message */
u8 facility; /* log facility of first message */
@@ -1773,12 +1816,13 @@ static void cont_flush(void)
if (cont.len == 0)
return;
- log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec,
- NULL, 0, cont.buf, cont.len);
+ log_store(cont.caller_id, cont.facility, cont.level, cont.flags,
+ cont.ts_nsec, NULL, 0, cont.buf, cont.len);
cont.len = 0;
}
-static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len)
+static bool cont_add(u32 caller_id, int facility, int level,
+ enum log_flags flags, const char *text, size_t len)
{
/* If the line gets too long, split it up in separate records. */
if (cont.len + len > sizeof(cont.buf)) {
@@ -1789,7 +1833,7 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char *
if (!cont.len) {
cont.facility = facility;
cont.level = level;
- cont.owner = current;
+ cont.caller_id = caller_id;
cont.ts_nsec = local_clock();
cont.flags = flags;
}
@@ -1809,13 +1853,15 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char *
static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len)
{
+ const u32 caller_id = printk_caller_id();
+
/*
* If an earlier line was buffered, and we're a continuation
- * write from the same process, try to add it to the buffer.
+ * write from the same context, try to add it to the buffer.
*/
if (cont.len) {
- if (cont.owner == current && (lflags & LOG_CONT)) {
- if (cont_add(facility, level, lflags, text, text_len))
+ if (cont.caller_id == caller_id && (lflags & LOG_CONT)) {
+ if (cont_add(caller_id, facility, level, lflags, text, text_len))
return text_len;
}
/* Otherwise, make sure it's flushed */
@@ -1828,12 +1874,13 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c
/* If it doesn't end in a newline, try to buffer the current line */
if (!(lflags & LOG_NEWLINE)) {
- if (cont_add(facility, level, lflags, text, text_len))
+ if (cont_add(caller_id, facility, level, lflags, text, text_len))
return text_len;
}
/* Store it in the record log */
- return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len);
+ return log_store(caller_id, facility, level, lflags, 0,
+ dict, dictlen, text, text_len);
}
/* Must be called under logbuf_lock. */
@@ -1867,9 +1914,6 @@ int vprintk_store(int facility, int level,
case '0' ... '7':
if (level == LOGLEVEL_DEFAULT)
level = kern_level - '0';
- /* fallthrough */
- case 'd': /* KERN_DEFAULT */
- lflags |= LOG_PREFIX;
break;
case 'c': /* KERN_CONT */
lflags |= LOG_CONT;
@@ -1884,7 +1928,7 @@ int vprintk_store(int facility, int level,
level = default_message_loglevel;
if (dict)
- lflags |= LOG_PREFIX|LOG_NEWLINE;
+ lflags |= LOG_NEWLINE;
return log_output(facility, level, lflags,
dict, dictlen, text, text_len);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 771e93f9c43f..6f357f4fc859 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -29,6 +29,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/cn_proc.h>
#include <linux/compat.h>
+#include <linux/sched/signal.h>
/*
* Access another process' address space via ptrace.
@@ -924,18 +925,26 @@ int ptrace_request(struct task_struct *child, long request,
ret = ptrace_setsiginfo(child, &siginfo);
break;
- case PTRACE_GETSIGMASK:
+ case PTRACE_GETSIGMASK: {
+ sigset_t *mask;
+
if (addr != sizeof(sigset_t)) {
ret = -EINVAL;
break;
}
- if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
+ if (test_tsk_restore_sigmask(child))
+ mask = &child->saved_sigmask;
+ else
+ mask = &child->blocked;
+
+ if (copy_to_user(datavp, mask, sizeof(sigset_t)))
ret = -EFAULT;
else
ret = 0;
break;
+ }
case PTRACE_SETSIGMASK: {
sigset_t new_set;
@@ -961,6 +970,8 @@ int ptrace_request(struct task_struct *child, long request,
child->blocked = new_set;
spin_unlock_irq(&child->sighand->siglock);
+ clear_tsk_restore_sigmask(child);
+
ret = 0;
break;
}
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 939a2056c87a..37301430970e 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -87,36 +87,6 @@ config RCU_STALL_COMMON
config RCU_NEED_SEGCBLIST
def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU )
-config CONTEXT_TRACKING
- bool
-
-config CONTEXT_TRACKING_FORCE
- bool "Force context tracking"
- depends on CONTEXT_TRACKING
- default y if !NO_HZ_FULL
- help
- The major pre-requirement for full dynticks to work is to
- support the context tracking subsystem. But there are also
- other dependencies to provide in order to make the full
- dynticks working.
-
- This option stands for testing when an arch implements the
- context tracking backend but doesn't yet fullfill all the
- requirements to make the full dynticks feature working.
- Without the full dynticks, there is no way to test the support
- for context tracking and the subsystems that rely on it: RCU
- userspace extended quiescent state and tickless cputime
- accounting. This option copes with the absence of the full
- dynticks subsystem by forcing the context tracking on all
- CPUs in the system.
-
- Say Y only if you're working on the development of an
- architecture backend for the context tracking.
-
- Say N otherwise, this option brings an overhead that you
- don't want in production.
-
-
config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
range 2 64 if 64BIT
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index a393e24a9195..acee72c0b24b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Read-Copy Update definitions shared among RCU implementations.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2011
*
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#ifndef __LINUX_RCU_H
@@ -30,7 +17,7 @@
#define RCU_TRACE(stmt)
#endif /* #else #ifdef CONFIG_RCU_TRACE */
-/* Offset to allow for unmatched rcu_irq_{enter,exit}(). */
+/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1)
@@ -462,8 +449,6 @@ void rcu_request_urgent_qs_task(struct task_struct *t);
enum rcutorture_type {
RCU_FLAVOR,
- RCU_BH_FLAVOR,
- RCU_SCHED_FLAVOR,
RCU_TASKS_FLAVOR,
SRCU_FLAVOR,
INVALID_RCU_FLAVOR
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 5aff271adf1e..9bd5f6023c21 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* RCU segmented callback lists, function definitions
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2017
*
- * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#include <linux/types.h>
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 948470cef385..71b64648464e 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* RCU segmented callback lists, internal-to-rcu header file
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2017
*
- * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#include <linux/rcu_segcblist.h>
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index b459da70b4fc..c29761152874 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update module-based performance-test facility
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2015
*
- * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#define pr_fmt(fmt) fmt
@@ -54,7 +41,7 @@
#include "rcu.h"
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
#define PERF_FLAG "-perf:"
#define PERFOUT_STRING(s) \
@@ -83,13 +70,19 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
* Various other use cases may of course be specified.
*/
+#ifdef MODULE
+# define RCUPERF_SHUTDOWN 0
+#else
+# define RCUPERF_SHUTDOWN 1
+#endif
+
torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
-torture_param(bool, shutdown, !IS_ENABLED(MODULE),
+torture_param(bool, shutdown, RCUPERF_SHUTDOWN,
"Shutdown at end of performance tests.");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
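
The hunk above replaces the `!IS_ENABLED(MODULE)` default for the shutdown parameter with an explicit preprocessor block, since MODULE is simply defined (or not) by kbuild rather than being a CONFIG_ symbol. A minimal sketch of the same compile-time-default idiom, with hypothetical names:

/*
 * Sketch only (hypothetical symbol names, not part of the patch).
 * MODULE is defined by the build system only when the file is built
 * as a loadable module, so the default flips accordingly.
 */
#ifdef MODULE
# define EXAMPLE_DEFAULT_SHUTDOWN 0	/* Module: let the user unload it. */
#else
# define EXAMPLE_DEFAULT_SHUTDOWN 1	/* Built-in: shut down when done. */
#endif

static bool example_shutdown = EXAMPLE_DEFAULT_SHUTDOWN;
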
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index f6e85faa4ff4..f14d1b18a74f 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update module-based torture test facility
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2005, 2006
*
- * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
* Josh Triplett <josh@joshtriplett.org>
*
* See also: Documentation/RCU/torture.txt
@@ -61,7 +48,7 @@
#include "rcu.h"
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
/* Bits for ->extendables field, extendables param, and related definitions. */
@@ -1630,21 +1617,34 @@ static bool rcu_fwd_emergency_stop;
#define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */
#define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */
#define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. */
-static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)];
+struct rcu_launder_hist {
+ long n_launders;
+ unsigned long launder_gp_seq;
+};
+#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV))
+static struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
+static unsigned long rcu_launder_gp_seq_start;
static void rcu_torture_fwd_cb_hist(void)
{
+ unsigned long gps;
+ unsigned long gps_old;
int i;
int j;
for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--)
- if (n_launders_hist[i] > 0)
+ if (n_launders_hist[i].n_launders > 0)
break;
pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
__func__, jiffies - rcu_fwd_startat);
- for (j = 0; j <= i; j++)
- pr_cont(" %ds/%d: %ld",
- j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j]);
+ gps_old = rcu_launder_gp_seq_start;
+ for (j = 0; j <= i; j++) {
+ gps = n_launders_hist[j].launder_gp_seq;
+ pr_cont(" %ds/%d: %ld:%ld",
+ j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j].n_launders,
+ rcutorture_seq_diff(gps, gps_old));
+ gps_old = gps;
+ }
pr_cont("\n");
}
@@ -1666,7 +1666,8 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
if (i >= ARRAY_SIZE(n_launders_hist))
i = ARRAY_SIZE(n_launders_hist) - 1;
- n_launders_hist[i]++;
+ n_launders_hist[i].n_launders++;
+ n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq();
spin_unlock_irqrestore(&rcu_fwd_lock, flags);
}
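
Each histogram bucket now records, alongside its callback count, the grace-period sequence number observed at the most recent callback landing in that bucket, which lets the printout show how many grace periods elapsed per bucket via rcutorture_seq_diff(). A rough sketch of that arithmetic, under the assumption (as in the kernel's rcu_seq_*() helpers) that the low two bits of the sequence hold state rather than count; the in-tree rcutorture_seq_diff() is the authoritative version:

/*
 * Sketch only: approximate number of grace periods between two
 * ->gp_seq samples, assuming a two-bit state field in the low bits.
 */
static long example_gp_delta(unsigned long newseq, unsigned long oldseq)
{
	return (long)((newseq - oldseq) >> 2);
}
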
@@ -1786,9 +1787,10 @@ static void rcu_torture_fwd_prog_cr(void)
n_max_cbs = 0;
n_max_gps = 0;
for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++)
- n_launders_hist[i] = 0;
+ n_launders_hist[i].n_launders = 0;
cver = READ_ONCE(rcu_torture_current_version);
gps = cur_ops->get_gp_seq();
+ rcu_launder_gp_seq_start = gps;
while (time_before(jiffies, stopat) &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
rfcp = READ_ONCE(rcu_fwd_cb_head);
@@ -2228,6 +2230,14 @@ static void rcu_test_debug_objects(void)
#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
}
+static void rcutorture_sync(void)
+{
+ static unsigned long n;
+
+ if (cur_ops->sync && !(++n & 0xfff))
+ cur_ops->sync();
+}
+
static int __init
rcu_torture_init(void)
{
@@ -2389,7 +2399,8 @@ rcu_torture_init(void)
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
if (firsterr)
goto unwind;
- firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval);
+ firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval,
+ rcutorture_sync);
if (firsterr)
goto unwind;
firsterr = rcu_torture_stall_init();
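
The rcutorture_sync() helper added above is handed to torture_onoff_init() as a new third argument, and its `!(++n & 0xfff)` test lets roughly one call in 4096 through so that a possibly expensive cur_ops->sync() does not dominate CPU-hotplug testing. The same throttling idiom in isolation, with hypothetical names:

/*
 * Sketch only: run an expensive hook roughly once every 4096
 * invocations.  Masking with 0xfff keeps the low 12 bits, so the test
 * succeeds only when they are all zero.
 */
static void example_occasional_sync(void (*hook)(void))
{
	static unsigned long calls;

	if (hook && !(++calls & 0xfff))
		hook();
}
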
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 32dfd6522548..5d4a39a6505a 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -1,24 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Sleepable Read-Copy Update mechanism for mutual exclusion,
* tiny version for non-preemptible single-CPU use.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2017
*
- * Author: Paul McKenney <paulmck@us.ibm.com>
+ * Author: Paul McKenney <paulmck@linux.ibm.com>
*/
#include <linux/export.h>
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 3600d88d8956..a60b8ba9e1ac 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1,24 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Sleepable Read-Copy Update mechanism for mutual exclusion.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2006
* Copyright (C) Fujitsu, 2012
*
- * Author: Paul McKenney <paulmck@us.ibm.com>
+ * Author: Paul McKenney <paulmck@linux.ibm.com>
* Lai Jiangshan <laijs@cn.fujitsu.com>
*
* For detailed explanation of Read-Copy Update mechanism see -
@@ -58,6 +45,7 @@ static bool __read_mostly srcu_init_done;
static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay);
static void process_srcu(struct work_struct *work);
+static void srcu_delay_timer(struct timer_list *t);
/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
#define spin_lock_rcu_node(p) \
@@ -156,7 +144,8 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
snp->grphi = cpu;
}
sdp->cpu = cpu;
- INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks);
+ INIT_WORK(&sdp->work, srcu_invoke_callbacks);
+ timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
sdp->ssp = ssp;
sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
if (is_static)
@@ -386,13 +375,19 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
} else {
flush_delayed_work(&ssp->work);
}
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
+ struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
+
if (quiesced) {
- if (WARN_ON(delayed_work_pending(&per_cpu_ptr(ssp->sda, cpu)->work)))
+ if (WARN_ON(timer_pending(&sdp->delay_work)))
+ return; /* Just leak it! */
+ if (WARN_ON(work_pending(&sdp->work)))
return; /* Just leak it! */
} else {
- flush_delayed_work(&per_cpu_ptr(ssp->sda, cpu)->work);
+ del_timer_sync(&sdp->delay_work);
+ flush_work(&sdp->work);
}
+ }
if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
WARN_ON(srcu_readers_active(ssp))) {
pr_info("%s: Active srcu_struct %p state: %d\n",
@@ -463,39 +458,23 @@ static void srcu_gp_start(struct srcu_struct *ssp)
WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
}
-/*
- * Track online CPUs to guide callback workqueue placement.
- */
-DEFINE_PER_CPU(bool, srcu_online);
-void srcu_online_cpu(unsigned int cpu)
+static void srcu_delay_timer(struct timer_list *t)
{
- WRITE_ONCE(per_cpu(srcu_online, cpu), true);
-}
+ struct srcu_data *sdp = container_of(t, struct srcu_data, delay_work);
-void srcu_offline_cpu(unsigned int cpu)
-{
- WRITE_ONCE(per_cpu(srcu_online, cpu), false);
+ queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
}
-/*
- * Place the workqueue handler on the specified CPU if online, otherwise
- * just run it whereever. This is useful for placing workqueue handlers
- * that are to invoke the specified CPU's callbacks.
- */
-static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
- struct delayed_work *dwork,
+static void srcu_queue_delayed_work_on(struct srcu_data *sdp,
unsigned long delay)
{
- bool ret;
+ if (!delay) {
+ queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
+ return;
+ }
- preempt_disable();
- if (READ_ONCE(per_cpu(srcu_online, cpu)))
- ret = queue_delayed_work_on(cpu, wq, dwork, delay);
- else
- ret = queue_delayed_work(wq, dwork, delay);
- preempt_enable();
- return ret;
+ timer_reduce(&sdp->delay_work, jiffies + delay);
}
/*
@@ -504,7 +483,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
*/
static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
{
- srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay);
+ srcu_queue_delayed_work_on(sdp, delay);
}
/*
@@ -1186,7 +1165,8 @@ static void srcu_invoke_callbacks(struct work_struct *work)
struct srcu_data *sdp;
struct srcu_struct *ssp;
- sdp = container_of(work, struct srcu_data, work.work);
+ sdp = container_of(work, struct srcu_data, work);
+
ssp = sdp->ssp;
rcu_cblist_init(&ready_cbs);
spin_lock_irq_rcu_node(sdp);
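
The srcutree.c changes above replace a delayed_work with a plain work_struct plus a timer_list, so that delayed queuing goes through a timer whose handler always re-queues the work on the intended CPU, and so that timer_reduce() can only move a pending expiry earlier. A generic sketch of that handoff pattern, with hypothetical names:

/*
 * Sketch only (hypothetical names): timer-to-workqueue handoff.  The
 * fields would be initialized with INIT_WORK() and timer_setup(), as
 * in the init_srcu_struct_nodes() hunk above.
 */
struct example_data {
	struct work_struct work;
	struct timer_list delay_timer;
	int cpu;
};

static void example_delay_timer(struct timer_list *t)
{
	struct example_data *d = from_timer(d, t, delay_timer);

	/* Timer fired: queue the work on the chosen CPU. */
	queue_work_on(d->cpu, system_wq, &d->work);
}

static void example_queue_delayed_on(struct example_data *d,
				     unsigned long delay)
{
	if (!delay) {
		queue_work_on(d->cpu, system_wq, &d->work);
		return;
	}
	/* timer_reduce() only ever moves an already-pending expiry earlier. */
	timer_reduce(&d->delay_timer, jiffies + delay);
}
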
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index be10036fa621..a8304d90573f 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* RCU-based infrastructure for lightweight reader-writer locking
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (c) 2015, Red Hat, Inc.
*
* Author: Oleg Nesterov <oleg@redhat.com>
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 5f5963ba313e..911bd9076d43 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2008
*
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU
@@ -76,7 +63,7 @@ void rcu_qs(void)
* be called from hardirq context. It is normally called from the
* scheduling-clock interrupt.
*/
-void rcu_check_callbacks(int user)
+void rcu_sched_clock_irq(int user)
{
if (user) {
rcu_qs();
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9180158756d2..acd6ccf56faf 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1,27 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update mechanism for mutual exclusion
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2008
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
- * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
+ * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical version
*
- * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
*
* For detailed explanation of Read-Copy Update mechanism see -
@@ -62,6 +49,8 @@
#include <linux/suspend.h>
#include <linux/ftrace.h>
#include <linux/tick.h>
+#include <linux/sysrq.h>
+#include <linux/kprobes.h>
#include "tree.h"
#include "rcu.h"
@@ -115,6 +104,9 @@ int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
/* panic() on RCU Stall sysctl. */
int sysctl_panic_on_rcu_stall __read_mostly;
+/* Commandeer a sysrq key to dump RCU's tree. */
+static bool sysrq_rcu;
+module_param(sysrq_rcu, bool, 0444);
/*
* The rcu_scheduler_active variable is initialized to the value
@@ -479,7 +471,6 @@ module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next
module_param(rcu_kick_kthreads, bool, 0644);
static void force_qs_rnp(int (*f)(struct rcu_data *rdp));
-static void force_quiescent_state(void);
static int rcu_pending(void);
/*
@@ -504,13 +495,12 @@ unsigned long rcu_exp_batches_completed(void)
EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
/*
- * Force a quiescent state.
+ * Return the root node of the rcu_state structure.
*/
-void rcu_force_quiescent_state(void)
+static struct rcu_node *rcu_get_root(void)
{
- force_quiescent_state();
+ return &rcu_state.node[0];
}
-EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
/*
* Convert a ->gp_state value to a character string.
@@ -529,19 +519,30 @@ void show_rcu_gp_kthreads(void)
{
int cpu;
unsigned long j;
+ unsigned long ja;
+ unsigned long jr;
+ unsigned long jw;
struct rcu_data *rdp;
struct rcu_node *rnp;
- j = jiffies - READ_ONCE(rcu_state.gp_activity);
- pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %ld\n",
+ j = jiffies;
+ ja = j - READ_ONCE(rcu_state.gp_activity);
+ jr = j - READ_ONCE(rcu_state.gp_req_activity);
+ jw = j - READ_ONCE(rcu_state.gp_wake_time);
+ pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
rcu_state.name, gp_state_getname(rcu_state.gp_state),
- rcu_state.gp_state, rcu_state.gp_kthread->state, j);
+ rcu_state.gp_state,
+ rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL,
+ ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq),
+ (long)READ_ONCE(rcu_state.gp_seq),
+ (long)READ_ONCE(rcu_get_root()->gp_seq_needed),
+ READ_ONCE(rcu_state.gp_flags));
rcu_for_each_node_breadth_first(rnp) {
if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
continue;
- pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n",
- rnp->grplo, rnp->grphi, rnp->gp_seq,
- rnp->gp_seq_needed);
+ pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
+ rnp->grplo, rnp->grphi, (long)rnp->gp_seq,
+ (long)rnp->gp_seq_needed);
if (!rcu_is_leaf_node(rnp))
continue;
for_each_leaf_node_possible_cpu(rnp, cpu) {
@@ -550,14 +551,35 @@ void show_rcu_gp_kthreads(void)
ULONG_CMP_GE(rcu_state.gp_seq,
rdp->gp_seq_needed))
continue;
- pr_info("\tcpu %d ->gp_seq_needed %lu\n",
- cpu, rdp->gp_seq_needed);
+ pr_info("\tcpu %d ->gp_seq_needed %ld\n",
+ cpu, (long)rdp->gp_seq_needed);
}
}
/* sched_show_task(rcu_state.gp_kthread); */
}
EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
+/* Dump grace-period-request information due to commandeered sysrq. */
+static void sysrq_show_rcu(int key)
+{
+ show_rcu_gp_kthreads();
+}
+
+static struct sysrq_key_op sysrq_rcudump_op = {
+ .handler = sysrq_show_rcu,
+ .help_msg = "show-rcu(y)",
+ .action_msg = "Show RCU tree",
+ .enable_mask = SYSRQ_ENABLE_DUMP,
+};
+
+static int __init rcu_sysrq_init(void)
+{
+ if (sysrq_rcu)
+ return register_sysrq_key('y', &sysrq_rcudump_op);
+ return 0;
+}
+early_initcall(rcu_sysrq_init);
+
/*
* Send along grace-period-related data for rcutorture diagnostics.
*/
@@ -566,8 +588,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
{
switch (test_type) {
case RCU_FLAVOR:
- case RCU_BH_FLAVOR:
- case RCU_SCHED_FLAVOR:
*flags = READ_ONCE(rcu_state.gp_flags);
*gp_seq = rcu_seq_current(&rcu_state.gp_seq);
break;
@@ -578,14 +598,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
/*
- * Return the root node of the rcu_state structure.
- */
-static struct rcu_node *rcu_get_root(void)
-{
- return &rcu_state.node[0];
-}
-
-/*
* Enter an RCU extended quiescent state, which can be either the
* idle loop or adaptive-tickless usermode execution.
*
@@ -701,7 +713,6 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
/**
* rcu_nmi_exit - inform RCU of exit from NMI context
- * @irq: Is this call from rcu_irq_exit?
*
* If you add or remove a call to rcu_nmi_exit(), be sure to test
* with CONFIG_RCU_EQS_DEBUG=y.
@@ -872,6 +883,7 @@ void rcu_nmi_enter(void)
{
rcu_nmi_enter_common(false);
}
+NOKPROBE_SYMBOL(rcu_nmi_enter);
/**
* rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -1115,7 +1127,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
}
/*
- * NO_HZ_FULL CPUs can run in-kernel without rcu_check_callbacks!
+ * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq!
* The above code handles this, but only for straight cond_resched().
* And some in-kernel loops check need_resched() before calling
* cond_resched(), which defeats the above code for CPUs that are
@@ -1181,7 +1193,7 @@ static void rcu_check_gp_kthread_starvation(void)
pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
rcu_state.name, j,
(long)rcu_seq_current(&rcu_state.gp_seq),
- rcu_state.gp_flags,
+ READ_ONCE(rcu_state.gp_flags),
gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
if (gpk) {
@@ -1310,7 +1322,7 @@ static void print_other_cpu_stall(unsigned long gp_seq)
panic_on_rcu_stall();
- force_quiescent_state(); /* Kick them all. */
+ rcu_force_quiescent_state(); /* Kick them all. */
}
static void print_cpu_stall(void)
@@ -1557,17 +1569,28 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
}
/*
- * Awaken the grace-period kthread. Don't do a self-awaken, and don't
- * bother awakening when there is nothing for the grace-period kthread
- * to do (as in several CPUs raced to awaken, and we lost), and finally
- * don't try to awaken a kthread that has not yet been created.
+ * Awaken the grace-period kthread. Don't do a self-awaken (unless in
+ * an interrupt or softirq handler), and don't bother awakening when there
+ * is nothing for the grace-period kthread to do (as in several CPUs raced
+ * to awaken, and we lost), and finally don't try to awaken a kthread that
+ * has not yet been created. If all those checks are passed, track some
+ * debug information and awaken.
+ *
+ * So why do the self-wakeup when in an interrupt or softirq handler
+ * in the grace-period kthread's context? Because the kthread might have
+ * been interrupted just as it was going to sleep, and just after the final
+ * pre-sleep check of the awaken condition. In this case, a wakeup really
+ * is required, and is therefore supplied.
*/
static void rcu_gp_kthread_wake(void)
{
- if (current == rcu_state.gp_kthread ||
+ if ((current == rcu_state.gp_kthread &&
+ !in_interrupt() && !in_serving_softirq()) ||
!READ_ONCE(rcu_state.gp_flags) ||
!rcu_state.gp_kthread)
return;
+ WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
+ WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
swake_up_one(&rcu_state.gp_wq);
}
@@ -1711,7 +1734,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
zero_cpu_stall_ticks(rdp);
}
rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
- if (ULONG_CMP_GE(rnp->gp_seq_needed, rdp->gp_seq_needed) || rdp->gpwrap)
+ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
rdp->gp_seq_needed = rnp->gp_seq_needed;
WRITE_ONCE(rdp->gpwrap, false);
rcu_gpnum_ovf(rnp, rdp);
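
The __note_gp_changes() hunk above switches the ->gp_seq_needed test to ULONG_CMP_LT() with the rcu_data field compared against the rcu_node field. These helpers exist because grace-period sequence numbers can wrap, so the plain comparison operators give the wrong answer near the wrap point. A minimal illustration; the macro definitions are quoted from memory and should be checked against include/linux/rcupdate.h:

/*
 * Illustration only.  Assumed definitions:
 *	#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
 *	#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))
 * With a = ULONG_MAX - 1 and b = 2 (that is, "b" has already wrapped),
 * plain "a < b" is false, yet a really is behind b modulo wrap.
 */
static bool example_seq_behind(unsigned long a, unsigned long b)
{
	return ULONG_MAX / 2 < a - b;	/* Wrap-safe "a < b". */
}
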
@@ -1939,7 +1962,7 @@ static void rcu_gp_fqs_loop(void)
if (!ret) {
rcu_state.jiffies_force_qs = jiffies + j;
WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
- jiffies + 3 * j);
+ jiffies + (j ? 3 * j : 2));
}
trace_rcu_grace_period(rcu_state.name,
READ_ONCE(rcu_state.gp_seq),
@@ -2497,14 +2520,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
}
/*
- * Check to see if this CPU is in a non-context-switch quiescent state
- * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
- * Also schedule RCU core processing.
- *
- * This function must be called from hardirq context. It is normally
- * invoked from the scheduling-clock interrupt.
+ * This function is invoked from each scheduling-clock interrupt,
+ * and checks to see if this CPU is in a non-context-switch quiescent
+ * state, for example, user mode or idle loop. It also schedules RCU
+ * core processing. If the current grace period has gone on too long,
+ * it will ask the scheduler to manufacture a context switch for the sole
+ * purpose of providing the needed quiescent state.
*/
-void rcu_check_callbacks(int user)
+void rcu_sched_clock_irq(int user)
{
trace_rcu_utilization(TPS("Start scheduler-tick"));
raw_cpu_inc(rcu_data.ticks_this_gp);
@@ -2517,7 +2540,7 @@ void rcu_check_callbacks(int user)
}
__this_cpu_write(rcu_data.rcu_urgent_qs, false);
}
- rcu_flavor_check_callbacks(user);
+ rcu_flavor_sched_clock_irq(user);
if (rcu_pending())
invoke_rcu_core();
@@ -2578,7 +2601,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
* Force quiescent states on reluctant CPUs, and also detect which
* CPUs are in dyntick-idle mode.
*/
-static void force_quiescent_state(void)
+void rcu_force_quiescent_state(void)
{
unsigned long flags;
bool ret;
@@ -2610,6 +2633,7 @@ static void force_quiescent_state(void)
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
rcu_gp_kthread_wake();
}
+EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
/*
* This function checks for grace-period requests that fail to motivate
@@ -2657,16 +2681,11 @@ rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
- pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n",
- __func__, (long)READ_ONCE(rcu_state.gp_seq),
- (long)READ_ONCE(rnp_root->gp_seq_needed),
- j - rcu_state.gp_req_activity, j - rcu_state.gp_activity,
- rcu_state.gp_flags, rcu_state.gp_state, rcu_state.name,
- rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL);
WARN_ON(1);
if (rnp_root != rnp)
raw_spin_unlock_rcu_node(rnp_root);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ show_rcu_gp_kthreads();
}
/*
@@ -2711,12 +2730,8 @@ void rcu_fwd_progress_check(unsigned long j)
}
EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
-/*
- * This does the RCU core processing work for the specified rcu_data
- * structures. This may be called only from the CPU to whom the rdp
- * belongs.
- */
-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
+/* Perform RCU core processing work for the current CPU. */
+static __latent_entropy void rcu_core(struct softirq_action *unused)
{
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
@@ -2801,9 +2816,9 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
/*
* Force the grace period if too many callbacks or too long waiting.
- * Enforce hysteresis, and don't invoke force_quiescent_state()
+ * Enforce hysteresis, and don't invoke rcu_force_quiescent_state()
* if some other CPU has recently done so. Also, don't bother
- * invoking force_quiescent_state() if the newly enqueued callback
+ * invoking rcu_force_quiescent_state() if the newly enqueued callback
* is the only one waiting for a grace period to complete.
*/
if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
@@ -2820,7 +2835,7 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
rdp->blimit = LONG_MAX;
if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
- force_quiescent_state();
+ rcu_force_quiescent_state();
rdp->n_force_qs_snap = rcu_state.n_force_qs;
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
}
@@ -2889,9 +2904,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
rcu_segcblist_init(&rdp->cblist);
}
rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
- if (!lazy)
- rcu_idle_count_callbacks_posted();
-
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rcu_state.name, head,
(unsigned long)func,
@@ -2961,6 +2973,79 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);
+/*
+ * During early boot, any blocking grace-period wait automatically
+ * implies a grace period. Later on, this is never the case for PREEMPT.
+ *
+ * However, because a context switch is a grace period for !PREEMPT, any
+ * blocking grace-period wait automatically implies a grace period if
+ * there is only one CPU online at any point in time during execution of
+ * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
+ * occasionally incorrectly indicate that there are multiple CPUs online
+ * when there was in fact only one the whole time, as this just adds some
+ * overhead: RCU still operates correctly.
+ */
+static int rcu_blocking_is_gp(void)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
+ might_sleep(); /* Check for RCU read-side critical section. */
+ preempt_disable();
+ ret = num_online_cpus() <= 1;
+ preempt_enable();
+ return ret;
+}
+
+/**
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed. Note, however, that
+ * upon return from synchronize_rcu(), the caller might well be executing
+ * concurrently with new RCU read-side critical sections that began while
+ * synchronize_rcu() was waiting. RCU read-side critical sections are
+ * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
+ * In addition, regions of code across which interrupts, preemption, or
+ * softirqs have been disabled also serve as RCU read-side critical
+ * sections. This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
+ *
+ * Note that this guarantee implies further memory-ordering guarantees.
+ * On systems with more than one CPU, when synchronize_rcu() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since
+ * the end of its last RCU read-side critical section whose beginning
+ * preceded the call to synchronize_rcu(). In addition, each CPU having
+ * an RCU read-side critical section that extends beyond the return from
+ * synchronize_rcu() is guaranteed to have executed a full memory barrier
+ * after the beginning of synchronize_rcu() and before the beginning of
+ * that RCU read-side critical section. Note that these guarantees include
+ * CPUs that are offline, idle, or executing in user mode, as well as CPUs
+ * that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked synchronize_rcu(), which returned
+ * to its caller on CPU B, then both CPU A and CPU B are guaranteed
+ * to have executed a full memory barrier during the execution of
+ * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
+ * again only if the system has more than one CPU).
+ */
+void synchronize_rcu(void)
+{
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu() in RCU read-side critical section");
+ if (rcu_blocking_is_gp())
+ return;
+ if (rcu_gp_is_expedited())
+ synchronize_rcu_expedited();
+ else
+ wait_rcu_gp(call_rcu);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+
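
A usage sketch only (hypothetical names, not part of the patch) of the classic unpublish-wait-reclaim pattern that the synchronize_rcu() kernel-doc above describes:

/* Sketch: retire an RCU-protected object via synchronize_rcu(). */
struct example_obj {
	int data;
};

static DEFINE_SPINLOCK(example_lock);
static struct example_obj __rcu *example_ptr;

static void example_retire(void)
{
	struct example_obj *old;

	spin_lock(&example_lock);
	old = rcu_dereference_protected(example_ptr,
					lockdep_is_held(&example_lock));
	RCU_INIT_POINTER(example_ptr, NULL);	/* Unpublish. */
	spin_unlock(&example_lock);

	synchronize_rcu();	/* Wait for pre-existing readers. */
	kfree(old);		/* No reader can still reference "old". */
}
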
/**
* get_state_synchronize_rcu - Snapshot current RCU state
*
@@ -3049,28 +3134,6 @@ static int rcu_pending(void)
}
/*
- * Return true if the specified CPU has any callback. If all_lazy is
- * non-NULL, store an indication of whether all callbacks are lazy.
- * (If there are no callbacks, all of them are deemed to be lazy.)
- */
-static bool rcu_cpu_has_callbacks(bool *all_lazy)
-{
- bool al = true;
- bool hc = false;
- struct rcu_data *rdp;
-
- rdp = this_cpu_ptr(&rcu_data);
- if (!rcu_segcblist_empty(&rdp->cblist)) {
- hc = true;
- if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist))
- al = false;
- }
- if (all_lazy)
- *all_lazy = al;
- return hc;
-}
-
-/*
* Helper function for rcu_barrier() tracing. If tracing is disabled,
* the compiler is expected to optimize this away.
*/
@@ -3299,7 +3362,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcu_prepare_kthreads(cpu);
- rcu_spawn_all_nocb_kthreads(cpu);
+ rcu_spawn_cpu_nocb_kthread(cpu);
return 0;
}
@@ -3329,8 +3392,6 @@ int rcutree_online_cpu(unsigned int cpu)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->ffmask |= rdp->grpmask;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- if (IS_ENABLED(CONFIG_TREE_SRCU))
- srcu_online_cpu(cpu);
if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return 0; /* Too early in boot for scheduler work. */
sync_sched_exp_online_cleanup(cpu);
@@ -3355,8 +3416,6 @@ int rcutree_offline_cpu(unsigned int cpu)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcutree_affinity_setting(cpu, cpu);
- if (IS_ENABLED(CONFIG_TREE_SRCU))
- srcu_offline_cpu(cpu);
return 0;
}
@@ -3777,7 +3836,7 @@ void __init rcu_init(void)
rcu_init_one();
if (dump_tree)
rcu_dump_rcu_node_tree();
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+ open_softirq(RCU_SOFTIRQ, rcu_core);
/*
* We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index d90b02b53c0e..bb4f995f2d3f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -1,25 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Read-Copy Update mechanism for mutual exclusion (tree-based version)
* Internal non-public definitions.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2008
*
* Author: Ingo Molnar <mingo@elte.hu>
- * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
*/
#include <linux/cache.h>
@@ -36,7 +23,6 @@
/* Communicate arguments to a workqueue handler. */
struct rcu_exp_work {
- smp_call_func_t rew_func;
unsigned long rew_s;
struct work_struct rew_work;
};
@@ -194,10 +180,7 @@ struct rcu_data {
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
#ifdef CONFIG_RCU_FAST_NO_HZ
- bool all_lazy; /* Are all CPU's CBs lazy? */
- unsigned long nonlazy_posted; /* # times non-lazy CB posted to CPU. */
- unsigned long nonlazy_posted_snap;
- /* Nonlazy_posted snapshot. */
+ bool all_lazy; /* All CPU's CBs lazy at idle start? */
unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */
unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */
int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
@@ -234,7 +217,13 @@ struct rcu_data {
/* Leader CPU takes GP-end wakeups. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
- /* 6) Diagnostic data, including RCU CPU stall warnings. */
+ /* 6) RCU priority boosting. */
+ struct task_struct *rcu_cpu_kthread_task;
+ /* rcuc per-CPU kthread or NULL. */
+ unsigned int rcu_cpu_kthread_status;
+ char rcu_cpu_has_work;
+
+ /* 7) Diagnostic data, including RCU CPU stall warnings. */
unsigned int softirq_snap; /* Snapshot of softirq activity. */
/* ->rcu_iw* fields protected by leaf rcu_node ->lock. */
struct irq_work rcu_iw; /* Check for non-irq activity. */
@@ -303,6 +292,8 @@ struct rcu_state {
struct swait_queue_head gp_wq; /* Where GP task waits. */
short gp_flags; /* Commands for GP task. */
short gp_state; /* GP kthread sleep state. */
+ unsigned long gp_wake_time; /* Last GP kthread wake. */
+ unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */
/* End of fields guarded by root rcu_node's lock. */
@@ -402,13 +393,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
int rcu_dynticks_snap(struct rcu_data *rdp);
-#ifdef CONFIG_RCU_BOOST
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DECLARE_PER_CPU(char, rcu_cpu_has_work);
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
static void rcu_qs(void);
@@ -420,7 +404,7 @@ static void rcu_print_detail_task_stall(void);
static int rcu_print_task_stall(struct rcu_node *rnp);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
-static void rcu_flavor_check_callbacks(int user);
+static void rcu_flavor_sched_clock_irq(int user);
void call_rcu(struct rcu_head *head, rcu_callback_t func);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
@@ -431,7 +415,6 @@ static void __init rcu_spawn_boost_kthreads(void);
static void rcu_prepare_kthreads(int cpu);
static void rcu_cleanup_after_idle(void);
static void rcu_prepare_for_idle(void);
-static void rcu_idle_count_callbacks_posted(void);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
static void rcu_preempt_deferred_qs(struct task_struct *t);
@@ -451,7 +434,7 @@ static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
-static void rcu_spawn_all_nocb_kthreads(int cpu);
+static void rcu_spawn_cpu_nocb_kthread(int cpu);
static void __init rcu_spawn_nocb_kthreads(void);
#ifdef CONFIG_RCU_NOCB_CPU
static void __init rcu_organize_nocb_kthreads(void);
@@ -462,11 +445,3 @@ static void rcu_bind_gp_kthread(void);
static bool rcu_nohz_full_cpu(void);
static void rcu_dynticks_task_enter(void);
static void rcu_dynticks_task_exit(void);
-
-#ifdef CONFIG_SRCU
-void srcu_online_cpu(unsigned int cpu);
-void srcu_offline_cpu(unsigned int cpu);
-#else /* #ifdef CONFIG_SRCU */
-void srcu_online_cpu(unsigned int cpu) { }
-void srcu_offline_cpu(unsigned int cpu) { }
-#endif /* #else #ifdef CONFIG_SRCU */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 928fe5893a57..4c2a0189e748 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -1,27 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* RCU expedited grace periods
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2016
*
- * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#include <linux/lockdep.h>
+static void rcu_exp_handler(void *unused);
+
/*
* Record the start of an expedited grace period.
*/
@@ -344,7 +333,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
{
int cpu;
unsigned long flags;
- smp_call_func_t func;
unsigned long mask_ofl_test;
unsigned long mask_ofl_ipi;
int ret;
@@ -352,7 +340,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
container_of(wp, struct rcu_exp_work, rew_work);
struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
- func = rewp->rew_func;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
/* Each pass checks a CPU for identity, offline, and idle. */
@@ -396,7 +383,7 @@ retry_ipi:
mask_ofl_test |= mask;
continue;
}
- ret = smp_call_function_single(cpu, func, NULL, 0);
+ ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
if (!ret) {
mask_ofl_ipi &= ~mask;
continue;
@@ -426,7 +413,7 @@ retry_ipi:
* Select the nodes that the upcoming expedited grace period needs
* to wait for.
*/
-static void sync_rcu_exp_select_cpus(smp_call_func_t func)
+static void sync_rcu_exp_select_cpus(void)
{
int cpu;
struct rcu_node *rnp;
@@ -440,7 +427,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func)
rnp->exp_need_flush = false;
if (!READ_ONCE(rnp->expmask))
continue; /* Avoid early boot non-existent wq. */
- rnp->rew.rew_func = func;
if (!READ_ONCE(rcu_par_gp_wq) ||
rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
rcu_is_last_leaf_node(rnp)) {
@@ -449,7 +435,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func)
continue;
}
INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
- preempt_disable();
cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
/* If all offline, queue the work on an unbound CPU. */
if (unlikely(cpu > rnp->grphi - rnp->grplo))
@@ -457,7 +442,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func)
else
cpu += rnp->grplo;
queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
- preempt_enable();
rnp->exp_need_flush = true;
}
@@ -580,10 +564,10 @@ static void rcu_exp_wait_wake(unsigned long s)
* Common code to drive an expedited grace period forward, used by
* workqueues and mid-boot-time tasks.
*/
-static void rcu_exp_sel_wait_wake(smp_call_func_t func, unsigned long s)
+static void rcu_exp_sel_wait_wake(unsigned long s)
{
/* Initialize the rcu_node tree in preparation for the wait. */
- sync_rcu_exp_select_cpus(func);
+ sync_rcu_exp_select_cpus();
/* Wait and clean up, including waking everyone. */
rcu_exp_wait_wake(s);
@@ -597,52 +581,7 @@ static void wait_rcu_exp_gp(struct work_struct *wp)
struct rcu_exp_work *rewp;
rewp = container_of(wp, struct rcu_exp_work, rew_work);
- rcu_exp_sel_wait_wake(rewp->rew_func, rewp->rew_s);
-}
-
-/*
- * Given a smp_call_function() handler, kick off the specified
- * implementation of expedited grace period.
- */
-static void _synchronize_rcu_expedited(smp_call_func_t func)
-{
- struct rcu_data *rdp;
- struct rcu_exp_work rew;
- struct rcu_node *rnp;
- unsigned long s;
-
- /* If expedited grace periods are prohibited, fall back to normal. */
- if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu);
- return;
- }
-
- /* Take a snapshot of the sequence number. */
- s = rcu_exp_gp_seq_snap();
- if (exp_funnel_lock(s))
- return; /* Someone else did our work for us. */
-
- /* Ensure that load happens before action based on it. */
- if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
- /* Direct call during scheduler init and early_initcalls(). */
- rcu_exp_sel_wait_wake(func, s);
- } else {
- /* Marshall arguments & schedule the expedited grace period. */
- rew.rew_func = func;
- rew.rew_s = s;
- INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
- queue_work(rcu_gp_wq, &rew.rew_work);
- }
-
- /* Wait for expedited grace period to complete. */
- rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
- rnp = rcu_get_root();
- wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
- sync_exp_work_done(s));
- smp_mb(); /* Workqueue actions happen before return. */
-
- /* Let the next expedited grace period start. */
- mutex_unlock(&rcu_state.exp_mutex);
+ rcu_exp_sel_wait_wake(rewp->rew_s);
}
#ifdef CONFIG_PREEMPT_RCU
@@ -654,7 +593,7 @@ static void _synchronize_rcu_expedited(smp_call_func_t func)
* ->expmask fields in the rcu_node tree. Otherwise, immediately
* report the quiescent state.
*/
-static void sync_rcu_exp_handler(void *unused)
+static void rcu_exp_handler(void *unused)
{
unsigned long flags;
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@@ -697,6 +636,7 @@ static void sync_rcu_exp_handler(void *unused)
WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true);
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
}
/*
@@ -730,43 +670,10 @@ static void sync_sched_exp_online_cleanup(int cpu)
{
}
-/**
- * synchronize_rcu_expedited - Brute-force RCU grace period
- *
- * Wait for an RCU-preempt grace period, but expedite it. The basic
- * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler
- * checks whether the CPU is in an RCU-preempt critical section, and
- * if so, it sets a flag that causes the outermost rcu_read_unlock()
- * to report the quiescent state. On the other hand, if the CPU is
- * not in an RCU read-side critical section, the IPI handler reports
- * the quiescent state immediately.
- *
- * Although this is a greate improvement over previous expedited
- * implementations, it is still unfriendly to real-time workloads, so is
- * thus not recommended for any sort of common-case code. In fact, if
- * you are using synchronize_rcu_expedited() in a loop, please restructure
- * your code to batch your updates, and then Use a single synchronize_rcu()
- * instead.
- *
- * This has the same semantics as (but is more brutal than) synchronize_rcu().
- */
-void synchronize_rcu_expedited(void)
-{
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
- lock_is_held(&rcu_lock_map) ||
- lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu_expedited() in RCU read-side critical section");
-
- if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
- return;
- _synchronize_rcu_expedited(sync_rcu_exp_handler);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
#else /* #ifdef CONFIG_PREEMPT_RCU */
/* Invoked on each online non-idle CPU for expedited quiescent state. */
-static void sync_sched_exp_handler(void *unused)
+static void rcu_exp_handler(void *unused)
{
struct rcu_data *rdp;
struct rcu_node *rnp;
@@ -798,44 +705,78 @@ static void sync_sched_exp_online_cleanup(int cpu)
rnp = rdp->mynode;
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
return;
- ret = smp_call_function_single(cpu, sync_sched_exp_handler, NULL, 0);
+ ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
WARN_ON_ONCE(ret);
}
-/*
- * Because a context switch is a grace period for !PREEMPT, any
- * blocking grace-period wait automatically implies a grace period if
- * there is only one CPU online at any point time during execution of
- * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
- * occasionally incorrectly indicate that there are multiple CPUs online
- * when there was in fact only one the whole time, as this just adds some
- * overhead: RCU still operates correctly.
- */
-static int rcu_blocking_is_gp(void)
-{
- int ret;
-
- might_sleep(); /* Check for RCU read-side critical section. */
- preempt_disable();
- ret = num_online_cpus() <= 1;
- preempt_enable();
- return ret;
-}
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
-/* PREEMPT=n implementation of synchronize_rcu_expedited(). */
+/**
+ * synchronize_rcu_expedited - Brute-force RCU grace period
+ *
+ * Wait for an RCU grace period, but expedite it. The basic idea is to
+ * IPI all non-idle non-nohz online CPUs. The IPI handler checks whether
+ * the CPU is in an RCU critical section, and if so, it sets a flag that
+ * causes the outermost rcu_read_unlock() to report the quiescent state
+ * for RCU-preempt or asks the scheduler for help for RCU-sched. On the
+ * other hand, if the CPU is not in an RCU read-side critical section,
+ * the IPI handler reports the quiescent state immediately.
+ *
+ * Although this is a great improvement over previous expedited
+ * implementations, it is still unfriendly to real-time workloads, so is
+ * thus not recommended for any sort of common-case code. In fact, if
+ * you are using synchronize_rcu_expedited() in a loop, please restructure
+ * your code to batch your updates, and then use a single synchronize_rcu()
+ * instead.
+ *
+ * This has the same semantics as (but is more brutal than) synchronize_rcu().
+ */
void synchronize_rcu_expedited(void)
{
+ struct rcu_data *rdp;
+ struct rcu_exp_work rew;
+ struct rcu_node *rnp;
+ unsigned long s;
+
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu_expedited() in RCU read-side critical section");
- /* If only one CPU, this is automatically a grace period. */
+	/* Is the state such that the call is a grace period? */
if (rcu_blocking_is_gp())
return;
- _synchronize_rcu_expedited(sync_sched_exp_handler);
+ /* If expedited grace periods are prohibited, fall back to normal. */
+ if (rcu_gp_is_normal()) {
+ wait_rcu_gp(call_rcu);
+ return;
+ }
+
+ /* Take a snapshot of the sequence number. */
+ s = rcu_exp_gp_seq_snap();
+ if (exp_funnel_lock(s))
+ return; /* Someone else did our work for us. */
+
+ /* Ensure that load happens before action based on it. */
+ if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
+ /* Direct call during scheduler init and early_initcalls(). */
+ rcu_exp_sel_wait_wake(s);
+ } else {
+ /* Marshall arguments & schedule the expedited grace period. */
+ rew.rew_s = s;
+ INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
+ queue_work(rcu_gp_wq, &rew.rew_work);
+ }
+
+ /* Wait for expedited grace period to complete. */
+ rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
+ rnp = rcu_get_root();
+ wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
+ sync_exp_work_done(s));
+ smp_mb(); /* Workqueue actions happen before return. */
+
+ /* Let the next expedited grace period start. */
+ mutex_unlock(&rcu_state.exp_mutex);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
-#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
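
The consolidated synchronize_rcu_expedited() above marshals its argument into an on-stack rcu_exp_work item, queues it, and then sleeps until the expedited grace period completes (waiting on rnp->exp_wq). A generic sketch of that on-stack-work handoff, shown with a completion and hypothetical names:

/*
 * Sketch only (hypothetical names): hand work to a workqueue via an
 * on-stack item and sleep until it finishes.
 */
struct example_req {
	unsigned long arg;
	struct completion done;
	struct work_struct work;
};

static void example_worker(struct work_struct *wp)
{
	struct example_req *req = container_of(wp, struct example_req, work);

	/* ... perform the work described by req->arg ... */
	complete(&req->done);
}

static void example_run(unsigned long arg)
{
	struct example_req req = { .arg = arg };

	init_completion(&req.done);
	INIT_WORK_ONSTACK(&req.work, example_worker);
	queue_work(system_wq, &req.work);
	wait_for_completion(&req.done);	/* Caller sleeps until completion. */
	destroy_work_on_stack(&req.work);
}
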
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 1b3dd2fc0cd6..97dba50f6fb2 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1,27 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Read-Copy Update mechanism for mutual exclusion (tree-based version)
* Internal non-public definitions that provide either classic
* or preemptible semantics.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright Red Hat, 2009
* Copyright IBM Corporation, 2009
*
* Author: Ingo Molnar <mingo@elte.hu>
- * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
*/
#include <linux/delay.h>
@@ -34,17 +21,7 @@
#include "../time/tick-internal.h"
#ifdef CONFIG_RCU_BOOST
-
#include "../locking/rtmutex_common.h"
-
-/*
- * Control variables for per-CPU and per-rcu_node kthreads.
- */
-static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DEFINE_PER_CPU(char, rcu_cpu_has_work);
-
#else /* #ifdef CONFIG_RCU_BOOST */
/*
@@ -307,7 +284,7 @@ static void rcu_qs(void)
__this_cpu_read(rcu_data.gp_seq),
TPS("cpuqs"));
__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
- barrier(); /* Coordinate with rcu_flavor_check_callbacks(). */
+ barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */
current->rcu_read_unlock_special.b.need_qs = false;
}
}
@@ -788,13 +765,13 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
}
/*
- * Check for a quiescent state from the current CPU. When a task blocks,
- * the task is recorded in the corresponding CPU's rcu_node structure,
- * which is checked elsewhere.
- *
- * Caller must disable hard irqs.
+ * Check for a quiescent state from the current CPU, including voluntary
+ * context switches for Tasks RCU. When a task blocks, the task is
+ * recorded in the corresponding CPU's rcu_node structure, which is checked
+ * elsewhere, hence this function need only check for quiescent states
+ * related to the current CPU, not to those related to tasks.
*/
-static void rcu_flavor_check_callbacks(int user)
+static void rcu_flavor_sched_clock_irq(int user)
{
struct task_struct *t = current;
@@ -825,54 +802,6 @@ static void rcu_flavor_check_callbacks(int user)
t->rcu_read_unlock_special.b.need_qs = true;
}
-/**
- * synchronize_rcu - wait until a grace period has elapsed.
- *
- * Control will return to the caller some time after a full grace
- * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed. Note, however, that
- * upon return from synchronize_rcu(), the caller might well be executing
- * concurrently with new RCU read-side critical sections that began while
- * synchronize_rcu() was waiting. RCU read-side critical sections are
- * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
- * In addition, regions of code across which interrupts, preemption, or
- * softirqs have been disabled also serve as RCU read-side critical
- * sections. This includes hardware interrupt handlers, softirq handlers,
- * and NMI handlers.
- *
- * Note that this guarantee implies further memory-ordering guarantees.
- * On systems with more than one CPU, when synchronize_rcu() returns,
- * each CPU is guaranteed to have executed a full memory barrier since
- * the end of its last RCU read-side critical section whose beginning
- * preceded the call to synchronize_rcu(). In addition, each CPU having
- * an RCU read-side critical section that extends beyond the return from
- * synchronize_rcu() is guaranteed to have executed a full memory barrier
- * after the beginning of synchronize_rcu() and before the beginning of
- * that RCU read-side critical section. Note that these guarantees include
- * CPUs that are offline, idle, or executing in user mode, as well as CPUs
- * that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked synchronize_rcu(), which returned
- * to its caller on CPU B, then both CPU A and CPU B are guaranteed
- * to have executed a full memory barrier during the execution of
- * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
- * again only if the system has more than one CPU).
- */
-void synchronize_rcu(void)
-{
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
- lock_is_held(&rcu_lock_map) ||
- lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu() in RCU read-side critical section");
- if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
- return;
- if (rcu_gp_is_expedited())
- synchronize_rcu_expedited();
- else
- wait_rcu_gp(call_rcu);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu);
-
/*
* Check for a task exiting while in a preemptible-RCU read-side
* critical section, clean up if so. No need to issue warnings,
@@ -1088,14 +1017,10 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
}
/*
- * Check to see if this CPU is in a non-context-switch quiescent state
- * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
- * Also schedule RCU core processing.
- *
- * This function must be called from hardirq context. It is normally
- * invoked from the scheduling-clock interrupt.
+ * Check to see if this CPU is in a non-context-switch quiescent state,
+ * namely user mode and the idle loop.
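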
*/
-static void rcu_flavor_check_callbacks(int user)
+static void rcu_flavor_sched_clock_irq(int user)
{
if (user || rcu_is_cpu_rrupt_from_idle()) {
@@ -1115,22 +1040,6 @@ static void rcu_flavor_check_callbacks(int user)
}
}
-/* PREEMPT=n implementation of synchronize_rcu(). */
-void synchronize_rcu(void)
-{
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
- lock_is_held(&rcu_lock_map) ||
- lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu() in RCU read-side critical section");
- if (rcu_blocking_is_gp())
- return;
- if (rcu_gp_is_expedited())
- synchronize_rcu_expedited();
- else
- wait_rcu_gp(call_rcu);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu);
-
/*
* Because preemptible RCU does not exist, tasks cannot possibly exit
* while in preemptible RCU read-side critical sections.
@@ -1307,11 +1216,11 @@ static void invoke_rcu_callbacks_kthread(void)
unsigned long flags;
local_irq_save(flags);
- __this_cpu_write(rcu_cpu_has_work, 1);
- if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
- current != __this_cpu_read(rcu_cpu_kthread_task)) {
- rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
- __this_cpu_read(rcu_cpu_kthread_status));
+ __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
+ if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL &&
+ current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) {
+ rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task),
+ __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
}
local_irq_restore(flags);
}
@@ -1322,7 +1231,7 @@ static void invoke_rcu_callbacks_kthread(void)
*/
static bool rcu_is_callbacks_kthread(void)
{
- return __this_cpu_read(rcu_cpu_kthread_task) == current;
+ return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current;
}
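Several hunks in this file fold what used to be standalone per-CPU variables (rcu_cpu_has_work, rcu_cpu_kthread_task, and so on) into members of the per-CPU rcu_data structure; the per-CPU accessors accept a 'variable.member' expression directly, so only the first argument of each call site changes. A small sketch of that idiom with a hypothetical per-CPU structure (my_stats and my_pcpu are made-up names, not from this patch):

#include <linux/percpu.h>

struct my_stats {
    unsigned int has_work;
    unsigned long loops;
};

static DEFINE_PER_CPU(struct my_stats, my_pcpu);

static void mark_work_on_this_cpu(void)
{
    /* Member access works just like a plain per-CPU variable. */
    __this_cpu_write(my_pcpu.has_work, 1);
    __this_cpu_inc(my_pcpu.loops);
}

static unsigned int peek_work(int cpu)
{
    return per_cpu(my_pcpu.has_work, cpu);
}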
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
@@ -1369,11 +1278,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
return 0;
}
-static void rcu_kthread_do_work(void)
-{
- rcu_do_batch(this_cpu_ptr(&rcu_data));
-}
-
static void rcu_cpu_kthread_setup(unsigned int cpu)
{
struct sched_param sp;
@@ -1384,12 +1288,12 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)
static void rcu_cpu_kthread_park(unsigned int cpu)
{
- per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
+ per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
}
static int rcu_cpu_kthread_should_run(unsigned int cpu)
{
- return __this_cpu_read(rcu_cpu_has_work);
+ return __this_cpu_read(rcu_data.rcu_cpu_has_work);
}
/*
@@ -1399,21 +1303,20 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu)
*/
static void rcu_cpu_kthread(unsigned int cpu)
{
- unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
- char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
+ unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
+ char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
int spincnt;
for (spincnt = 0; spincnt < 10; spincnt++) {
trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
local_bh_disable();
*statusp = RCU_KTHREAD_RUNNING;
- this_cpu_inc(rcu_cpu_kthread_loops);
local_irq_disable();
work = *workp;
*workp = 0;
local_irq_enable();
if (work)
- rcu_kthread_do_work();
+ rcu_do_batch(this_cpu_ptr(&rcu_data));
local_bh_enable();
if (*workp == 0) {
trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
@@ -1459,7 +1362,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
}
static struct smp_hotplug_thread rcu_cpu_thread_spec = {
- .store = &rcu_cpu_kthread_task,
+ .store = &rcu_data.rcu_cpu_kthread_task,
.thread_should_run = rcu_cpu_kthread_should_run,
.thread_fn = rcu_cpu_kthread,
.thread_comm = "rcuc/%u",
@@ -1476,7 +1379,7 @@ static void __init rcu_spawn_boost_kthreads(void)
int cpu;
for_each_possible_cpu(cpu)
- per_cpu(rcu_cpu_has_work, cpu) = 0;
+ per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__))
return;
rcu_for_each_leaf_node(rnp)
@@ -1543,7 +1446,7 @@ static void rcu_prepare_kthreads(int cpu)
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
*nextevt = KTIME_MAX;
- return rcu_cpu_has_callbacks(NULL);
+ return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist);
}
/*
@@ -1562,14 +1465,6 @@ static void rcu_prepare_for_idle(void)
{
}
-/*
- * Don't bother keeping a running count of the number of RCU callbacks
- * posted because CONFIG_RCU_FAST_NO_HZ=n.
- */
-static void rcu_idle_count_callbacks_posted(void)
-{
-}
-
#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
/*
@@ -1652,11 +1547,8 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
lockdep_assert_irqs_disabled();
- /* Snapshot to detect later posting of non-lazy callback. */
- rdp->nonlazy_posted_snap = rdp->nonlazy_posted;
-
/* If no callbacks, RCU doesn't need the CPU. */
- if (!rcu_cpu_has_callbacks(&rdp->all_lazy)) {
+ if (rcu_segcblist_empty(&rdp->cblist)) {
*nextevt = KTIME_MAX;
return 0;
}
@@ -1670,11 +1562,12 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
rdp->last_accelerate = jiffies;
/* Request timer delay depending on laziness, and round. */
- if (!rdp->all_lazy) {
+ rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist);
+ if (rdp->all_lazy) {
+ dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
+ } else {
dj = round_up(rcu_idle_gp_delay + jiffies,
rcu_idle_gp_delay) - jiffies;
- } else {
- dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
}
*nextevt = basemono + dj * TICK_NSEC;
return 0;
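The two branches above pick very different wakeup horizons: the non-lazy case rounds the absolute jiffies value up to a multiple of the (power-of-two) grace-period delay so nearby CPUs tend to wake together, while the all-lazy case rounds to a whole second via round_jiffies() to batch timers. A worked sketch of the round_up() arithmetic with made-up numbers; the macro is reproduced here only in its power-of-two form:

#include <stdio.h>

/* Power-of-two round_up(), as the kernel macro reduces to for 2^n 'y'. */
#define round_up(x, y)  ((((x) - 1) | ((y) - 1)) + 1)

int main(void)
{
    unsigned long jiffies = 100003;     /* illustrative snapshot */
    unsigned long gp_delay = 4;         /* illustrative power-of-two delay */
    unsigned long dj;

    dj = round_up(gp_delay + jiffies, gp_delay) - jiffies;
    /* 100007 rounds up to 100008, so the timer fires 5 ticks out. */
    printf("dj = %lu\n", dj);
    return 0;
}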
@@ -1704,7 +1597,7 @@ static void rcu_prepare_for_idle(void)
/* Handle nohz enablement switches conservatively. */
tne = READ_ONCE(tick_nohz_active);
if (tne != rdp->tick_nohz_enabled_snap) {
- if (rcu_cpu_has_callbacks(NULL))
+ if (!rcu_segcblist_empty(&rdp->cblist))
invoke_rcu_core(); /* force nohz to see update. */
rdp->tick_nohz_enabled_snap = tne;
return;
@@ -1717,10 +1610,8 @@ static void rcu_prepare_for_idle(void)
* callbacks, invoke RCU core for the side-effect of recalculating
* idle duration on re-entry to idle.
*/
- if (rdp->all_lazy &&
- rdp->nonlazy_posted != rdp->nonlazy_posted_snap) {
+ if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) {
rdp->all_lazy = false;
- rdp->nonlazy_posted_snap = rdp->nonlazy_posted;
invoke_rcu_core();
return;
}
@@ -1756,19 +1647,6 @@ static void rcu_cleanup_after_idle(void)
invoke_rcu_core();
}
-/*
- * Keep a running count of the number of non-lazy callbacks posted
- * on this CPU. This running counter (which is never decremented) allows
- * rcu_prepare_for_idle() to detect when something out of the idle loop
- * posts a callback, even if an equal number of callbacks are invoked.
- * Of course, callbacks should only be posted from within a trace event
- * designed to be called from idle or from within RCU_NONIDLE().
- */
-static void rcu_idle_count_callbacks_posted(void)
-{
- __this_cpu_add(rcu_data.nonlazy_posted, 1);
-}
-
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
#ifdef CONFIG_RCU_FAST_NO_HZ
@@ -1776,13 +1654,12 @@ static void rcu_idle_count_callbacks_posted(void)
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
- unsigned long nlpd = rdp->nonlazy_posted - rdp->nonlazy_posted_snap;
- sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
+ sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",
rdp->last_accelerate & 0xffff, jiffies & 0xffff,
- ulong2long(nlpd),
- rdp->all_lazy ? 'L' : '.',
- rdp->tick_nohz_enabled_snap ? '.' : 'D');
+ ".l"[rdp->all_lazy],
+ ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)],
+ ".D"[!rdp->tick_nohz_enabled_snap]);
}
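The new format string above relies on indexing a string literal: ".L"[cond] evaluates to '.' when cond is 0 and to 'L' when it is 1, a compact way to turn a boolean into a flag character. A standalone userspace illustration of the idiom, for clarity only:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
    bool all_lazy = true;
    bool nohz_enabled = false;

    /* "xY"[b] picks 'x' for false and 'Y' for true. */
    printf("flags: %c%c\n", ".l"[all_lazy], ".D"[!nohz_enabled]);
    return 0;
}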
#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -1868,22 +1745,24 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp)
/*
* Offload callback processing from the boot-time-specified set of CPUs
- * specified by rcu_nocb_mask. For each CPU in the set, there is a
- * kthread created that pulls the callbacks from the corresponding CPU,
- * waits for a grace period to elapse, and invokes the callbacks.
- * The no-CBs CPUs do a wake_up() on their kthread when they insert
- * a callback into any empty list, unless the rcu_nocb_poll boot parameter
- * has been specified, in which case each kthread actively polls its
- * CPU. (Which isn't so great for energy efficiency, but which does
- * reduce RCU's overhead on that CPU.)
+ * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
+ * created that pull the callbacks from the corresponding CPU, wait for
+ * a grace period to elapse, and invoke the callbacks. These kthreads
+ * are organized into leaders, which manage incoming callbacks, wait for
+ * grace periods, and awaken followers, and the followers, which only
+ * invoke callbacks. Each leader is its own follower. The no-CBs CPUs
+ * do a wake_up() on their kthread when they insert a callback into any
+ * empty list, unless the rcu_nocb_poll boot parameter has been specified,
+ * in which case each kthread actively polls its CPU. (Which isn't so great
+ * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
*
* This is intended to be used in conjunction with Frederic Weisbecker's
* adaptive-idle work, which would seriously reduce OS jitter on CPUs
* running CPU-bound user-mode computations.
*
- * Offloading of callback processing could also in theory be used as
- * an energy-efficiency measure because CPUs with no RCU callbacks
- * queued are more aggressive about entering dyntick-idle mode.
+ * Offloading of callbacks can also be used as an energy-efficiency
+ * measure because CPUs with no RCU callbacks queued are more aggressive
+ * about entering dyntick-idle mode.
*/
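A much-simplified model of the enqueue-side policy described above: a no-CBs CPU only needs to wake its offload kthread when it appends a callback to an empty list and polling is disabled, because with rcu_nocb_poll the kthread finds new work on its own. Everything below is an illustrative userspace model, not the kernel's data structures:

#include <stdbool.h>
#include <stddef.h>

struct cb {
    struct cb *next;
};

struct nocb_queue {
    struct cb *head;
    struct cb **tail;
    bool poll;          /* models the rcu_nocb_poll boot parameter */
};

static void nocb_init(struct nocb_queue *q, bool poll)
{
    q->head = NULL;
    q->tail = &q->head;
    q->poll = poll;
}

/*
 * Append a callback and report whether the offload kthread needs an
 * explicit wakeup: only when the list was empty and it is not polling.
 */
static bool nocb_enqueue(struct nocb_queue *q, struct cb *cb)
{
    bool was_empty = (q->head == NULL);

    cb->next = NULL;
    *q->tail = cb;
    q->tail = &cb->next;
    return was_empty && !q->poll;
}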
@@ -1987,10 +1866,7 @@ static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
}
-/*
- * Does the specified CPU need an RCU callback for this invocation
- * of rcu_barrier()?
- */
+/* Does rcu_barrier need to queue an RCU callback on the specified CPU? */
static bool rcu_nocb_cpu_needs_barrier(int cpu)
{
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
@@ -2006,8 +1882,8 @@ static bool rcu_nocb_cpu_needs_barrier(int cpu)
* callbacks would be posted. In the worst case, the first
* barrier in rcu_barrier() suffices (but the caller cannot
* necessarily rely on this, not a substitute for the caller
- * getting the concurrency design right!). There must also be
- * a barrier between the following load an posting of a callback
+ * getting the concurrency design right!). There must also be a
+ * barrier between the following load and posting of a callback
* (if a callback is in fact needed). This is associated with an
* atomic_inc() in the caller.
*/
@@ -2517,9 +2393,9 @@ static void rcu_spawn_one_nocb_kthread(int cpu)
/*
* If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthreads, spawn them.
+ * rcuo kthread, spawn it.
*/
-static void rcu_spawn_all_nocb_kthreads(int cpu)
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
{
if (rcu_scheduler_fully_active)
rcu_spawn_one_nocb_kthread(cpu);
@@ -2536,7 +2412,7 @@ static void __init rcu_spawn_nocb_kthreads(void)
int cpu;
for_each_online_cpu(cpu)
- rcu_spawn_all_nocb_kthreads(cpu);
+ rcu_spawn_cpu_nocb_kthread(cpu);
}
/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
@@ -2670,7 +2546,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
}
-static void rcu_spawn_all_nocb_kthreads(int cpu)
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
{
}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 1971869c4072..cbaa976c5945 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -1,26 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update mechanism for mutual exclusion
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2001
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
*
- * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
* Papers:
* http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
@@ -52,6 +39,7 @@
#include <linux/tick.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/isolation.h>
+#include <linux/kprobes.h>
#define CREATE_TRACE_POINTS
@@ -249,6 +237,7 @@ int notrace debug_lockdep_rcu_enabled(void)
current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
+NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled);
/**
* rcu_read_lock_held() - might we be in RCU read-side critical section?
diff --git a/kernel/relay.c b/kernel/relay.c
index 9e0f52375487..ade14fb7ce2e 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1177,7 +1177,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
}
static const struct pipe_buf_operations relay_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = relay_pipe_buf_release,
.steal = generic_pipe_buf_steal,
diff --git a/kernel/resource.c b/kernel/resource.c
index 915c02e8e5dd..92190f62ebc5 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -382,7 +382,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
int (*func)(struct resource *, void *))
{
struct resource res;
- int ret = -1;
+ int ret = -EINVAL;
while (start < end &&
!find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
@@ -448,12 +448,13 @@ int walk_mem_res(u64 start, u64 end, void *arg,
arg, func);
}
-#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
-
/*
* This function calls the @func callback against all memory ranges of type
* System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
* It is to be used only for System RAM.
+ *
+ * This will find System RAM ranges that are children of top-level resources
+ * in addition to top-level System RAM resources.
*/
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg, int (*func)(unsigned long, unsigned long, void *))
@@ -462,14 +463,14 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
unsigned long flags;
struct resource res;
unsigned long pfn, end_pfn;
- int ret = -1;
+ int ret = -EINVAL;
start = (u64) start_pfn << PAGE_SHIFT;
end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
while (start < end &&
!find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
- true, &res)) {
+ false, &res)) {
pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
end_pfn = (res.end + 1) >> PAGE_SHIFT;
if (end_pfn > pfn)
@@ -481,8 +482,6 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
return ret;
}
-#endif
-
static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
{
return 1;
@@ -1132,6 +1131,15 @@ struct resource * __request_region(struct resource *parent,
conflict = __request_resource(parent, res);
if (!conflict)
break;
+ /*
+ * mm/hmm.c reserves physical addresses which then
+ * become unavailable to other users. Conflicts are
+ * not expected. Warn to aid debugging if encountered.
+ */
+ if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
+ pr_warn("Unaddressable device %s %pR conflicts with %pR",
+ conflict->name, conflict, res);
+ }
if (conflict != parent) {
if (!(conflict->flags & IORESOURCE_BUSY)) {
parent = conflict;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8d76a65cfdd..4778c48a7fda 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -107,11 +107,12 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* [L] ->on_rq
* RELEASE (rq->lock)
*
- * If we observe the old CPU in task_rq_lock, the acquire of
+ * If we observe the old CPU in task_rq_lock(), the acquire of
* the old rq->lock will fully serialize against the stores.
*
- * If we observe the new CPU in task_rq_lock, the acquire will
- * pair with the WMB to ensure we must then also see migrating.
+ * If we observe the new CPU in task_rq_lock(), the address
+ * dependency headed by '[L] rq = task_rq()' and the acquire
+ * will pair with the WMB to ensure we then also see migrating.
*/
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
@@ -180,6 +181,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
update_irq_load_avg(rq, irq_delta + steal);
#endif
+ update_rq_clock_pelt(rq, delta);
}
void update_rq_clock(struct rq *rq)
@@ -396,19 +398,7 @@ static bool set_nr_if_polling(struct task_struct *p)
#endif
#endif
-/**
- * wake_q_add() - queue a wakeup for 'later' waking.
- * @head: the wake_q_head to add @task to
- * @task: the task to queue for 'later' wakeup
- *
- * Queue a task for later wakeup, most likely by the wake_up_q() call in the
- * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
- * instantly.
- *
- * This function must be used as-if it were wake_up_process(); IOW the task
- * must be ready to be woken at this location.
- */
-void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
struct wake_q_node *node = &task->wake_q;
@@ -421,16 +411,56 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
* state, even in the failed case, an explicit smp_mb() must be used.
*/
smp_mb__before_atomic();
- if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
- return;
-
- get_task_struct(task);
+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
+ return false;
/*
* The head is context local, there can be no concurrency.
*/
*head->lastp = node;
head->lastp = &node->next;
+ return true;
+}
+
+/**
+ * wake_q_add() - queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ */
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ if (__wake_q_add(head, task))
+ get_task_struct(task);
+}
+
+/**
+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ *
+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
+ * that already hold a reference to @task can call the 'safe' version and trust
+ * wake_q to do the right thing depending on whether or not the @task is already
+ * queued for wakeup.
+ */
+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
+{
+ if (!__wake_q_add(head, task))
+ put_task_struct(task);
}
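The split above lets wake_q_add() take its own task reference while wake_q_add_safe() consumes one the caller already holds. Typical usage batches wakeups under a lock and issues them after dropping it; a minimal sketch in which the lock and the wakee are illustrative and only the wake_q calls are the real API:

#include <linux/sched.h>
#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>

static void demo_wake_waiter(spinlock_t *lock, struct task_struct *waiter)
{
    DEFINE_WAKE_Q(wake_q);

    spin_lock(lock);
    /* Queue the wakeup while still holding the lock... */
    wake_q_add(&wake_q, waiter);
    spin_unlock(lock);

    /* ...and perform it afterwards, outside the critical section. */
    wake_up_q(&wake_q);
}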
void wake_up_q(struct wake_q_head *head)
@@ -928,7 +958,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
{
lockdep_assert_held(&rq->lock);
- p->on_rq = TASK_ON_RQ_MIGRATING;
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
dequeue_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
rq_unlock(rq, rf);
@@ -2190,6 +2220,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+#ifdef CONFIG_COMPACTION
+ p->capture_control = NULL;
+#endif
init_numa_balancing(clone_flags, p);
}
@@ -2431,7 +2464,7 @@ void wake_up_new_task(struct task_struct *p)
#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
- post_init_entity_util_avg(&p->se);
+ post_init_entity_util_avg(p);
activate_task(rq, p, ENQUEUE_NOCLOCK);
p->on_rq = TASK_ON_RQ_QUEUED;
@@ -5265,9 +5298,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
}
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
- compat_pid_t, pid,
- struct old_timespec32 __user *, interval)
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
+ struct old_timespec32 __user *, interval)
{
struct timespec64 t;
int retval = sched_rr_get_interval(pid, &t);
@@ -5867,14 +5899,11 @@ void __init sched_init_smp(void)
/*
* There's no userspace yet to cause hotplug operations; hence all the
* CPU masks are stable and all blatant races in the below code cannot
- * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
- * but there won't be any contention on it.
+ * happen.
*/
- cpus_read_lock();
mutex_lock(&sched_domains_mutex);
sched_init_domains(cpu_active_mask);
mutex_unlock(&sched_domains_mutex);
- cpus_read_unlock();
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
@@ -6162,6 +6191,34 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL(___might_sleep);
+
+void __cant_sleep(const char *file, int line, int preempt_offset)
+{
+ static unsigned long prev_jiffy;
+
+ if (irqs_disabled())
+ return;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return;
+
+ if (preempt_count() > preempt_offset)
+ return;
+
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+EXPORT_SYMBOL_GPL(__cant_sleep);
#endif
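__cant_sleep() is the complement of ___might_sleep(): it warns when a section that is expected to run atomically is in fact preemptible, and it stays quiet with IRQs off, without CONFIG_PREEMPT_COUNT, or when ratelimited to once per second. A hedged sketch of how a caller might annotate such a section; the convenience-macro packaging mentioned in the comment is an assumption, not shown in this patch:

#include <linux/kernel.h>

static void demo_atomic_only_helper(void)
{
    /*
     * Complain (at most once per second) if this helper is entered
     * with preemption enabled; callers typically hide the arguments
     * behind a cant_sleep()-style wrapper macro (assumed).
     */
    __cant_sleep(__FILE__, __LINE__, 0);

    /* ... fast path that relies on staying on this CPU ... */
}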
#ifdef CONFIG_MAGIC_SYSRQ
@@ -6941,7 +6998,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf,
{
char tok[21]; /* U64_MAX */
- if (!sscanf(buf, "%s %llu", tok, periodp))
+ if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
return -EINVAL;
*periodp *= NSEC_PER_USEC;
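The fix above matters because tok[] holds at most 21 bytes: an unbounded %s lets a longer first token overflow the buffer, while %20s caps the copy at 20 characters plus the NUL, and checking for a return value below 1 still rejects input with no token at all. A userspace illustration of the same pattern; the buffer size mirrors the one above and the input strings are made up:

#include <stdio.h>

int main(void)
{
    char tok[21];               /* fits "max" or a u64 in decimal, plus NUL */
    unsigned long long period;

    /* Two fields present: both are filled in. */
    if (sscanf("max 100000", "%20s %llu", tok, &period) < 1)
        return 1;
    printf("tok=%s period=%llu\n", tok, period);

    /* Second field missing: sscanf() returns 1, tok is still valid. */
    if (sscanf("50000", "%20s %llu", tok, &period) < 1)
        return 1;
    printf("tok=%s\n", tok);
    return 0;
}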
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 22bd8980f32f..835671f0f917 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -48,8 +48,8 @@ EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
*
* Clear the update_util_data pointer for the given CPU.
*
- * Callers must use RCU-sched callbacks to free any memory that might be
- * accessed via the old update_util_data pointer or invoke synchronize_sched()
+ * Callers must use RCU callbacks to free any memory that might be
+ * accessed via the old update_util_data pointer or invoke synchronize_rcu()
* right after this function to avoid use-after-free.
*/
void cpufreq_remove_update_util_hook(int cpu)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 033ec7c45f13..5c41ea367422 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -48,10 +48,10 @@ struct sugov_cpu {
bool iowait_boost_pending;
unsigned int iowait_boost;
- unsigned int iowait_boost_max;
u64 last_update;
unsigned long bw_dl;
+ unsigned long min;
unsigned long max;
/* The field below is for single-CPU policies only: */
@@ -303,8 +303,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
if (delta_ns <= TICK_NSEC)
return false;
- sg_cpu->iowait_boost = set_iowait_boost
- ? sg_cpu->sg_policy->policy->min : 0;
+ sg_cpu->iowait_boost = set_iowait_boost ? sg_cpu->min : 0;
sg_cpu->iowait_boost_pending = set_iowait_boost;
return true;
@@ -344,14 +343,13 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
/* Double the boost at each request */
if (sg_cpu->iowait_boost) {
- sg_cpu->iowait_boost <<= 1;
- if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ sg_cpu->iowait_boost =
+ min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
return;
}
/* First wakeup after IO: start with minimum boost */
- sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+ sg_cpu->iowait_boost = sg_cpu->min;
}
/**
@@ -373,47 +371,38 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* This mechanism is designed to boost high frequently IO waiting tasks, while
* being more conservative on tasks which does sporadic IO operations.
*/
-static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
- unsigned long *util, unsigned long *max)
+static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned long util, unsigned long max)
{
- unsigned int boost_util, boost_max;
+ unsigned long boost;
/* No boost currently required */
if (!sg_cpu->iowait_boost)
- return;
+ return util;
/* Reset boost if the CPU appears to have been idle enough */
if (sugov_iowait_reset(sg_cpu, time, false))
- return;
+ return util;
- /*
- * An IO waiting task has just woken up:
- * allow to further double the boost value
- */
- if (sg_cpu->iowait_boost_pending) {
- sg_cpu->iowait_boost_pending = false;
- } else {
+ if (!sg_cpu->iowait_boost_pending) {
/*
- * Otherwise: reduce the boost value and disable it when we
- * reach the minimum.
+ * No boost pending; reduce the boost value.
*/
sg_cpu->iowait_boost >>= 1;
- if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
+ if (sg_cpu->iowait_boost < sg_cpu->min) {
sg_cpu->iowait_boost = 0;
- return;
+ return util;
}
}
+ sg_cpu->iowait_boost_pending = false;
+
/*
- * Apply the current boost value: a CPU is boosted only if its current
- * utilization is smaller then the current IO boost level.
+ * @util is already in capacity scale; convert iowait_boost
+ * into the same scale so we can compare.
*/
- boost_util = sg_cpu->iowait_boost;
- boost_max = sg_cpu->iowait_boost_max;
- if (*util * boost_max < *max * boost_util) {
- *util = boost_util;
- *max = boost_max;
- }
+ boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
+ return max(boost, util);
}
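The rework above keeps iowait_boost in capacity units, between sg_cpu->min and SCHED_CAPACITY_SCALE, and rescales it by the CPU's max value before comparing it with util, replacing the old frequency-based cross-multiplication. A worked example with made-up boost and utilization numbers; SCHED_CAPACITY_SHIFT is 10 (so SCHED_CAPACITY_SCALE is 1024) in current kernels:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT    10
#define SCHED_CAPACITY_SCALE    (1UL << SCHED_CAPACITY_SHIFT)

static unsigned long max_ul(unsigned long a, unsigned long b)
{
    return a > b ? a : b;
}

int main(void)
{
    unsigned long util = 300;           /* current CPU utilization */
    unsigned long max = 800;            /* CPU's max value, as in sg_cpu->max */
    unsigned long iowait_boost = 512;   /* half of SCHED_CAPACITY_SCALE */
    unsigned long boost;

    /* Same rescaling as the function above: 512 * 800 >> 10 == 400. */
    boost = (iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
    printf("boost=%lu -> effective util=%lu\n", boost, max_ul(boost, util));
    return 0;
}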
#ifdef CONFIG_NO_HZ_COMMON
@@ -460,7 +449,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
util = sugov_get_util(sg_cpu);
max = sg_cpu->max;
- sugov_iowait_apply(sg_cpu, time, &util, &max);
+ util = sugov_iowait_apply(sg_cpu, time, util, max);
next_f = get_next_freq(sg_policy, util, max);
/*
* Do not reduce the frequency if the CPU has not been idle
@@ -500,7 +489,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
j_util = sugov_get_util(j_sg_cpu);
j_max = j_sg_cpu->max;
- sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max);
+ j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);
if (j_util * max > j_max * util) {
util = j_util;
@@ -837,7 +826,9 @@ static int sugov_start(struct cpufreq_policy *policy)
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
- sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+ sg_cpu->min =
+ (SCHED_CAPACITY_SCALE * policy->cpuinfo.min_freq) /
+ policy->cpuinfo.max_freq;
}
for_each_cpu(cpu, policy->cpus) {
@@ -859,7 +850,7 @@ static void sugov_stop(struct cpufreq_policy *policy)
for_each_cpu(cpu, policy->cpus)
cpufreq_remove_update_util_hook(cpu);
- synchronize_sched();
+ synchronize_rcu();
if (!policy->fast_switch_enabled) {
irq_work_sync(&sg_policy->irq_work);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fb8b7b5d745d..6a73e41a2016 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1767,7 +1767,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
deadline_queue_push_tasks(rq);
if (rq->curr->sched_class != &dl_sched_class)
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
return p;
}
@@ -1776,7 +1776,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
update_curr_dl(rq);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -1793,7 +1793,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
/*
* Even when we have runtime, update_curr_dl() might have resulted in us
* not being the leftmost task anymore. In that case NEED_RESCHED will
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index de3de997e245..8039d62ae36e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -315,6 +315,7 @@ void register_sched_domain_sysctl(void)
{
static struct ctl_table *cpu_entries;
static struct ctl_table **cpu_idx;
+ static bool init_done = false;
char buf[32];
int i;
@@ -344,7 +345,10 @@ void register_sched_domain_sysctl(void)
if (!cpumask_available(sd_sysctl_cpus)) {
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return;
+ }
+ if (!init_done) {
+ init_done = true;
/* init to possible to not have holes in @cpu_entries */
cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 310d0637fe4b..40bd1e27b1b7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -248,13 +248,6 @@ const struct sched_class fair_sched_class;
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/* cpu runqueue to which this cfs_rq is attached */
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->rq;
-}
-
static inline struct task_struct *task_of(struct sched_entity *se)
{
SCHED_WARN_ON(!entity_is_task(se));
@@ -282,79 +275,103 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
- if (!cfs_rq->on_list) {
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+
+ if (cfs_rq->on_list)
+ return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
+
+ cfs_rq->on_list = 1;
+
+ /*
+ * Ensure we either appear before our parent (if already
+ * enqueued) or force our parent to appear after us when it is
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the top of the tree
+ */
+ if (cfs_rq->tg->parent &&
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
/*
- * Ensure we either appear before our parent (if already
- * enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases and a special case for the root
- * cfs_rq. Furthermore, it also means that we will always reset
- * tmp_alone_branch either when the branch is connected
- * to a tree or when we reach the beg of the tree
+ * If the parent is already on the list, we add the child
+ * just before it. Thanks to the circular linked property
+ * of the list, this puts the child at the tail of the
+ * sublist that starts at the parent.
*/
- if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
- /*
- * If parent is already on the list, we add the child
- * just before. Thanks to circular linked property of
- * the list, this means to put the child at the tail
- * of the list that starts by parent.
- */
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
- /*
- * The branch is now connected to its tree so we can
- * reset tmp_alone_branch to the beginning of the
- * list.
- */
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- } else if (!cfs_rq->tg->parent) {
- /*
- * cfs rq without parent should be put
- * at the tail of the list.
- */
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq->leaf_cfs_rq_list);
- /*
- * We have reach the beg of a tree so we can reset
- * tmp_alone_branch to the beginning of the list.
- */
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- } else {
- /*
- * The parent has not already been added so we want to
- * make sure that it will be put after us.
- * tmp_alone_branch points to the beg of the branch
- * where we will add parent.
- */
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- rq->tmp_alone_branch);
- /*
- * update tmp_alone_branch to points to the new beg
- * of the branch
- */
- rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
- }
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ return true;
+ }
- cfs_rq->on_list = 1;
+ if (!cfs_rq->tg->parent) {
+ /*
+ * cfs rq without parent should be put
+ * at the tail of the list.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reached the top of the tree so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ return true;
}
+
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the beginning of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
+ /*
+ * Update tmp_alone_branch to point to the new beginning
+ * of the branch.
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
+ return false;
}
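The ordering argument above boils down to one property of a circular doubly-linked list: a tail-wise insert at an element places the new node immediately before that element, so a child enqueued at its already-listed parent's position is iterated before the parent, keeping the walk bottom-up. A standalone illustration with a minimal userspace reimplementation of list_head (not the kernel's RCU-aware variant):

#include <stddef.h>
#include <stdio.h>

struct list_head {
    struct list_head *next, *prev;
};

static void INIT_LIST_HEAD(struct list_head *h)
{
    h->next = h;
    h->prev = h;
}

/* Insert @new right before @head, i.e. at the tail of the list headed by @head. */
static void list_add_tail(struct list_head *new, struct list_head *head)
{
    struct list_head *prev = head->prev;

    new->next = head;
    new->prev = prev;
    prev->next = new;
    head->prev = new;
}

struct demo_cfs_rq {
    const char *name;
    struct list_head leaf;
};

int main(void)
{
    struct list_head leaf_list;
    struct demo_cfs_rq parent = { "parent" }, child = { "child" };
    struct list_head *pos;

    INIT_LIST_HEAD(&leaf_list);
    /* A root-level cfs_rq goes at the tail of the rq-wide list... */
    list_add_tail(&parent.leaf, &leaf_list);
    /* ...and a child whose parent is already listed goes just before it. */
    list_add_tail(&child.leaf, &parent.leaf);

    for (pos = leaf_list.next; pos != &leaf_list; pos = pos->next) {
        struct demo_cfs_rq *c = (struct demo_cfs_rq *)
            ((char *)pos - offsetof(struct demo_cfs_rq, leaf));
        printf("%s\n", c->name);    /* prints "child" then "parent" */
    }
    return 0;
}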
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (cfs_rq->on_list) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ /*
+ * With a cfs_rq being unthrottled/throttled during an enqueue,
+ * it can happen that tmp_alone_branch points to the leaf that
+ * we finally want to delete. In this case, tmp_alone_branch moves
+ * to the prev element, but it will point to rq->leaf_cfs_rq_list
+ * at the end of the enqueue.
+ */
+ if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
+ rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
+
list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
cfs_rq->on_list = 0;
}
}
-/* Iterate through all leaf cfs_rq's on a runqueue: */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
+ SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
+}
+
+/* Iterate through all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
+ leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline struct cfs_rq *
@@ -410,12 +427,6 @@ static inline struct task_struct *task_of(struct sched_entity *se)
return container_of(se, struct task_struct, se);
}
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
- return container_of(cfs_rq, struct rq, cfs);
-}
-
-
#define for_each_sched_entity(se) \
for (; se; se = NULL)
@@ -438,16 +449,21 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return NULL;
}
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
+ return true;
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
+}
+
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
@@ -686,9 +702,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
-#ifdef CONFIG_SMP
#include "pelt.h"
-#include "sched-pelt.h"
+#ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
@@ -744,8 +759,9 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
* Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
* if util_avg > util_avg_cap.
*/
-void post_init_entity_util_avg(struct sched_entity *se)
+void post_init_entity_util_avg(struct task_struct *p)
{
+ struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
@@ -763,22 +779,19 @@ void post_init_entity_util_avg(struct sched_entity *se)
}
}
- if (entity_is_task(se)) {
- struct task_struct *p = task_of(se);
- if (p->sched_class != &fair_sched_class) {
- /*
- * For !fair tasks do:
- *
- update_cfs_rq_load_avg(now, cfs_rq);
- attach_entity_load_avg(cfs_rq, se, 0);
- switched_from_fair(rq, p);
- *
- * such that the next switched_to_fair() has the
- * expected state.
- */
- se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
- return;
- }
+ if (p->sched_class != &fair_sched_class) {
+ /*
+ * For !fair tasks do:
+ *
+ update_cfs_rq_load_avg(now, cfs_rq);
+ attach_entity_load_avg(cfs_rq, se, 0);
+ switched_from_fair(rq, p);
+ *
+ * such that the next switched_to_fair() has the
+ * expected state.
+ */
+ se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
+ return;
}
attach_entity_cfs_rq(se);
@@ -788,7 +801,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
void init_entity_runnable_average(struct sched_entity *se)
{
}
-void post_init_entity_util_avg(struct sched_entity *se)
+void post_init_entity_util_avg(struct task_struct *p)
{
}
static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
@@ -1035,7 +1048,7 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
unsigned int sysctl_numa_balancing_scan_delay = 1000;
struct numa_group {
- atomic_t refcount;
+ refcount_t refcount;
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
@@ -1104,7 +1117,7 @@ static unsigned int task_scan_start(struct task_struct *p)
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
- period *= atomic_read(&ng->refcount);
+ period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
}
@@ -1127,7 +1140,7 @@ static unsigned int task_scan_max(struct task_struct *p)
unsigned long private = group_faults_priv(ng);
unsigned long period = smax;
- period *= atomic_read(&ng->refcount);
+ period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
@@ -1160,7 +1173,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
/* New address space, reset the preferred nid */
if (!(clone_flags & CLONE_VM)) {
- p->numa_preferred_nid = -1;
+ p->numa_preferred_nid = NUMA_NO_NODE;
return;
}
@@ -1180,13 +1193,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}
static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
@@ -1400,7 +1413,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
* two full passes of the "multi-stage node selection" test that is
* executed below.
*/
- if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
(cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
return true;
@@ -1848,7 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
return;
/* Periodically retry migrating the task to the preferred node */
@@ -2095,7 +2108,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
static void task_numa_placement(struct task_struct *p)
{
- int seq, nid, max_nid = -1;
+ int seq, nid, max_nid = NUMA_NO_NODE;
unsigned long max_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
unsigned long total_faults;
@@ -2203,12 +2216,12 @@ static void task_numa_placement(struct task_struct *p)
static inline int get_numa_group(struct numa_group *grp)
{
- return atomic_inc_not_zero(&grp->refcount);
+ return refcount_inc_not_zero(&grp->refcount);
}
static inline void put_numa_group(struct numa_group *grp)
{
- if (atomic_dec_and_test(&grp->refcount))
+ if (refcount_dec_and_test(&grp->refcount))
kfree_rcu(grp, rcu);
}
@@ -2229,7 +2242,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!grp)
return;
- atomic_set(&grp->refcount, 1);
+ refcount_set(&grp->refcount, 1);
grp->active_nodes = 1;
grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock);
@@ -2638,7 +2651,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
* the preferred node.
*/
if (dst_nid == p->numa_preferred_nid ||
- (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+ (p->numa_preferred_nid != NUMA_NO_NODE &&
+ src_nid != p->numa_preferred_nid))
return;
}
@@ -3122,7 +3136,7 @@ void set_task_rq_fair(struct sched_entity *se,
p_last_update_time = prev->avg.last_update_time;
n_last_update_time = next->avg.last_update_time;
#endif
- __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
+ __update_load_avg_blocked_se(p_last_update_time, se);
se->avg.last_update_time = n_last_update_time;
}
@@ -3257,11 +3271,11 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
/*
* runnable_sum can't be lower than running_sum
- * As running sum is scale with CPU capacity wehreas the runnable sum
- * is not we rescale running_sum 1st
+ * Rescale running sum to be in the same range as runnable sum
+ * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
+ * runnable_sum is in [0 : LOAD_AVG_MAX]
*/
- running_sum = se->avg.util_sum /
- arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+ running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
runnable_sum = max(runnable_sum, running_sum);
load_sum = (s64)se_weight(se) * runnable_sum;
@@ -3364,7 +3378,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
- * @now: current time, as per cfs_rq_clock_task()
+ * @now: current time, as per cfs_rq_clock_pelt()
* @cfs_rq: cfs_rq to update
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
@@ -3409,7 +3423,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
decayed = 1;
}
- decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+ decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
#ifndef CONFIG_64BIT
smp_wmb();
@@ -3499,9 +3513,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- u64 now = cfs_rq_clock_task(cfs_rq);
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
+ u64 now = cfs_rq_clock_pelt(cfs_rq);
int decayed;
/*
@@ -3509,7 +3521,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* track group sched_entity load average for task_h_load calc in migration
*/
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
- __update_load_avg_se(now, cpu, cfs_rq, se);
+ __update_load_avg_se(now, cfs_rq, se);
decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
@@ -3561,7 +3573,7 @@ void sync_entity_load_avg(struct sched_entity *se)
u64 last_update_time;
last_update_time = cfs_rq_last_update_time(cfs_rq);
- __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
+ __update_load_avg_blocked_se(last_update_time, se);
}
/*
@@ -3577,10 +3589,6 @@ void remove_entity_load_avg(struct sched_entity *se)
* tasks cannot exit without having gone through wake_up_new_task() ->
* post_init_entity_util_avg() which will have added things to the
* cfs_rq, so we can remove unconditionally.
- *
- * Similarly for groups, they will have passed through
- * post_init_entity_util_avg() before unregister_sched_fair_group()
- * calls this.
*/
sync_entity_load_avg(se);
@@ -3654,6 +3662,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
{
long last_ewma_diff;
struct util_est ue;
+ int cpu;
if (!sched_feat(UTIL_EST))
return;
@@ -3688,6 +3697,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
return;
/*
+ * To avoid overestimation of actual task utilization, skip updates if
+ * we cannot guarantee there is idle time on this CPU.
+ */
+ cpu = cpu_of(rq_of(cfs_rq));
+ if (task_util(p) > capacity_orig_of(cpu))
+ return;
+
+ /*
* Update Task's estimated utilization
*
* When *p completes an activation we can consolidate another sample
@@ -4429,6 +4446,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
/* adjust cfs_rq_clock_task() */
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
+
+ /* Add cfs_rq with already running entity in the list */
+ if (cfs_rq->nr_running >= 1)
+ list_add_leaf_cfs_rq(cfs_rq);
}
return 0;
@@ -4440,8 +4461,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
/* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count)
+ if (!cfs_rq->throttle_count) {
cfs_rq->throttled_clock_task = rq_clock_task(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
cfs_rq->throttle_count++;
return 0;
@@ -4544,6 +4567,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
break;
}
+ assert_list_leaf_cfs_rq(rq);
+
if (!se)
add_nr_running(rq, task_delta);
@@ -4565,7 +4590,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
struct rq *rq = rq_of(cfs_rq);
struct rq_flags rf;
- rq_lock(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
@@ -4582,7 +4607,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
unthrottle_cfs_rq(cfs_rq);
next:
- rq_unlock(rq, &rf);
+ rq_unlock_irqrestore(rq, &rf);
if (!remaining)
break;
@@ -4598,7 +4623,7 @@ next:
* period the timer is deactivated until scheduling resumes; cfs_b->idle is
* used to track this state.
*/
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
u64 runtime, runtime_expires;
int throttled;
@@ -4640,11 +4665,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
runtime = cfs_b->runtime;
cfs_b->distribute_running = 1;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
runtime_expires);
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
@@ -4753,17 +4778,18 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+ unsigned long flags;
u64 expires;
/* confirm we're still not at a refresh boundary */
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
if (cfs_b->distribute_running) {
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
}
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
}
@@ -4774,18 +4800,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (runtime)
cfs_b->distribute_running = 1;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
if (!runtime)
return;
runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
if (expires == cfs_b->runtime_expires)
lsub_positive(&cfs_b->runtime, runtime);
cfs_b->distribute_running = 0;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
/*
@@ -4863,20 +4889,21 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
+ unsigned long flags;
int overrun;
int idle = 0;
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
for (;;) {
overrun = hrtimer_forward_now(timer, cfs_b->period);
if (!overrun)
break;
- idle = do_sched_cfs_period_timer(cfs_b, overrun);
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
cfs_b->period_active = 0;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -4986,6 +5013,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
}
#else /* CONFIG_CFS_BANDWIDTH */
+
+static inline bool cfs_bandwidth_used(void)
+{
+ return false;
+}
+
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
return rq_clock_task(rq_of(cfs_rq));
@@ -5177,6 +5210,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
+ if (cfs_bandwidth_used()) {
+ /*
+ * When bandwidth control is enabled, the cfs_rq_throttled()
+ * breaks in the above iteration can result in incomplete
+ * leaf list maintenance, which then triggers the assertion
+ * below.
+ */
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (list_add_leaf_cfs_rq(cfs_rq))
+ break;
+ }
+ }
+
+ assert_list_leaf_cfs_rq(rq);
+
hrtick_update(rq);
}
@@ -5556,11 +5606,6 @@ static unsigned long capacity_of(int cpu)
return cpu_rq(cpu)->cpu_capacity;
}
-static unsigned long capacity_orig_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_orig;
-}
-
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -6053,7 +6098,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
bool idle = true;
for_each_cpu(cpu, cpu_smt_mask(core)) {
- cpumask_clear_cpu(cpu, cpus);
+ __cpumask_clear_cpu(cpu, cpus);
if (!available_idle_cpu(cpu))
idle = false;
}
@@ -6073,7 +6118,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_smt(struct task_struct *p, int target)
{
int cpu;
@@ -6097,7 +6142,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
return -1;
}
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static inline int select_idle_smt(struct task_struct *p, int target)
{
return -1;
}
@@ -6202,7 +6247,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if ((unsigned)i < nr_cpumask_bits)
return i;
- i = select_idle_smt(p, sd, target);
+ i = select_idle_smt(p, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
@@ -6608,7 +6653,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
- if (static_branch_unlikely(&sched_energy_present)) {
+ if (sched_energy_enabled()) {
new_cpu = find_energy_efficient_cpu(p, prev_cpu);
if (new_cpu >= 0)
return new_cpu;
@@ -7027,6 +7072,12 @@ idle:
if (new_tasks > 0)
goto again;
+ /*
+ * rq is about to be idle, check if we need to update the
+ * lost_idle_time of clock_pelt
+ */
+ update_idle_rq_clock_pelt(rq);
+
return NULL;
}
@@ -7647,10 +7698,27 @@ static inline bool others_have_blocked(struct rq *rq)
#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load.weight)
+ return false;
+
+ if (cfs_rq->avg.load_sum)
+ return false;
+
+ if (cfs_rq->avg.util_sum)
+ return false;
+
+ if (cfs_rq->avg.runnable_load_sum)
+ return false;
+
+ return true;
+}
+
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *pos;
const struct sched_class *curr_class;
struct rq_flags rf;
bool done = true;
@@ -7662,14 +7730,10 @@ static void update_blocked_averages(int cpu)
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
- for_each_leaf_cfs_rq(rq, cfs_rq) {
+ for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- continue;
-
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent, if any: */
@@ -7677,14 +7741,21 @@ static void update_blocked_averages(int cpu)
if (se && !skip_blocked_update(se))
update_load_avg(cfs_rq_of(se), se, 0);
+ /*
+ * There can be a lot of idle CPU cgroups. Don't let fully
+ * decayed cfs_rqs linger on the list.
+ */
+ if (cfs_rq_is_decayed(cfs_rq))
+ list_del_leaf_cfs_rq(cfs_rq);
+
/* Don't need periodic decay once load/util_avg are null */
if (cfs_rq_has_blocked(cfs_rq))
done = false;
}
curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
/* Don't need periodic decay once load/util_avg are null */
if (others_have_blocked(rq))
@@ -7713,10 +7784,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
if (cfs_rq->last_h_load_update == now)
return;
- cfs_rq->h_load_next = NULL;
+ WRITE_ONCE(cfs_rq->h_load_next, NULL);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- cfs_rq->h_load_next = se;
+ WRITE_ONCE(cfs_rq->h_load_next, se);
if (cfs_rq->last_h_load_update == now)
break;
}
@@ -7726,7 +7797,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
cfs_rq->last_h_load_update = now;
}
- while ((se = cfs_rq->h_load_next) != NULL) {
+ while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
load = cfs_rq->h_load;
load = div64_ul(load * se->avg.load_avg,
cfs_rq_load_avg(cfs_rq) + 1);
@@ -7754,11 +7825,11 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
@@ -7989,6 +8060,18 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
}
/*
+ * Check whether a rq has a misfit task and if it looks like we can actually
+ * help that task: we can migrate the task to a CPU of higher capacity, or
+ * the task's current CPU is heavily pressured.
+ */
+static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+{
+ return rq->misfit_task_load &&
+ (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
+ check_cpu_capacity(rq, sd));
+}
+
+/*
* Group imbalance indicates (and tries to solve) the problem where balancing
* groups is inadequate due to ->cpus_allowed constraints.
*
@@ -8452,9 +8535,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
return 0;
- env->imbalance = DIV_ROUND_CLOSEST(
- sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
- SCHED_CAPACITY_SCALE);
+ env->imbalance = sds->busiest_stat.group_load;
return 1;
}
@@ -8636,7 +8717,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
*/
update_sd_lb_stats(env, &sds);
- if (static_branch_unlikely(&sched_energy_present)) {
+ if (sched_energy_enabled()) {
struct root_domain *rd = env->dst_rq->rd;
if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
@@ -8827,21 +8908,25 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
#define MAX_PINNED_INTERVAL 512
-static int need_active_balance(struct lb_env *env)
+static inline bool
+asym_active_balance(struct lb_env *env)
{
- struct sched_domain *sd = env->sd;
+ /*
+ * ASYM_PACKING needs to force migrate tasks from busy but
+ * lower priority CPUs in order to pack all tasks in the
+ * highest priority CPUs.
+ */
+ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
+ sched_asym_prefer(env->dst_cpu, env->src_cpu);
+}
- if (env->idle == CPU_NEWLY_IDLE) {
+static inline bool
+voluntary_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
- /*
- * ASYM_PACKING needs to force migrate tasks from busy but
- * lower priority CPUs in order to pack all tasks in the
- * highest priority CPUs.
- */
- if ((sd->flags & SD_ASYM_PACKING) &&
- sched_asym_prefer(env->dst_cpu, env->src_cpu))
- return 1;
- }
+ if (asym_active_balance(env))
+ return 1;
/*
* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
@@ -8859,6 +8944,16 @@ static int need_active_balance(struct lb_env *env)
if (env->src_grp_type == group_misfit_task)
return 1;
+ return 0;
+}
+
+static int need_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
+
+ if (voluntary_active_balance(env))
+ return 1;
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -9023,7 +9118,7 @@ more_balance:
if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
/* Prevent to re-select dst_cpu via env's CPUs */
- cpumask_clear_cpu(env.dst_cpu, env.cpus);
+ __cpumask_clear_cpu(env.dst_cpu, env.cpus);
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
@@ -9050,7 +9145,7 @@ more_balance:
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
- cpumask_clear_cpu(cpu_of(busiest), cpus);
+ __cpumask_clear_cpu(cpu_of(busiest), cpus);
/*
* Attempting to continue load balancing at the current
* sched_domain level only makes sense if there are
@@ -9120,7 +9215,7 @@ more_balance:
} else
sd->nr_balance_failed = 0;
- if (likely(!active_balance)) {
+ if (likely(!active_balance) || voluntary_active_balance(&env)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
} else {
@@ -9469,15 +9564,8 @@ static void kick_ilb(unsigned int flags)
}
/*
- * Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu in the system.
- * - This rq has more than one task.
- * - This rq has at least one CFS task and the capacity of the CPU is
- * significantly reduced because of RT tasks or IRQs.
- * - At parent of LLC scheduler domain level, this cpu's scheduler group has
- * multiple busy cpu.
- * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
- * domain span are idle.
+ * Current decision point for kicking the idle load balancer in the presence
+ * of idle CPUs in the system.
*/
static void nohz_balancer_kick(struct rq *rq)
{
@@ -9510,30 +9598,21 @@ static void nohz_balancer_kick(struct rq *rq)
if (time_before(now, nohz.next_balance))
goto out;
- if (rq->nr_running >= 2 || rq->misfit_task_load) {
+ if (rq->nr_running >= 2) {
flags = NOHZ_KICK_MASK;
goto out;
}
rcu_read_lock();
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
- if (sds) {
- /*
- * XXX: write a coherent comment on why we do this.
- * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
- */
- nr_busy = atomic_read(&sds->nr_busy_cpus);
- if (nr_busy > 1) {
- flags = NOHZ_KICK_MASK;
- goto unlock;
- }
-
- }
sd = rcu_dereference(rq->sd);
if (sd) {
- if ((rq->cfs.h_nr_running >= 1) &&
- check_cpu_capacity(rq, sd)) {
+ /*
+ * If there's a CFS task and the current CPU has reduced
+ * capacity; kick the ILB to see if there's a better CPU to run
+ * on.
+ */
+ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_KICK_MASK;
goto unlock;
}
@@ -9541,17 +9620,57 @@ static void nohz_balancer_kick(struct rq *rq)
sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
if (sd) {
- for_each_cpu(i, sched_domain_span(sd)) {
- if (i == cpu ||
- !cpumask_test_cpu(i, nohz.idle_cpus_mask))
- continue;
-
+ /*
+ * When ASYM_PACKING; see if there's a more preferred CPU
+ * currently idle; in which case, kick the ILB to move tasks
+ * around.
+ */
+ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
if (sched_asym_prefer(i, cpu)) {
flags = NOHZ_KICK_MASK;
goto unlock;
}
}
}
+
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
+ if (sd) {
+ /*
+ * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
+ * to run the misfit task on.
+ */
+ if (check_misfit_status(rq, sd)) {
+ flags = NOHZ_KICK_MASK;
+ goto unlock;
+ }
+
+ /*
+ * For asymmetric systems, we do not want to nicely balance
+ * cache use, instead we want to embrace asymmetry and only
+ * ensure tasks have enough CPU capacity.
+ *
+ * Skip the LLC logic because it's not relevant in that case.
+ */
+ goto unlock;
+ }
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds) {
+ /*
+ * If there is an imbalance between LLC domains (IOW we could
+ * increase the overall cache use), we need some less-loaded LLC
+ * domain to pull some load. Likewise, we may need to spread
+ * load within the current LLC domain (e.g. packed SMT cores but
+ * other CPUs are idle). We can't really know from here how busy
+ * the others are - so just get a nohz balance going if it looks
+ * like this LLC domain has tasks we could move.
+ */
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
+ if (nr_busy > 1) {
+ flags = NOHZ_KICK_MASK;
+ goto unlock;
+ }
+ }
unlock:
rcu_read_unlock();
out:
@@ -10546,10 +10665,10 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *pos;
rcu_read_lock();
- for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+ for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}
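
The two for_each_leaf_cfs_rq_safe() conversions above (update_blocked_averages() and print_cfs_stats()) are what make list_del_leaf_cfs_rq() legal inside the walk: the _safe variant remembers the next element before the loop body runs, so unlinking the current cfs_rq cannot break the traversal. A minimal userspace sketch of the same keep-or-unlink pattern, with toy types standing in for the kernel's leaf cfs_rq list:

    #include <stdio.h>

    struct fake_cfs_rq {
            unsigned long load_weight;
            unsigned long util_sum;
            struct fake_cfs_rq *next;
    };

    /* analogue of cfs_rq_is_decayed(): nothing left that could still decay */
    static int is_decayed(const struct fake_cfs_rq *cfs)
    {
            return !cfs->load_weight && !cfs->util_sum;
    }

    int main(void)
    {
            struct fake_cfs_rq c = { 0, 0, NULL };      /* fully decayed    */
            struct fake_cfs_rq b = { 0, 128, &c };      /* still has util   */
            struct fake_cfs_rq a = { 1024, 512, &b };   /* still has load   */
            struct fake_cfs_rq *head = &a;

            /* "_safe" walk: fetch the successor before touching *pos */
            for (struct fake_cfs_rq **pp = &head, *pos = head, *nxt; pos; pos = nxt) {
                    nxt = pos->next;
                    if (is_decayed(pos))
                            *pp = nxt;      /* unlink, like list_del_leaf_cfs_rq() */
                    else
                            pp = &pos->next;
            }

            for (struct fake_cfs_rq *pos = head; pos; pos = pos->next)
                    printf("kept cfs_rq: load %lu util %lu\n",
                           pos->load_weight, pos->util_sum);
            return 0;
    }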
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 81faddba9e20..b02d148e7672 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -80,7 +80,7 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
cpumask_andnot(housekeeping_mask,
cpu_possible_mask, non_housekeeping_mask);
if (cpumask_empty(housekeeping_mask))
- cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+ __cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
} else {
cpumask_var_t tmp;
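
cpumask_set_cpu() and cpumask_clear_cpu() use the atomic set_bit()/clear_bit() primitives, while the double-underscore variants switched to in the hunks above are plain read-modify-writes; that is safe here because the masks involved (the per-attempt load-balance mask and the boot-time housekeeping mask) have no concurrent writers. A rough userspace illustration of the distinction on a single-word toy mask (names are illustrative, not the kernel API):

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned long mask_atomic;   /* toy cpumask, one word */
    static unsigned long mask_plain;

    /* analogue of cpumask_set_cpu(): safe against concurrent updaters */
    static void set_cpu(unsigned int cpu)
    {
            atomic_fetch_or(&mask_atomic, 1UL << cpu);
    }

    /* analogue of __cpumask_set_cpu(): caller must guarantee exclusivity */
    static void __set_cpu(unsigned int cpu)
    {
            mask_plain |= 1UL << cpu;
    }

    int main(void)
    {
            set_cpu(2);
            __set_cpu(3);
            printf("atomic=%#lx plain=%#lx\n",
                   atomic_load(&mask_atomic), mask_plain);
            return 0;
    }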
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 90fb5bc12ad4..befce29bd882 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -26,7 +26,6 @@
#include <linux/sched.h>
#include "sched.h"
-#include "sched-pelt.h"
#include "pelt.h"
/*
@@ -106,16 +105,12 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
* n=1
*/
static __always_inline u32
-accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+accumulate_sum(u64 delta, struct sched_avg *sa,
unsigned long load, unsigned long runnable, int running)
{
- unsigned long scale_freq, scale_cpu;
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods;
- scale_freq = arch_scale_freq_capacity(cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
delta += sa->period_contrib;
periods = delta / 1024; /* A period is 1024us (~1ms) */
@@ -137,13 +132,12 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
}
sa->period_contrib = delta;
- contrib = cap_scale(contrib, scale_freq);
if (load)
sa->load_sum += load * contrib;
if (runnable)
sa->runnable_load_sum += runnable * contrib;
if (running)
- sa->util_sum += contrib * scale_cpu;
+ sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
return periods;
}
@@ -177,7 +171,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
-___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+___update_load_sum(u64 now, struct sched_avg *sa,
unsigned long load, unsigned long runnable, int running)
{
u64 delta;
@@ -221,7 +215,7 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
- if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
+ if (!accumulate_sum(delta, sa, load, runnable, running))
return 0;
return 1;
@@ -267,9 +261,9 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
* runnable_load_avg = \Sum se->avg.runable_load_avg
*/
-int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
+int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
- if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+ if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
return 1;
}
@@ -277,9 +271,9 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
return 0;
}
-int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+ if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq,
cfs_rq->curr == se)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
@@ -290,9 +284,9 @@ int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_e
return 0;
}
-int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
+int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
- if (___update_load_sum(now, cpu, &cfs_rq->avg,
+ if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
scale_load_down(cfs_rq->runnable_weight),
cfs_rq->curr != NULL)) {
@@ -317,7 +311,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
{
- if (___update_load_sum(now, rq->cpu, &rq->avg_rt,
+ if (___update_load_sum(now, &rq->avg_rt,
running,
running,
running)) {
@@ -340,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
{
- if (___update_load_sum(now, rq->cpu, &rq->avg_dl,
+ if (___update_load_sum(now, &rq->avg_dl,
running,
running,
running)) {
@@ -365,22 +359,31 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
int update_irq_load_avg(struct rq *rq, u64 running)
{
int ret = 0;
+
+ /*
+ * We can't use clock_pelt because irq time is not accounted in
+ * clock_task. Instead we directly scale the running time to
+ * reflect the real amount of computation
+ */
+ running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
+ running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+
/*
* We know the time that has been used by interrupt since last update
* but we don't when. Let be pessimistic and assume that interrupt has
* happened just before the update. This is not so far from reality
* because interrupt will most probably wake up task and trig an update
- * of rq clock during which the metric si updated.
+ * of rq clock during which the metric is updated.
* We start to decay with normal context time and then we add the
* interrupt context time.
* We can safely remove running from rq->clock because
* rq->clock += delta with delta >= running
*/
- ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
+ ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
0,
0,
0);
- ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
+ ret += ___update_load_sum(rq->clock, &rq->avg_irq,
1,
1,
1);
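
With the frequency and CPU-capacity scaling removed from accumulate_sum(), the irq path above has to pre-scale its running time itself, while every other path inherits the invariance from the new clock_pelt (see the pelt.h hunk below). Assuming cap_scale(v, s) is the usual ((v) * (s)) >> SCHED_CAPACITY_SHIFT fixed-point helper from fair.c, the double scaling works out as in this standalone sketch:

    #include <stdio.h>
    #include <stdint.h>

    #define SCHED_CAPACITY_SHIFT 10
    #define SCHED_CAPACITY_SCALE (1UL << SCHED_CAPACITY_SHIFT)

    /* same arithmetic as the kernel's cap_scale() is assumed to use */
    static uint64_t cap_scale(uint64_t v, unsigned long s)
    {
            return (v * s) >> SCHED_CAPACITY_SHIFT;
    }

    int main(void)
    {
            uint64_t delta = 1000;          /* 1000us of wall-clock time      */
            unsigned long cpu_cap  = 512;   /* little CPU: half max capacity  */
            unsigned long freq_cap = 512;   /* currently at half max freq     */

            /* what update_irq_load_avg()/update_rq_clock_pelt() feed to PELT */
            uint64_t scaled = cap_scale(cap_scale(delta, cpu_cap), freq_cap);

            /* half capacity at half frequency: only a quarter of the work */
            printf("%llu us -> %llu us of invariant time\n",
                   (unsigned long long)delta, (unsigned long long)scaled);
            return 0;
    }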
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7e56b489ff32..7489d5f56960 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -1,8 +1,9 @@
#ifdef CONFIG_SMP
+#include "sched-pelt.h"
-int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
-int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
-int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
+int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
+int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
@@ -42,6 +43,101 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
WRITE_ONCE(avg->util_est.enqueued, enqueued);
}
+/*
+ * The clock_pelt scales the time to reflect the effective amount of
+ * computation done during the running delta time but then sync back to
+ * clock_task when rq is idle.
+ *
+ *
+ * absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16
+ * @ max capacity ------******---------------******---------------
+ * @ half capacity ------************---------************---------
+ * clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16
+ *
+ */
+static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
+{
+ if (unlikely(is_idle_task(rq->curr))) {
+ /* The rq is idle, we can sync to clock_task */
+ rq->clock_pelt = rq_clock_task(rq);
+ return;
+ }
+
+ /*
+ * When a rq runs at a lower compute capacity, it will need
+ * more time to do the same amount of work than at max
+ * capacity. In order to be invariant, we scale the delta to
+ * reflect how much work has been really done.
+ * Running longer results in stealing idle time that will
+ * disturb the load signal compared to max capacity. This
+ * stolen idle time will be automatically reflected when the
+ * rq will be idle and the clock will be synced with
+ * rq_clock_task.
+ */
+
+ /*
+ * Scale the elapsed time to reflect the real amount of
+ * computation
+ */
+ delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+ delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
+
+ rq->clock_pelt += delta;
+}
+
+/*
+ * When rq becomes idle, we have to check if it has lost idle time
+ * because it was fully busy. A rq is fully used when the /Sum util_sum
+ * is greater or equal to:
+ * (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT;
+ * For optimization and computing rounding purpose, we don't take into account
+ * the position in the current window (period_contrib) and we use the higher
+ * bound of util_sum to decide.
+ */
+static inline void update_idle_rq_clock_pelt(struct rq *rq)
+{
+ u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX;
+ u32 util_sum = rq->cfs.avg.util_sum;
+ util_sum += rq->avg_rt.util_sum;
+ util_sum += rq->avg_dl.util_sum;
+
+ /*
+ * Reflecting stolen time makes sense only if the idle
+ * phase would be present at max capacity. As soon as the
+ * utilization of a rq has reached the maximum value, it is
+ * considered as an always running rq without idle time to
+ * steal. This potential idle time is considered as lost in
+ * this case. We keep track of this lost idle time compared to
+ * rq's clock_task.
+ */
+ if (util_sum >= divider)
+ rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+}
+
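
The threshold above can be made concrete. Taking LOAD_AVG_MAX as 47742 (the value assumed to be generated into sched-pelt.h) and SCHED_CAPACITY_SHIFT as 10, a rq whose summed CFS+RT+DL util_sum reaches the divider is treated as always running, and whatever gap has opened between clock_task and clock_pelt is booked as lost idle time rather than handed back:

    #include <stdio.h>
    #include <stdint.h>

    #define SCHED_CAPACITY_SHIFT 10
    #define LOAD_AVG_MAX 47742      /* assumed value from sched-pelt.h */

    int main(void)
    {
            uint32_t divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT)
                               - LOAD_AVG_MAX;
            /* a rq sitting this close to the ceiling counts as fully busy */
            uint32_t util_sum = 46700u << SCHED_CAPACITY_SHIFT;

            printf("divider   = %u\n", divider);        /* 47791490 */
            printf("util_sum  = %u\n", util_sum);
            printf("fully busy: %s\n", util_sum >= divider ? "yes" : "no");
            return 0;
    }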
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ lockdep_assert_held(&rq->lock);
+ assert_clock_updated(rq);
+
+ return rq->clock_pelt - rq->lost_idle_time;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+ if (unlikely(cfs_rq->throttle_count))
+ return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+
+ return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+}
+#else
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+ return rq_clock_pelt(rq_of(cfs_rq));
+}
+#endif
+
#else
static inline int
@@ -67,6 +163,18 @@ update_irq_load_avg(struct rq *rq, u64 running)
{
return 0;
}
+
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ return rq_clock_task(rq);
+}
+
+static inline void
+update_rq_clock_pelt(struct rq *rq, s64 delta) { }
+
+static inline void
+update_idle_rq_clock_pelt(struct rq *rq) { }
+
#endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e4f398ad9e73..90fa23d36565 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1587,7 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* rt task
*/
if (rq->curr->sched_class != &rt_sched_class)
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
return p;
}
@@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
update_curr_rt(rq);
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
/*
* The previous task needs to be made eligible for pushing
@@ -2325,7 +2325,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
watchdog(rq, p);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d04530bf251f..efa686eeff26 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -861,7 +861,10 @@ struct rq {
unsigned int clock_update_flags;
u64 clock;
- u64 clock_task;
+ /* Ensure that all clocks are in the same cache line */
+ u64 clock_task ____cacheline_aligned;
+ u64 clock_pelt;
+ unsigned long lost_idle_time;
atomic_t nr_iowait;
@@ -951,6 +954,22 @@ struct rq {
#endif
};
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* CPU runqueue to which this cfs_rq is attached */
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->rq;
+}
+
+#else
+
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return container_of(cfs_rq, struct rq, cfs);
+}
+#endif
+
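
Without CONFIG_FAIR_GROUP_SCHED every rq embeds exactly one cfs_rq, so rq_of() can recover the enclosing rq with container_of() instead of carrying a back-pointer. The same pattern in isolation, with toy struct names rather than the scheduler's:

    #include <stdio.h>
    #include <stddef.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct toy_cfs_rq { unsigned long nr_running; };

    struct toy_rq {
            int cpu;
            struct toy_cfs_rq cfs;          /* embedded, like rq->cfs */
    };

    static struct toy_rq *rq_of(struct toy_cfs_rq *cfs_rq)
    {
            return container_of(cfs_rq, struct toy_rq, cfs);
    }

    int main(void)
    {
            struct toy_rq rq = { .cpu = 3, .cfs = { .nr_running = 2 } };

            /* given only the embedded member, recover the parent */
            printf("cfs_rq belongs to CPU %d\n", rq_of(&rq.cfs)->cpu);
            return 0;
    }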
static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
@@ -1260,7 +1279,7 @@ extern void sched_ttwu_pending(void);
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
- * See detach_destroy_domains: synchronize_sched for details.
+ * See destroy_sched_domains: call_rcu for details.
*
* The domain tree of any CPU may only be accessed from within
* preempt-disabled sections.
@@ -1460,9 +1479,9 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
*/
smp_wmb();
#ifdef CONFIG_THREAD_INFO_IN_TASK
- p->cpu = cpu;
+ WRITE_ONCE(p->cpu, cpu);
#else
- task_thread_info(p)->cpu = cpu;
+ WRITE_ONCE(task_thread_info(p)->cpu, cpu);
#endif
p->wake_cpu = cpu;
#endif
@@ -1563,7 +1582,7 @@ static inline int task_on_rq_queued(struct task_struct *p)
static inline int task_on_rq_migrating(struct task_struct *p)
{
- return p->on_rq == TASK_ON_RQ_MIGRATING;
+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
}
/*
@@ -1781,7 +1800,7 @@ extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
-extern void post_init_entity_util_avg(struct sched_entity *se);
+extern void post_init_entity_util_avg(struct task_struct *p);
#ifdef CONFIG_NO_HZ_FULL
extern bool sched_can_stop_tick(struct rq *rq);
@@ -2211,6 +2230,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
# define arch_scale_freq_invariant() false
#endif
+#ifdef CONFIG_SMP
+static inline unsigned long capacity_orig_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig;
+}
+#endif
+
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
/**
* enum schedutil_type - CPU utilization type
@@ -2299,11 +2325,19 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
#endif
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+
#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
-#else
+
+DECLARE_STATIC_KEY_FALSE(sched_energy_present);
+
+static inline bool sched_energy_enabled(void)
+{
+ return static_branch_unlikely(&sched_energy_present);
+}
+
+#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
+
#define perf_domain_span(pd) NULL
-#endif
+static inline bool sched_energy_enabled(void) { return false; }
-#ifdef CONFIG_SMP
-extern struct static_key_false sched_energy_present;
-#endif
+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 3f35ba1d8fde..ab7f371a3a17 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -201,11 +201,37 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
return 1;
}
-DEFINE_STATIC_KEY_FALSE(sched_energy_present);
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+DEFINE_STATIC_KEY_FALSE(sched_energy_present);
+unsigned int sysctl_sched_energy_aware = 1;
DEFINE_MUTEX(sched_energy_mutex);
bool sched_energy_update;
+#ifdef CONFIG_PROC_SYSCTL
+int sched_energy_aware_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, state;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!ret && write) {
+ state = static_branch_unlikely(&sched_energy_present);
+ if (state != sysctl_sched_energy_aware) {
+ mutex_lock(&sched_energy_mutex);
+ sched_energy_update = 1;
+ rebuild_sched_domains();
+ sched_energy_update = 0;
+ mutex_unlock(&sched_energy_mutex);
+ }
+ }
+
+ return ret;
+}
+#endif
+
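
Together with the kern_table entry added in the sysctl.c hunk further down, this handler lets a CAP_SYS_ADMIN process toggle energy-aware scheduling at run time; when the written value disagrees with the current state of the sched_energy_present static key, the scheduling domains are rebuilt. A minimal sketch of such a write (the procfs path follows from the .procname entry; the helper itself is illustrative):

    #include <stdio.h>

    /* write "0" to disable EAS, "1" to re-enable it (needs CAP_SYS_ADMIN) */
    static int set_sched_energy_aware(int enable)
    {
            FILE *f = fopen("/proc/sys/kernel/sched_energy_aware", "w");

            if (!f)
                    return -1;
            fprintf(f, "%d\n", enable ? 1 : 0);
            return fclose(f);
    }

    int main(void)
    {
            return set_sched_energy_aware(1) ? 1 : 0;
    }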
static void free_pd(struct perf_domain *pd)
{
struct perf_domain *tmp;
@@ -322,6 +348,9 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
struct cpufreq_policy *policy;
struct cpufreq_governor *gov;
+ if (!sysctl_sched_energy_aware)
+ goto free;
+
/* EAS is enabled for asymmetric CPU capacity topologies. */
if (!per_cpu(sd_asym_cpucapacity, cpu)) {
if (sched_debug()) {
@@ -442,7 +471,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
raw_spin_unlock_irqrestore(&rq->lock, flags);
if (old_rd)
- call_rcu_sched(&old_rd->rcu, free_rootdomain);
+ call_rcu(&old_rd->rcu, free_rootdomain);
}
void sched_get_rd(struct root_domain *rd)
@@ -455,7 +484,7 @@ void sched_put_rd(struct root_domain *rd)
if (!atomic_dec_and_test(&rd->refcount))
return;
- call_rcu_sched(&rd->rcu, free_rootdomain);
+ call_rcu(&rd->rcu, free_rootdomain);
}
static int init_rootdomain(struct root_domain *rd)
@@ -676,7 +705,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
}
struct s_data {
- struct sched_domain ** __percpu sd;
+ struct sched_domain * __percpu *sd;
struct root_domain *rd;
};
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e815781ed751..df27e499956a 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -149,7 +149,7 @@ static void populate_seccomp_data(struct seccomp_data *sd)
sd->nr = syscall_get_nr(task, regs);
sd->arch = syscall_get_arch();
- syscall_get_arguments(task, regs, 0, 6, args);
+ syscall_get_arguments(task, regs, args);
sd->args[0] = args[0];
sd->args[1] = args[1];
sd->args[2] = args[2];
@@ -267,6 +267,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
+ preempt_disable();
for (; f; f = f->prev) {
u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
@@ -275,6 +276,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
*match = f;
}
}
+ preempt_enable();
return ret;
}
#endif /* CONFIG_SECCOMP_FILTER */
@@ -443,8 +445,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
* behavior of privileged children.
*/
if (!task_no_new_privs(current) &&
- security_capable_noaudit(current_cred(), current_user_ns(),
- CAP_SYS_ADMIN) != 0)
+ security_capable(current_cred(), current_user_ns(),
+ CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0)
return ERR_PTR(-EACCES);
/* Allocate a new seccomp_filter */
diff --git a/kernel/signal.c b/kernel/signal.c
index 57b7771e20d7..f98448cf2def 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -19,7 +19,9 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
+#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/proc_fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
#include <linux/coredump.h>
@@ -3455,7 +3457,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese,
}
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,
struct compat_siginfo __user *, uinfo,
struct old_timespec32 __user *, uts, compat_size_t, sigsetsize)
{
@@ -3487,6 +3489,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
#endif
#endif
+static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
+{
+ clear_siginfo(info);
+ info->si_signo = sig;
+ info->si_errno = 0;
+ info->si_code = SI_USER;
+ info->si_pid = task_tgid_vnr(current);
+ info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
+}
+
/**
* sys_kill - send a signal to a process
* @pid: the PID of the process
@@ -3496,16 +3508,120 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
struct kernel_siginfo info;
- clear_siginfo(&info);
- info.si_signo = sig;
- info.si_errno = 0;
- info.si_code = SI_USER;
- info.si_pid = task_tgid_vnr(current);
- info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
+ prepare_kill_siginfo(sig, &info);
return kill_something_info(sig, &info, pid);
}
+#ifdef CONFIG_PROC_FS
+/*
+ * Verify that the signaler and signalee either are in the same pid namespace
+ * or that the signaler's pid namespace is an ancestor of the signalee's pid
+ * namespace.
+ */
+static bool access_pidfd_pidns(struct pid *pid)
+{
+ struct pid_namespace *active = task_active_pid_ns(current);
+ struct pid_namespace *p = ns_of_pid(pid);
+
+ for (;;) {
+ if (!p)
+ return false;
+ if (p == active)
+ break;
+ p = p->parent;
+ }
+
+ return true;
+}
+
+static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
+{
+#ifdef CONFIG_COMPAT
+ /*
+ * Avoid hooking up compat syscalls and instead handle necessary
+ * conversions here. Note, this is a stop-gap measure and should not be
+ * considered a generic solution.
+ */
+ if (in_compat_syscall())
+ return copy_siginfo_from_user32(
+ kinfo, (struct compat_siginfo __user *)info);
+#endif
+ return copy_siginfo_from_user(kinfo, info);
+}
+
+/**
+ * sys_pidfd_send_signal - send a signal to a process through a task file
+ * descriptor
+ * @pidfd: the file descriptor of the process
+ * @sig: signal to be sent
+ * @info: the signal info
+ * @flags: future flags to be passed
+ *
+ * The syscall currently only signals via PIDTYPE_PID which covers
+ * kill(<positive-pid>, <signal>). It does not signal threads or process
+ * groups.
+ * In order to extend the syscall to threads and process groups the @flags
+ * argument should be used. In essence, the @flags argument will determine
+ * what is signaled and not the file descriptor itself. Put in other words,
+ * grouping is a property of the flags argument not a property of the file
+ * descriptor.
+ *
+ * Return: 0 on success, negative errno on failure
+ */
+SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
+ siginfo_t __user *, info, unsigned int, flags)
+{
+ int ret;
+ struct fd f;
+ struct pid *pid;
+ kernel_siginfo_t kinfo;
+
+ /* Enforce flags be set to 0 until we add an extension. */
+ if (flags)
+ return -EINVAL;
+
+ f = fdget_raw(pidfd);
+ if (!f.file)
+ return -EBADF;
+
+ /* Is this a pidfd? */
+ pid = tgid_pidfd_to_pid(f.file);
+ if (IS_ERR(pid)) {
+ ret = PTR_ERR(pid);
+ goto err;
+ }
+
+ ret = -EINVAL;
+ if (!access_pidfd_pidns(pid))
+ goto err;
+
+ if (info) {
+ ret = copy_siginfo_from_user_any(&kinfo, info);
+ if (unlikely(ret))
+ goto err;
+
+ ret = -EINVAL;
+ if (unlikely(sig != kinfo.si_signo))
+ goto err;
+
+ /* Only allow sending arbitrary signals to yourself. */
+ ret = -EPERM;
+ if ((task_pid(current) != pid) &&
+ (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
+ goto err;
+ } else {
+ prepare_kill_siginfo(sig, &kinfo);
+ }
+
+ ret = kill_pid_info(sig, &kinfo, pid);
+
+err:
+ fdput(f);
+ return ret;
+}
+#endif /* CONFIG_PROC_FS */
+
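
At this point in the series a pidfd is simply an open /proc/<pid> directory descriptor (tgid_pidfd_to_pid() above resolves it), so the new syscall can be exercised from userspace roughly as below. The raw syscall() invocation and the number 424 are assumptions made for illustration; no libc wrapper exists yet:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_pidfd_send_signal
    #define __NR_pidfd_send_signal 424      /* assumed syscall number */
    #endif

    int main(int argc, char **argv)
    {
            char path[64];
            int pidfd;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                    return 1;
            }
            snprintf(path, sizeof(path), "/proc/%s", argv[1]);

            pidfd = open(path, O_DIRECTORY | O_CLOEXEC);
            if (pidfd < 0) {
                    perror("open");
                    return 1;
            }

            /* info == NULL: the kernel builds a SI_USER siginfo, as kill(2) would */
            if (syscall(__NR_pidfd_send_signal, pidfd, SIGTERM, NULL, 0) < 0) {
                    perror("pidfd_send_signal");
                    return 1;
            }

            close(pidfd);
            return 0;
    }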
static int
do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
{
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d28813306b2c..10277429ed84 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -89,7 +89,8 @@ static bool ksoftirqd_running(unsigned long pending)
if (pending & SOFTIRQ_NOW_MASK)
return false;
- return tsk && (tsk->state == TASK_RUNNING);
+ return tsk && (tsk->state == TASK_RUNNING) &&
+ !__kthread_should_park(tsk);
}
/*
diff --git a/kernel/sys.c b/kernel/sys.c
index f7eb62eceb24..12df0e5434b8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -516,7 +516,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
new->uid = kruid;
if (!uid_eq(old->uid, kruid) &&
!uid_eq(old->euid, kruid) &&
- !ns_capable(old->user_ns, CAP_SETUID))
+ !ns_capable_setid(old->user_ns, CAP_SETUID))
goto error;
}
@@ -525,7 +525,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
if (!uid_eq(old->uid, keuid) &&
!uid_eq(old->euid, keuid) &&
!uid_eq(old->suid, keuid) &&
- !ns_capable(old->user_ns, CAP_SETUID))
+ !ns_capable_setid(old->user_ns, CAP_SETUID))
goto error;
}
@@ -584,7 +584,7 @@ long __sys_setuid(uid_t uid)
old = current_cred();
retval = -EPERM;
- if (ns_capable(old->user_ns, CAP_SETUID)) {
+ if (ns_capable_setid(old->user_ns, CAP_SETUID)) {
new->suid = new->uid = kuid;
if (!uid_eq(kuid, old->uid)) {
retval = set_user(new);
@@ -646,7 +646,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
old = current_cred();
retval = -EPERM;
- if (!ns_capable(old->user_ns, CAP_SETUID)) {
+ if (!ns_capable_setid(old->user_ns, CAP_SETUID)) {
if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
!uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
goto error;
@@ -814,7 +814,7 @@ long __sys_setfsuid(uid_t uid)
if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) ||
uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
- ns_capable(old->user_ns, CAP_SETUID)) {
+ ns_capable_setid(old->user_ns, CAP_SETUID)) {
if (!uid_eq(kuid, old->fsuid)) {
new->fsuid = kuid;
if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -1747,6 +1747,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
if (who == RUSAGE_CHILDREN)
break;
+ /* fall through */
case RUSAGE_SELF:
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
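
The bare /* fall through */ comment added here, like the ones added to hrtimer.c and tick-broadcast.c further down, is there so that GCC's -Wimplicit-fallthrough treats the fall-through as intentional instead of warning about it. The pattern in a self-contained form (toy constants, not the real RUSAGE_* values):

    #include <stdio.h>

    /* build with: gcc -Wextra -Wimplicit-fallthrough fallthrough.c */
    static long usage_of(int who, long self, long children)
    {
            long total = 0;

            switch (who) {
            case 2:                         /* "both"-like: wants children too */
                    total += children;
                    /* fall through */
            case 0:                         /* "self"-like */
                    total += self;
                    break;
            case -1:                        /* "children"-like */
                    total += children;
                    break;
            }
            return total;
    }

    int main(void)
    {
            printf("%ld %ld %ld\n", usage_of(0, 10, 5),
                   usage_of(-1, 10, 5), usage_of(2, 10, 5));
            return 0;
    }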
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ab9d0e3c6d50..d21f4befaea4 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,10 +42,15 @@ COND_SYSCALL(io_destroy);
COND_SYSCALL(io_submit);
COND_SYSCALL_COMPAT(io_submit);
COND_SYSCALL(io_cancel);
+COND_SYSCALL(io_getevents_time32);
COND_SYSCALL(io_getevents);
+COND_SYSCALL(io_pgetevents_time32);
COND_SYSCALL(io_pgetevents);
-COND_SYSCALL_COMPAT(io_getevents);
+COND_SYSCALL_COMPAT(io_pgetevents_time32);
COND_SYSCALL_COMPAT(io_pgetevents);
+COND_SYSCALL(io_uring_setup);
+COND_SYSCALL(io_uring_enter);
+COND_SYSCALL(io_uring_register);
/* fs/xattr.c */
@@ -114,9 +119,9 @@ COND_SYSCALL_COMPAT(signalfd4);
/* fs/timerfd.c */
COND_SYSCALL(timerfd_create);
COND_SYSCALL(timerfd_settime);
-COND_SYSCALL_COMPAT(timerfd_settime);
+COND_SYSCALL(timerfd_settime32);
COND_SYSCALL(timerfd_gettime);
-COND_SYSCALL_COMPAT(timerfd_gettime);
+COND_SYSCALL(timerfd_gettime32);
/* fs/utimes.c */
@@ -135,7 +140,7 @@ COND_SYSCALL(capset);
/* kernel/futex.c */
COND_SYSCALL(futex);
-COND_SYSCALL_COMPAT(futex);
+COND_SYSCALL(futex_time32);
COND_SYSCALL(set_robust_list);
COND_SYSCALL_COMPAT(set_robust_list);
COND_SYSCALL(get_robust_list);
@@ -163,6 +168,7 @@ COND_SYSCALL(syslog);
/* kernel/sched/core.c */
/* kernel/signal.c */
+COND_SYSCALL(pidfd_send_signal);
/* kernel/sys.c */
COND_SYSCALL(setregid);
@@ -187,9 +193,9 @@ COND_SYSCALL(mq_open);
COND_SYSCALL_COMPAT(mq_open);
COND_SYSCALL(mq_unlink);
COND_SYSCALL(mq_timedsend);
-COND_SYSCALL_COMPAT(mq_timedsend);
+COND_SYSCALL(mq_timedsend_time32);
COND_SYSCALL(mq_timedreceive);
-COND_SYSCALL_COMPAT(mq_timedreceive);
+COND_SYSCALL(mq_timedreceive_time32);
COND_SYSCALL(mq_notify);
COND_SYSCALL_COMPAT(mq_notify);
COND_SYSCALL(mq_getsetattr);
@@ -197,8 +203,10 @@ COND_SYSCALL_COMPAT(mq_getsetattr);
/* ipc/msg.c */
COND_SYSCALL(msgget);
+COND_SYSCALL(old_msgctl);
COND_SYSCALL(msgctl);
COND_SYSCALL_COMPAT(msgctl);
+COND_SYSCALL_COMPAT(old_msgctl);
COND_SYSCALL(msgrcv);
COND_SYSCALL_COMPAT(msgrcv);
COND_SYSCALL(msgsnd);
@@ -206,16 +214,20 @@ COND_SYSCALL_COMPAT(msgsnd);
/* ipc/sem.c */
COND_SYSCALL(semget);
+COND_SYSCALL(old_semctl);
COND_SYSCALL(semctl);
COND_SYSCALL_COMPAT(semctl);
+COND_SYSCALL_COMPAT(old_semctl);
COND_SYSCALL(semtimedop);
-COND_SYSCALL_COMPAT(semtimedop);
+COND_SYSCALL(semtimedop_time32);
COND_SYSCALL(semop);
/* ipc/shm.c */
COND_SYSCALL(shmget);
+COND_SYSCALL(old_shmctl);
COND_SYSCALL(shmctl);
COND_SYSCALL_COMPAT(shmctl);
+COND_SYSCALL_COMPAT(old_shmctl);
COND_SYSCALL(shmat);
COND_SYSCALL_COMPAT(shmat);
COND_SYSCALL(shmdt);
@@ -285,7 +297,7 @@ COND_SYSCALL(perf_event_open);
COND_SYSCALL(accept4);
COND_SYSCALL(recvmmsg);
COND_SYSCALL(recvmmsg_time32);
-COND_SYSCALL_COMPAT(recvmmsg);
+COND_SYSCALL_COMPAT(recvmmsg_time32);
COND_SYSCALL_COMPAT(recvmmsg_time64);
/*
@@ -366,6 +378,7 @@ COND_SYSCALL(kexec_file_load);
/* s390 */
COND_SYSCALL(s390_pci_mmio_read);
COND_SYSCALL(s390_pci_mmio_write);
+COND_SYSCALL(s390_ipc);
COND_SYSCALL_COMPAT(s390_ipc);
/* powerpc */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ba4d9e85feb8..c9ec050bcf46 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -67,6 +67,8 @@
#include <linux/bpf.h>
#include <linux/mount.h>
+#include "../lib/kstrtox.h"
+
#include <linux/uaccess.h>
#include <asm/processor.h>
@@ -126,7 +128,9 @@ static int zero;
static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
+static unsigned long zero_ul;
static unsigned long one_ul = 1;
+static unsigned long long_max = LONG_MAX;
static int one_hundred = 100;
static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
@@ -224,6 +228,11 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
#endif
static int proc_dopipe_max_size(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_BPF_SYSCALL
+static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+#endif
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses its own private copy */
@@ -467,6 +476,17 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
},
#endif
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+ {
+ .procname = "sched_energy_aware",
+ .data = &sysctl_sched_energy_aware,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_energy_aware_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
@@ -1229,6 +1249,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
.extra2 = &one,
},
+ {
+ .procname = "bpf_stats_enabled",
+ .data = &sysctl_bpf_stats_enabled,
+ .maxlen = sizeof(sysctl_bpf_stats_enabled),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_bpf_stats,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
#endif
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
{
@@ -1446,7 +1475,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_extfrag_threshold,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = sysctl_extfrag_handler,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_extfrag_threshold,
.extra2 = &max_extfrag_threshold,
},
@@ -1722,6 +1751,8 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(files_stat.max_files),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
+ .extra1 = &zero_ul,
+ .extra2 = &long_max,
},
{
.procname = "nr_open",
@@ -2092,6 +2123,41 @@ static void proc_skip_char(char **buf, size_t *size, const char v)
}
}
+/**
+ * strtoul_lenient - parse an ASCII formatted integer from a buffer and only
+ * fail on overflow
+ *
+ * @cp: kernel buffer containing the string to parse
+ * @endp: pointer to store the trailing characters
+ * @base: the base to use
+ * @res: where the parsed integer will be stored
+ *
+ * In case of success 0 is returned and @res will contain the parsed integer,
+ * @endp will hold any trailing characters.
+ * This function will fail the parse on overflow. If there wasn't an overflow
+ * the function will defer the decision what characters count as invalid to the
+ * caller.
+ */
+static int strtoul_lenient(const char *cp, char **endp, unsigned int base,
+ unsigned long *res)
+{
+ unsigned long long result;
+ unsigned int rv;
+
+ cp = _parse_integer_fixup_radix(cp, &base);
+ rv = _parse_integer(cp, base, &result);
+ if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result))
+ return -ERANGE;
+
+ cp += rv;
+
+ if (endp)
+ *endp = (char *)cp;
+
+ *res = (unsigned long)result;
+ return 0;
+}
+
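
strtoul_lenient() keeps simple_strtoul()'s permissive treatment of trailing characters but refuses to wrap on overflow, the failure mode that previously let an oversized write to entries such as fs.file-max be silently truncated (hence also the new extra1/extra2 bounds above). A userspace analogue of the same contract, built on strtoull() plus an explicit range check:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* 0 on success; fails only on overflow of unsigned long */
    static int strtoul_lenient(const char *cp, char **endp, int base,
                               unsigned long *res)
    {
            unsigned long long v;

            errno = 0;
            v = strtoull(cp, endp, base);
            if (errno == ERANGE || v != (unsigned long)v)
                    return -ERANGE;

            *res = (unsigned long)v;
            return 0;
    }

    int main(void)
    {
            unsigned long val;
            char *end;

            /* trailing junk is left for the caller to judge, as in the kernel */
            printf("%d\n", strtoul_lenient("12345 extra", &end, 0, &val));

            /* a value that cannot fit in unsigned long is rejected outright */
            printf("%d\n", strtoul_lenient("999999999999999999999999", &end, 0, &val));
            return 0;
    }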
#define TMPBUFLEN 22
/**
* proc_get_long - reads an ASCII formatted integer from a user buffer
@@ -2135,7 +2201,8 @@ static int proc_get_long(char **buf, size_t *size,
if (!isdigit(*p))
return -EINVAL;
- *val = simple_strtoul(p, &p, 0);
+ if (strtoul_lenient(p, &p, 0, val))
+ return -EINVAL;
len = p - tmp;
@@ -2577,23 +2644,25 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
{
+ int tmp, ret;
struct do_proc_dointvec_minmax_conv_param *param = data;
+ /*
+ * If writing, first do so via a temporary local int so we can
+ * bounds-check it before touching *valp.
+ */
+ int *ip = write ? &tmp : valp;
+
+ ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data);
+ if (ret)
+ return ret;
+
if (write) {
- int val = *negp ? -*lvalp : *lvalp;
- if ((param->min && *param->min > val) ||
- (param->max && *param->max < val))
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
return -EINVAL;
- *valp = val;
- } else {
- int val = *valp;
- if (val < 0) {
- *negp = true;
- *lvalp = -(unsigned long)val;
- } else {
- *negp = false;
- *lvalp = (unsigned long)val;
- }
+ *valp = tmp;
}
+
return 0;
}
@@ -2642,22 +2711,22 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
unsigned int *valp,
int write, void *data)
{
+ int ret;
+ unsigned int tmp;
struct do_proc_douintvec_minmax_conv_param *param = data;
+ /* write via temporary local uint for bounds-checking */
+ unsigned int *up = write ? &tmp : valp;
- if (write) {
- unsigned int val = *lvalp;
+ ret = do_proc_douintvec_conv(lvalp, up, write, data);
+ if (ret)
+ return ret;
- if (*lvalp > UINT_MAX)
- return -EINVAL;
-
- if ((param->min && *param->min > val) ||
- (param->max && *param->max < val))
+ if (write) {
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
return -ERANGE;
- *valp = val;
- } else {
- unsigned int val = *valp;
- *lvalp = (unsigned long) val;
+ *valp = tmp;
}
return 0;
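
Both conversion helpers now parse into a local temporary and only store to *valp once the bounds check has passed, so a rejected write can no longer leave a partially updated value behind. The shape of the pattern, reduced to plain C with a hypothetical setter:

    #include <stdio.h>

    struct int_minmax { const int *min, *max; };

    /* convert into a temporary, bounds-check, then commit */
    static int set_checked(int *valp, int candidate, const struct int_minmax *p)
    {
            int tmp = candidate;    /* stand-in for do_proc_dointvec_conv() */

            if ((p->min && *p->min > tmp) || (p->max && *p->max < tmp))
                    return -1;      /* -EINVAL in the kernel */

            *valp = tmp;            /* *valp is untouched on failure */
            return 0;
    }

    int main(void)
    {
            static const int lo = 0, hi = 100;
            struct int_minmax range = { &lo, &hi };
            int value = 42;

            printf("%d value=%d\n", set_checked(&value, 150, &range), value);
            printf("%d value=%d\n", set_checked(&value, 7, &range), value);
            return 0;
    }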
@@ -3260,6 +3329,29 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
#endif /* CONFIG_PROC_SYSCTL */
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL)
+static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret, bpf_stats = *(int *)table->data;
+ struct ctl_table tmp = *table;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ tmp.data = &bpf_stats;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ *(int *)table->data = bpf_stats;
+ if (bpf_stats)
+ static_branch_enable(&bpf_stats_enabled_key);
+ else
+ static_branch_disable(&bpf_stats_enabled_key);
+ }
+ return ret;
+}
+#endif
/*
* No sense putting this after each symbol definition, twice,
* exception granted :-)
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 58b981f4bb5d..e2c038d6c13c 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -117,6 +117,35 @@ config NO_HZ_FULL
endchoice
+config CONTEXT_TRACKING
+ bool
+
+config CONTEXT_TRACKING_FORCE
+ bool "Force context tracking"
+ depends on CONTEXT_TRACKING
+ default y if !NO_HZ_FULL
+ help
+ The major pre-requirement for full dynticks to work is to
+ support the context tracking subsystem. But there are also
+ other dependencies to provide in order to make the full
+ dynticks working.
+
+ This option stands for testing when an arch implements the
+ context tracking backend but doesn't yet fulfill all the
+ requirements to make the full dynticks feature work.
+ Without the full dynticks, there is no way to test the support
+ for context tracking and the subsystems that rely on it: RCU
+ userspace extended quiescent state and tickless cputime
+ accounting. This option copes with the absence of the full
+ dynticks subsystem by forcing the context tracking on all
+ CPUs in the system.
+
+ Say Y only if you're working on the development of an
+ architecture backend for the context tracking.
+
+ Say N otherwise, this option brings an overhead that you
+ don't want in production.
+
config NO_HZ
bool "Old Idle dynticks config"
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2c97e8c2d29f..0519a8805aab 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -594,7 +594,7 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
- return ktime_sub(now, alarm->node.expires);
+ return ktime_sub(alarm->node.expires, now);
}
/**
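
The alarmtimer change is a plain argument-order fix: the remaining time is expires - now, and with the operands swapped a pending alarm reported a negative remainder. In bare arithmetic, with ktime_t modelled as a signed 64-bit nanosecond count:

    #include <stdio.h>
    #include <stdint.h>

    typedef int64_t ktime_t;        /* signed nanoseconds, as in the kernel */

    static ktime_t ktime_sub(ktime_t lhs, ktime_t rhs) { return lhs - rhs; }

    int main(void)
    {
            ktime_t now     = 1000000000;   /* 1.0s */
            ktime_t expires = 2500000000;   /* 2.5s */

            printf("old: %lld ns\n", (long long)ktime_sub(now, expires)); /* -1.5s */
            printf("new: %lld ns\n", (long long)ktime_sub(expires, now)); /* +1.5s */
            return 0;
    }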
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index f5cfa1b73d6f..41dfff23c1f9 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -364,7 +364,7 @@ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
-
+ /* fall through */
default:
return false;
}
@@ -1771,7 +1771,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(nanosleep, struct old_timespec32 __user *, rqtp,
+SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
struct old_timespec32 __user *, rmtp)
{
struct timespec64 tu;
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index dc1b6f1929f9..ac9c03dd6c7d 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -89,7 +89,7 @@ struct clocksource * __init __weak clocksource_default_clock(void)
return &clocksource_jiffies;
}
-struct clocksource refined_jiffies;
+static struct clocksource refined_jiffies;
int register_refined_jiffies(long cycles_per_second)
{
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 36a2bef00125..92a90014a925 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -188,13 +188,13 @@ static inline int is_error_status(int status)
&& (status & (STA_PPSWANDER|STA_PPSERROR)));
}
-static inline void pps_fill_timex(struct timex *txc)
+static inline void pps_fill_timex(struct __kernel_timex *txc)
{
txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
PPM_SCALE_INV, NTP_SCALE_SHIFT);
txc->jitter = pps_jitter;
if (!(time_status & STA_NANO))
- txc->jitter /= NSEC_PER_USEC;
+ txc->jitter = pps_jitter / NSEC_PER_USEC;
txc->shift = pps_shift;
txc->stabil = pps_stabil;
txc->jitcnt = pps_jitcnt;
@@ -220,7 +220,7 @@ static inline int is_error_status(int status)
return status & (STA_UNSYNC|STA_CLOCKERR);
}
-static inline void pps_fill_timex(struct timex *txc)
+static inline void pps_fill_timex(struct __kernel_timex *txc)
{
/* PPS is not implemented, so these are zero */
txc->ppsfreq = 0;
@@ -633,7 +633,7 @@ void ntp_notify_cmos_timer(void)
/*
* Propagate a new txc->status value into the NTP state:
*/
-static inline void process_adj_status(const struct timex *txc)
+static inline void process_adj_status(const struct __kernel_timex *txc)
{
if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
time_state = TIME_OK;
@@ -656,7 +656,8 @@ static inline void process_adj_status(const struct timex *txc)
}
-static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai)
+static inline void process_adjtimex_modes(const struct __kernel_timex *txc,
+ s32 *time_tai)
{
if (txc->modes & ADJ_STATUS)
process_adj_status(txc);
@@ -707,7 +708,8 @@ static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai
* adjtimex mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
*/
-int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai)
+int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts,
+ s32 *time_tai)
{
int result;
@@ -729,7 +731,7 @@ int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai)
txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
NTP_SCALE_SHIFT);
if (!(time_status & STA_NANO))
- txc->offset /= NSEC_PER_USEC;
+ txc->offset = (u32)txc->offset / NSEC_PER_USEC;
}
result = time_state; /* mostly `TIME_OK' */
@@ -754,7 +756,7 @@ int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai)
txc->time.tv_sec = (time_t)ts->tv_sec;
txc->time.tv_usec = ts->tv_nsec;
if (!(time_status & STA_NANO))
- txc->time.tv_usec /= NSEC_PER_USEC;
+ txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC;
/* Handle leapsec adjustments */
if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) {
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index c24b0e13f011..40e6122e634e 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -8,6 +8,6 @@ extern void ntp_clear(void);
extern u64 ntp_tick_length(void);
extern ktime_t ntp_get_next_leap(void);
extern int second_overflow(time64_t secs);
-extern int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai);
+extern int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai);
extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts);
#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 425bbfce6819..ec960bb939fd 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -228,7 +228,7 @@ static void put_clock_desc(struct posix_clock_desc *cd)
fput(cd->fp);
}
-static int pc_clock_adjtime(clockid_t id, struct timex *tx)
+static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx)
{
struct posix_clock_desc cd;
int err;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 80f955210861..0a426f4e3125 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -67,13 +67,13 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now)
int i;
u64 delta, incr;
- if (timer->it.cpu.incr == 0)
+ if (!timer->it_interval)
return;
if (now < timer->it.cpu.expires)
return;
- incr = timer->it.cpu.incr;
+ incr = timer->it_interval;
delta = now + incr - timer->it.cpu.expires;
/* Don't use (incr*2 < delta), incr*2 might overflow. */
@@ -520,7 +520,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
*/
wake_up_process(timer->it_process);
timer->it.cpu.expires = 0;
- } else if (timer->it.cpu.incr == 0) {
+ } else if (!timer->it_interval) {
/*
* One-shot timer. Clear it as soon as it's fired.
*/
@@ -606,7 +606,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
*/
ret = 0;
- old_incr = timer->it.cpu.incr;
+ old_incr = timer->it_interval;
old_expires = timer->it.cpu.expires;
if (unlikely(timer->it.cpu.firing)) {
timer->it.cpu.firing = -1;
@@ -684,8 +684,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
* Install the new reload setting, and
* set up the signal and overrun bookkeeping.
*/
- timer->it.cpu.incr = timespec64_to_ns(&new->it_interval);
- timer->it_interval = ns_to_ktime(timer->it.cpu.incr);
+ timer->it_interval = timespec64_to_ktime(new->it_interval);
/*
* This acts as a modification timestamp for the timer,
@@ -724,7 +723,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
/*
* Easy part: convert the reload time.
*/
- itp->it_interval = ns_to_timespec64(timer->it.cpu.incr);
+ itp->it_interval = ktime_to_timespec64(timer->it_interval);
if (!timer->it.cpu.expires)
return;
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index a51895486e5e..67df65f887ac 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -45,6 +45,7 @@ SYS_NI(timer_delete);
SYS_NI(clock_adjtime);
SYS_NI(getitimer);
SYS_NI(setitimer);
+SYS_NI(clock_adjtime32);
#ifdef __ARCH_WANT_SYS_ALARM
SYS_NI(alarm);
#endif
@@ -150,16 +151,16 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
#ifdef CONFIG_COMPAT
COMPAT_SYS_NI(timer_create);
-COMPAT_SYS_NI(clock_adjtime);
-COMPAT_SYS_NI(timer_settime);
-COMPAT_SYS_NI(timer_gettime);
COMPAT_SYS_NI(getitimer);
COMPAT_SYS_NI(setitimer);
#endif
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYS_NI(timer_settime32);
+SYS_NI(timer_gettime32);
+
+SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
struct timespec64 new_tp;
@@ -171,8 +172,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
return do_sys_settimeofday64(&new_tp, NULL);
}
-COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
int ret;
struct timespec64 kernel_tp;
@@ -186,8 +187,8 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
return 0;
}
-COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
struct timespec64 rtn_tp = {
.tv_sec = 0,
@@ -206,9 +207,9 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
}
}
-COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
- struct old_timespec32 __user *, rqtp,
- struct old_timespec32 __user *, rmtp)
+SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
+ struct old_timespec32 __user *, rqtp,
+ struct old_timespec32 __user *, rmtp)
{
struct timespec64 t;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0e84bb72a3da..29176635991f 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -179,7 +179,7 @@ static int posix_clock_realtime_set(const clockid_t which_clock,
}
static int posix_clock_realtime_adj(const clockid_t which_clock,
- struct timex *t)
+ struct __kernel_timex *t)
{
return do_adjtimex(t);
}
@@ -730,8 +730,8 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
- struct old_itimerspec32 __user *, setting)
+SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id,
+ struct old_itimerspec32 __user *, setting)
{
struct itimerspec64 cur_setting;
@@ -903,9 +903,9 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
}
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
- struct old_itimerspec32 __user *, new,
- struct old_itimerspec32 __user *, old)
+SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags,
+ struct old_itimerspec32 __user *, new,
+ struct old_itimerspec32 __user *, old)
{
struct itimerspec64 new_spec, old_spec;
struct itimerspec64 *rtn = old ? &old_spec : NULL;
@@ -1047,22 +1047,28 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
return error;
}
-SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
- struct timex __user *, utx)
+int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * ktx)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timex ktx;
- int err;
if (!kc)
return -EINVAL;
if (!kc->clock_adj)
return -EOPNOTSUPP;
+ return kc->clock_adj(which_clock, ktx);
+}
+
+SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
+ struct __kernel_timex __user *, utx)
+{
+ struct __kernel_timex ktx;
+ int err;
+
if (copy_from_user(&ktx, utx, sizeof(ktx)))
return -EFAULT;
- err = kc->clock_adj(which_clock, &ktx);
+ err = do_clock_adjtime(which_clock, &ktx);
if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
return -EFAULT;
@@ -1090,8 +1096,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 ts;
@@ -1105,8 +1111,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
return kc->clock_set(which_clock, &ts);
}
-COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 ts;
@@ -1123,40 +1129,26 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
return err;
}
-#endif
-
-#ifdef CONFIG_COMPAT
-
-COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
- struct compat_timex __user *, utp)
+SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock,
+ struct old_timex32 __user *, utp)
{
- const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timex ktx;
+ struct __kernel_timex ktx;
int err;
- if (!kc)
- return -EINVAL;
- if (!kc->clock_adj)
- return -EOPNOTSUPP;
-
- err = compat_get_timex(&ktx, utp);
+ err = get_old_timex32(&ktx, utp);
if (err)
return err;
- err = kc->clock_adj(which_clock, &ktx);
+ err = do_clock_adjtime(which_clock, &ktx);
if (err >= 0)
- err = compat_put_timex(utp, &ktx);
+ err = put_old_timex32(utp, &ktx);
return err;
}
-#endif
-
-#ifdef CONFIG_COMPAT_32BIT_TIME
-
-COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 ts;
@@ -1212,9 +1204,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
- struct old_timespec32 __user *, rqtp,
- struct old_timespec32 __user *, rmtp)
+SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
+ struct old_timespec32 __user *, rqtp,
+ struct old_timespec32 __user *, rmtp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 t;
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index ddb21145211a..de5daa6d975a 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -8,7 +8,7 @@ struct k_clock {
const struct timespec64 *tp);
int (*clock_get)(const clockid_t which_clock,
struct timespec64 *tp);
- int (*clock_adj)(const clockid_t which_clock, struct timex *tx);
+ int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx);
int (*timer_create)(struct k_itimer *timer);
int (*nsleep)(const clockid_t which_clock, int flags,
const struct timespec64 *);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 803fa67aace9..ee834d4fb814 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -375,6 +375,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
switch (mode) {
case TICK_BROADCAST_FORCE:
tick_broadcast_forced = 1;
+ /* fall through */
case TICK_BROADCAST_ON:
cpumask_set_cpu(cpu, tick_broadcast_on);
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
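
The new /* fall through */ comment here (and the similar ones in kernel/time/timer.c and kernel/trace/blktrace.c further down) marks deliberate fall-through so -Wimplicit-fallthrough stays quiet. A small standalone sketch of the convention, with illustrative names:

#include <stdio.h>

static void report(int mode)
{
	switch (mode) {
	case 2:
		printf("forced\n");
		/* fall through */	/* mode 2 deliberately also does mode 1's work */
	case 1:
		printf("enabled\n");
		break;
	default:
		printf("off\n");
		break;
	}
}

int main(void)
{
	report(2);		/* prints "forced" then "enabled" */
	return 0;
}
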
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 2edb5088a70b..c3f756f8534b 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -98,11 +98,11 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr)
#endif /* __ARCH_WANT_SYS_TIME */
-#ifdef CONFIG_COMPAT
-#ifdef __ARCH_WANT_COMPAT_SYS_TIME
+#ifdef CONFIG_COMPAT_32BIT_TIME
+#ifdef __ARCH_WANT_SYS_TIME32
/* old_time32_t is a 32 bit "long" and needs to get converted. */
-COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc)
+SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc)
{
old_time32_t i;
@@ -116,7 +116,7 @@ COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc)
return i;
}
-COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr)
+SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr)
{
struct timespec64 tv;
int err;
@@ -134,7 +134,7 @@ COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr)
return 0;
}
-#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
+#endif /* __ARCH_WANT_SYS_TIME32 */
#endif
SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
@@ -263,35 +263,99 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
}
#endif
-SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
+#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p)
{
- struct timex txc; /* Local copy of parameter */
+ struct __kernel_timex txc; /* Local copy of parameter */
int ret;
/* Copy the user data space into the kernel copy
* structure. But bear in mind that the structures
* may change
*/
- if (copy_from_user(&txc, txc_p, sizeof(struct timex)))
+ if (copy_from_user(&txc, txc_p, sizeof(struct __kernel_timex)))
return -EFAULT;
ret = do_adjtimex(&txc);
- return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
+ return copy_to_user(txc_p, &txc, sizeof(struct __kernel_timex)) ? -EFAULT : ret;
}
+#endif
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
+int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp)
+{
+ struct old_timex32 tx32;
+
+ memset(txc, 0, sizeof(struct __kernel_timex));
+ if (copy_from_user(&tx32, utp, sizeof(struct old_timex32)))
+ return -EFAULT;
+
+ txc->modes = tx32.modes;
+ txc->offset = tx32.offset;
+ txc->freq = tx32.freq;
+ txc->maxerror = tx32.maxerror;
+ txc->esterror = tx32.esterror;
+ txc->status = tx32.status;
+ txc->constant = tx32.constant;
+ txc->precision = tx32.precision;
+ txc->tolerance = tx32.tolerance;
+ txc->time.tv_sec = tx32.time.tv_sec;
+ txc->time.tv_usec = tx32.time.tv_usec;
+ txc->tick = tx32.tick;
+ txc->ppsfreq = tx32.ppsfreq;
+ txc->jitter = tx32.jitter;
+ txc->shift = tx32.shift;
+ txc->stabil = tx32.stabil;
+ txc->jitcnt = tx32.jitcnt;
+ txc->calcnt = tx32.calcnt;
+ txc->errcnt = tx32.errcnt;
+ txc->stbcnt = tx32.stbcnt;
+
+ return 0;
+}
+
+int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc)
+{
+ struct old_timex32 tx32;
+
+ memset(&tx32, 0, sizeof(struct old_timex32));
+ tx32.modes = txc->modes;
+ tx32.offset = txc->offset;
+ tx32.freq = txc->freq;
+ tx32.maxerror = txc->maxerror;
+ tx32.esterror = txc->esterror;
+ tx32.status = txc->status;
+ tx32.constant = txc->constant;
+ tx32.precision = txc->precision;
+ tx32.tolerance = txc->tolerance;
+ tx32.time.tv_sec = txc->time.tv_sec;
+ tx32.time.tv_usec = txc->time.tv_usec;
+ tx32.tick = txc->tick;
+ tx32.ppsfreq = txc->ppsfreq;
+ tx32.jitter = txc->jitter;
+ tx32.shift = txc->shift;
+ tx32.stabil = txc->stabil;
+ tx32.jitcnt = txc->jitcnt;
+ tx32.calcnt = txc->calcnt;
+ tx32.errcnt = txc->errcnt;
+ tx32.stbcnt = txc->stbcnt;
+ tx32.tai = txc->tai;
+ if (copy_to_user(utp, &tx32, sizeof(struct old_timex32)))
+ return -EFAULT;
+ return 0;
+}
-COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
+SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
{
- struct timex txc;
+ struct __kernel_timex txc;
int err, ret;
- err = compat_get_timex(&txc, utp);
+ err = get_old_timex32(&txc, utp);
if (err)
return err;
ret = do_adjtimex(&txc);
- err = compat_put_timex(utp, &txc);
+ err = put_old_timex32(utp, &txc);
if (err)
return err;
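
For context on why struct timex becomes struct __kernel_timex with the get_old_timex32()/put_old_timex32() converters: this is the syscall behind adjtimex(3)/clock_adjtime(2), and 32-bit userspace with the old layout must keep working. A userspace sketch that merely reads the current state (assumes the glibc wrapper in <sys/timex.h>; error handling kept minimal):

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = 0 };	/* modes == 0: query only, change nothing */
	int state = adjtimex(&tx);

	if (state == -1) {
		perror("adjtimex");
		return 1;
	}
	printf("clock state %d, freq offset %ld (scaled ppm), maxerror %ld us\n",
	       state, tx.freq, tx.maxerror);
	return 0;
}
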
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ac5dbf2cd4a2..f986e1918d12 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2234,7 +2234,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
/**
* timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
*/
-static int timekeeping_validate_timex(const struct timex *txc)
+static int timekeeping_validate_timex(const struct __kernel_timex *txc)
{
if (txc->modes & ADJ_ADJTIME) {
/* singleshot must not be used with any other mode bits */
@@ -2300,7 +2300,7 @@ static int timekeeping_validate_timex(const struct timex *txc)
/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
*/
-int do_adjtimex(struct timex *txc)
+int do_adjtimex(struct __kernel_timex *txc)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 86489950d690..b73e8850e58d 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -37,15 +37,8 @@ DEFINE_SHOW_ATTRIBUTE(tk_debug_sleep_time);
static int __init tk_debug_sleep_time_init(void)
{
- struct dentry *d;
-
- d = debugfs_create_file("sleep_time", 0444, NULL, NULL,
- &tk_debug_sleep_time_fops);
- if (!d) {
- pr_err("Failed to create sleep_time debug file\n");
- return -ENOMEM;
- }
-
+ debugfs_create_file("sleep_time", 0444, NULL, NULL,
+ &tk_debug_sleep_time_fops);
return 0;
}
late_initcall(tk_debug_sleep_time_init);
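
The timekeeping_debug.c change follows the debugfs rule that callers should not check debugfs_create_file()'s return value: debugfs warns on failure itself, and the caller has to work without the file anyway. A sketch of an init function in that style (example_stats_fops and the file name are assumptions, not kernel symbols):

#include <linux/debugfs.h>
#include <linux/init.h>

static const struct file_operations example_stats_fops;	/* assumed to be filled in elsewhere */

static int __init example_debug_init(void)
{
	/* Return value deliberately ignored, per debugfs convention. */
	debugfs_create_file("example_stats", 0444, NULL, NULL, &example_stats_fops);
	return 0;
}
late_initcall(example_debug_init);
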
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 444156debfa0..2fce056f8a49 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -647,7 +647,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
-
+ /* fall through */
default:
return false;
}
@@ -1632,7 +1632,7 @@ void update_process_times(int user_tick)
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
- rcu_check_callbacks(user_tick);
+ rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
if (in_irq())
irq_work_tick();
diff --git a/kernel/torture.c b/kernel/torture.c
index bbf6d473e50c..8faa1a9aaeb9 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Common functions for in-kernel torture tests.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2014
*
- * Author: Paul E. McKenney <paulmck@us.ibm.com>
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
* Based on kernel/rcu/torture.c.
*/
@@ -53,7 +40,7 @@
#include "rcu/rcu.h"
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
static char *torture_type;
static int verbose;
@@ -75,6 +62,7 @@ static DEFINE_MUTEX(fullstop_mutex);
static struct task_struct *onoff_task;
static long onoff_holdoff;
static long onoff_interval;
+static torture_ofl_func *onoff_f;
static long n_offline_attempts;
static long n_offline_successes;
static unsigned long sum_offline;
@@ -118,6 +106,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
pr_alert("%s" TORTURE_FLAG
"torture_onoff task: offlined %d\n",
torture_type, cpu);
+ if (onoff_f)
+ onoff_f();
(*n_offl_successes)++;
delta = jiffies - starttime;
*sum_offl += delta;
@@ -243,11 +233,12 @@ stop:
/*
* Initiate online-offline handling.
*/
-int torture_onoff_init(long ooholdoff, long oointerval)
+int torture_onoff_init(long ooholdoff, long oointerval, torture_ofl_func *f)
{
#ifdef CONFIG_HOTPLUG_CPU
onoff_holdoff = ooholdoff;
onoff_interval = oointerval;
+ onoff_f = f;
if (onoff_interval <= 0)
return 0;
return torture_create_kthread(torture_onoff, NULL, onoff_task);
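
torture_onoff_init() gains a third argument so a torture test can run a hook after each successful CPU offline. A sketch of a caller, assuming the usual <linux/torture.h> declarations (the hook body and the holdoff/interval values are purely illustrative):

#include <linux/torture.h>

static void my_post_offline(void)	/* matches torture_ofl_func: void (void) */
{
	/* e.g. verify or flush per-CPU state left behind by the offlined CPU */
}

static int __init my_torture_init(void)
{
	/* holdoff and interval are consumed as jiffies inside kernel/torture.c */
	return torture_onoff_init(30 * HZ, 5 * HZ, my_post_offline);
}
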
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fa8b1fe824f3..8bd1d6d001d7 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -370,6 +370,7 @@ config PROFILE_ANNOTATED_BRANCHES
config PROFILE_ALL_BRANCHES
bool "Profile all if conditionals" if !FORTIFY_SOURCE
select TRACE_BRANCH_PROFILING
+ imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives
help
This tracer profiles all branch conditions. Every if ()
taken in the kernel is recorded whether it hit or miss.
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fac0ddf8a8e2..e1c6d79fb4cc 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -723,6 +723,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
#endif
case BLKTRACESTART:
start = 1;
+ /* fall through */
case BLKTRACESTOP:
ret = __blk_trace_startstop(q, start);
break;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f1a86a0d881d..d64c00afceb5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -431,8 +431,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
- perf_event_output(event, sd, regs);
- return 0;
+ return perf_event_output(event, sd, regs);
}
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index aac7847c0214..26c8ca9bd06b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1992,7 +1992,7 @@ static void print_bug_type(void)
* modifying the code. @failed should be one of either:
* EFAULT - if the problem happens on reading the @ip address
* EINVAL - if what is read at @ip is not what was expected
- * EPERM - if the problem happens on writting to the @ip address
+ * EPERM - if the problem happens on writing to the @ip address
*/
void ftrace_bug(int failed, struct dyn_ftrace *rec)
{
@@ -2391,7 +2391,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
}
- return -1; /* unknow ftrace bug */
+ return -1; /* unknown ftrace bug */
}
void __weak ftrace_replace_code(int mod_flags)
@@ -3004,7 +3004,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
int cnt;
if (!num_to_init)
- return 0;
+ return NULL;
start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
if (!pg)
@@ -3702,6 +3702,31 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter)
}
static int
+add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g,
+ int clear_filter)
+{
+ long index = simple_strtoul(func_g->search, NULL, 0);
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+
+ /* The index starts at 1 */
+ if (--index < 0)
+ return 0;
+
+ do_for_each_ftrace_rec(pg, rec) {
+ if (pg->index <= index) {
+ index -= pg->index;
+ /* this is a double loop, break goes to the next page */
+ break;
+ }
+ rec = &pg->records[index];
+ enter_record(hash, rec, clear_filter);
+ return 1;
+ } while_for_each_ftrace_rec();
+ return 0;
+}
+
+static int
ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g,
struct ftrace_glob *mod_g, int exclude_mod)
{
@@ -3769,6 +3794,11 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
if (unlikely(ftrace_disabled))
goto out_unlock;
+ if (func_g.type == MATCH_INDEX) {
+ found = add_rec_by_index(hash, &func_g, clear_filter);
+ goto out_unlock;
+ }
+
do_for_each_ftrace_rec(pg, rec) {
if (rec->flags & FTRACE_FL_DISABLED)
@@ -4725,7 +4755,7 @@ static int
ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
int reset, int enable)
{
- return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable);
+ return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable);
}
/**
@@ -5433,7 +5463,7 @@ void ftrace_create_filter_files(struct ftrace_ops *ops,
/*
* The name "destroy_filter_files" is really a misnomer. Although
- * in the future, it may actualy delete the files, but this is
+ * in the future, it may actually delete the files, but this is
* really intended to make sure the ops passed in are disabled
* and that when this function returns, the caller is free to
* free the ops.
@@ -5756,7 +5786,7 @@ void ftrace_module_enable(struct module *mod)
/*
* If the tracing is enabled, go ahead and enable the record.
*
- * The reason not to enable the record immediatelly is the
+ * The reason not to enable the record immediately is the
* inherent check of ftrace_make_nop/ftrace_make_call for
* correct previous instructions. Making first the NOP
* conversion puts the module to the correct state, thus
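
A note on add_rec_by_index() above: ftrace records live in a chain of pages, and inside do_for_each_ftrace_rec() a break only ends the inner (per-page) loop, which is why the code breaks to skip a whole page and returns once it lands on the wanted slot. A standalone sketch of the same indexing logic over an array of pages (illustrative types, not kernel code):

#include <stdio.h>

struct rec_page { int count; int records[4]; };

static int find_by_index(struct rec_page *pages, int npages, long index)
{
	int p;

	for (p = 0; p < npages; p++) {
		if (pages[p].count <= index) {
			index -= pages[p].count;	/* skip this whole page, like the kernel's break */
			continue;
		}
		return pages[p].records[index];		/* 0-based slot within this page */
	}
	return -1;
}

int main(void)
{
	struct rec_page pages[2] = {
		{ .count = 4, .records = { 10, 11, 12, 13 } },
		{ .count = 2, .records = { 20, 21 } },
	};

	printf("%d\n", find_by_index(pages, 2, 5));	/* prints 21 */
	return 0;
}
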
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 06e864a334bb..41b6f96e5366 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -353,20 +353,6 @@ static void rb_init_page(struct buffer_data_page *bpage)
local_set(&bpage->commit, 0);
}
-/**
- * ring_buffer_page_len - the size of data on the page.
- * @page: The page to read
- *
- * Returns the amount of data on the page, including buffer page header.
- */
-size_t ring_buffer_page_len(void *page)
-{
- struct buffer_data_page *bpage = page;
-
- return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
- + BUF_PAGE_HDR_SIZE;
-}
-
/*
* Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
* this issue out.
@@ -4205,6 +4191,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
* ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
* @buffer: The ring buffer to read from
* @cpu: The cpu buffer to iterate over
+ * @flags: gfp flags to use for memory allocation
*
* This performs the initial preparations necessary to iterate
* through the buffer. Memory is allocated, buffer recording
@@ -4222,7 +4209,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
* This overall must be paired with ring_buffer_read_finish.
*/
struct ring_buffer_iter *
-ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
+ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_iter *iter;
@@ -4230,7 +4217,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return NULL;
- iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ iter = kmalloc(sizeof(*iter), flags);
if (!iter)
return NULL;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0f300d488c9f..6c24755655c7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -894,7 +894,7 @@ int __trace_bputs(unsigned long ip, const char *str)
EXPORT_SYMBOL_GPL(__trace_bputs);
#ifdef CONFIG_TRACER_SNAPSHOT
-void tracing_snapshot_instance(struct trace_array *tr)
+void tracing_snapshot_instance_cond(struct trace_array *tr, void *cond_data)
{
struct tracer *tracer = tr->current_trace;
unsigned long flags;
@@ -920,10 +920,15 @@ void tracing_snapshot_instance(struct trace_array *tr)
}
local_irq_save(flags);
- update_max_tr(tr, current, smp_processor_id());
+ update_max_tr(tr, current, smp_processor_id(), cond_data);
local_irq_restore(flags);
}
+void tracing_snapshot_instance(struct trace_array *tr)
+{
+ tracing_snapshot_instance_cond(tr, NULL);
+}
+
/**
* tracing_snapshot - take a snapshot of the current buffer.
*
@@ -946,6 +951,54 @@ void tracing_snapshot(void)
}
EXPORT_SYMBOL_GPL(tracing_snapshot);
+/**
+ * tracing_snapshot_cond - conditionally take a snapshot of the current buffer.
+ * @tr: The tracing instance to snapshot
+ * @cond_data: The data to be tested conditionally, and possibly saved
+ *
+ * This is the same as tracing_snapshot() except that the snapshot is
+ * conditional - the snapshot will only happen if the
+ * cond_snapshot.update() implementation receiving the cond_data
+ * returns true, which means that the trace array's cond_snapshot
+ * update() operation used the cond_data to determine whether the
+ * snapshot should be taken, and if it was, presumably saved it along
+ * with the snapshot.
+ */
+void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
+{
+ tracing_snapshot_instance_cond(tr, cond_data);
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
+
+/**
+ * tracing_snapshot_cond_data - get the user data associated with a snapshot
+ * @tr: The tracing instance
+ *
+ * When the user enables a conditional snapshot using
+ * tracing_snapshot_cond_enable(), the user-defined cond_data is saved
+ * with the snapshot. This accessor is used to retrieve it.
+ *
+ * Should not be called from cond_snapshot.update(), since it takes
+ * the tr->max_lock lock, which the code calling
+ * cond_snapshot.update() has already done.
+ *
+ * Returns the cond_data associated with the trace array's snapshot.
+ */
+void *tracing_cond_snapshot_data(struct trace_array *tr)
+{
+ void *cond_data = NULL;
+
+ arch_spin_lock(&tr->max_lock);
+
+ if (tr->cond_snapshot)
+ cond_data = tr->cond_snapshot->cond_data;
+
+ arch_spin_unlock(&tr->max_lock);
+
+ return cond_data;
+}
+EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
+
static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
struct trace_buffer *size_buf, int cpu_id);
static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
@@ -1025,12 +1078,111 @@ void tracing_snapshot_alloc(void)
tracing_snapshot();
}
EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
+
+/**
+ * tracing_snapshot_cond_enable - enable conditional snapshot for an instance
+ * @tr: The tracing instance
+ * @cond_data: User data to associate with the snapshot
+ * @update: Implementation of the cond_snapshot update function
+ *
+ * Check whether the conditional snapshot for the given instance has
+ * already been enabled, or if the current tracer is already using a
+ * snapshot; if so, return -EBUSY, else create a cond_snapshot and
+ * save the cond_data and update function inside.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
+ cond_update_fn_t update)
+{
+ struct cond_snapshot *cond_snapshot;
+ int ret = 0;
+
+ cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL);
+ if (!cond_snapshot)
+ return -ENOMEM;
+
+ cond_snapshot->cond_data = cond_data;
+ cond_snapshot->update = update;
+
+ mutex_lock(&trace_types_lock);
+
+ ret = tracing_alloc_snapshot_instance(tr);
+ if (ret)
+ goto fail_unlock;
+
+ if (tr->current_trace->use_max_tr) {
+ ret = -EBUSY;
+ goto fail_unlock;
+ }
+
+ /*
+ * The cond_snapshot can only change to NULL without the
+ * trace_types_lock. We don't care if we race with it going
+ * to NULL, but we want to make sure that it's not set to
+ * something other than NULL when we get here, which we can
+ * do safely with only holding the trace_types_lock and not
+ * having to take the max_lock.
+ */
+ if (tr->cond_snapshot) {
+ ret = -EBUSY;
+ goto fail_unlock;
+ }
+
+ arch_spin_lock(&tr->max_lock);
+ tr->cond_snapshot = cond_snapshot;
+ arch_spin_unlock(&tr->max_lock);
+
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+
+ fail_unlock:
+ mutex_unlock(&trace_types_lock);
+ kfree(cond_snapshot);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
+
+/**
+ * tracing_snapshot_cond_disable - disable conditional snapshot for an instance
+ * @tr: The tracing instance
+ *
+ * Check whether the conditional snapshot for the given instance is
+ * enabled; if so, free the cond_snapshot associated with it,
+ * otherwise return -EINVAL.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int tracing_snapshot_cond_disable(struct trace_array *tr)
+{
+ int ret = 0;
+
+ arch_spin_lock(&tr->max_lock);
+
+ if (!tr->cond_snapshot)
+ ret = -EINVAL;
+ else {
+ kfree(tr->cond_snapshot);
+ tr->cond_snapshot = NULL;
+ }
+
+ arch_spin_unlock(&tr->max_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
#else
void tracing_snapshot(void)
{
WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
}
EXPORT_SYMBOL_GPL(tracing_snapshot);
+void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
+{
+ WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used");
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
int tracing_alloc_snapshot(void)
{
WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used");
@@ -1043,6 +1195,21 @@ void tracing_snapshot_alloc(void)
tracing_snapshot();
}
EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
+void *tracing_cond_snapshot_data(struct trace_array *tr)
+{
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
+int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update)
+{
+ return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
+int tracing_snapshot_cond_disable(struct trace_array *tr)
+{
+ return false;
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
#endif /* CONFIG_TRACER_SNAPSHOT */
void tracer_tracing_off(struct trace_array *tr)
@@ -1330,7 +1497,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
max_data->critical_start = data->critical_start;
max_data->critical_end = data->critical_end;
- memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
+ strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
max_data->pid = tsk->pid;
/*
* If tsk == current, then use current_uid(), as that does not use
@@ -1354,12 +1521,14 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
* @tr: tracer
* @tsk: the task with the latency
* @cpu: The cpu that initiated the trace.
+ * @cond_data: User data associated with a conditional snapshot
*
* Flip the buffers between the @tr and the max_tr and record information
* about which task was the cause of this latency.
*/
void
-update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
+ void *cond_data)
{
if (tr->stop_count)
return;
@@ -1380,9 +1549,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
else
ring_buffer_record_off(tr->max_buffer.buffer);
+#ifdef CONFIG_TRACER_SNAPSHOT
+ if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data))
+ goto out_unlock;
+#endif
swap(tr->trace_buffer.buffer, tr->max_buffer.buffer);
__update_max_tr(tr, tsk, cpu);
+
+ out_unlock:
arch_spin_unlock(&tr->max_lock);
}
@@ -1748,7 +1923,7 @@ static inline char *get_saved_cmdlines(int idx)
static inline void set_cmdline(int idx, const char *cmdline)
{
- memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
+ strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
}
static int allocate_cmdlines_buffer(unsigned int val,
@@ -3904,7 +4079,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
+ ring_buffer_read_prepare(iter->trace_buffer->buffer,
+ cpu, GFP_KERNEL);
}
ring_buffer_read_prepare_sync();
for_each_tracing_cpu(cpu) {
@@ -3914,7 +4090,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
} else {
cpu = iter->cpu_file;
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
+ ring_buffer_read_prepare(iter->trace_buffer->buffer,
+ cpu, GFP_KERNEL);
ring_buffer_read_prepare_sync();
ring_buffer_read_start(iter->buffer_iter[cpu]);
tracing_iter_reset(iter, cpu);
@@ -4702,6 +4879,7 @@ static const char readme_msg[] =
"\t [:size=#entries]\n"
"\t [:pause][:continue][:clear]\n"
"\t [:name=histname1]\n"
+ "\t [:<handler>.<action>]\n"
"\t [if <filter>]\n\n"
"\t When a matching event is hit, an entry is added to a hash\n"
"\t table using the key(s) and value(s) named, and the value of a\n"
@@ -4742,8 +4920,21 @@ static const char readme_msg[] =
"\t unchanged.\n\n"
"\t The enable_hist and disable_hist triggers can be used to\n"
"\t have one event conditionally start and stop another event's\n"
- "\t already-attached hist trigger. The syntax is analagous to\n"
- "\t the enable_event and disable_event triggers.\n"
+ "\t already-attached hist trigger. The syntax is analogous to\n"
+ "\t the enable_event and disable_event triggers.\n\n"
+ "\t Hist trigger handlers and actions are executed whenever a\n"
+ "\t a histogram entry is added or updated. They take the form:\n\n"
+ "\t <handler>.<action>\n\n"
+ "\t The available handlers are:\n\n"
+ "\t onmatch(matching.event) - invoke on addition or update\n"
+ "\t onmax(var) - invoke if var exceeds current max\n"
+ "\t onchange(var) - invoke action if var changes\n\n"
+ "\t The available actions are:\n\n"
+ "\t trace(<synthetic_event>,param list) - generate synthetic event\n"
+ "\t save(field,...) - save current event fields\n"
+#ifdef CONFIG_TRACER_SNAPSHOT
+ "\t snapshot() - snapshot the trace buffer\n"
+#endif
#endif
;
@@ -5388,6 +5579,16 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (t == tr->current_trace)
goto out;
+#ifdef CONFIG_TRACER_SNAPSHOT
+ if (t->use_max_tr) {
+ arch_spin_lock(&tr->max_lock);
+ if (tr->cond_snapshot)
+ ret = -EBUSY;
+ arch_spin_unlock(&tr->max_lock);
+ if (ret)
+ goto out;
+ }
+#endif
/* Some tracers won't work on kernel command line */
if (system_state < SYSTEM_RUNNING && t->noboot) {
pr_warn("Tracer '%s' is not allowed on command line, ignored\n",
@@ -5626,7 +5827,6 @@ out:
return ret;
fail:
- kfree(iter->trace);
kfree(iter);
__trace_array_put(tr);
mutex_unlock(&trace_types_lock);
@@ -5825,7 +6025,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
}
static const struct pipe_buf_operations tracing_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = generic_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -6470,6 +6669,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
goto out;
}
+ arch_spin_lock(&tr->max_lock);
+ if (tr->cond_snapshot)
+ ret = -EBUSY;
+ arch_spin_unlock(&tr->max_lock);
+ if (ret)
+ goto out;
+
switch (val) {
case 0:
if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
@@ -6495,7 +6701,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
local_irq_disable();
/* Now, we're going to swap */
if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
- update_max_tr(tr, current, smp_processor_id());
+ update_max_tr(tr, current, smp_processor_id(), NULL);
else
update_max_tr_single(tr, current, iter->cpu_file);
local_irq_enable();
@@ -6849,7 +7055,6 @@ static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
/* Pipe buffer operations for a buffer. */
static const struct pipe_buf_operations buffer_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = buffer_pipe_buf_release,
.steal = generic_pipe_buf_steal,
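
The conditional snapshot entry points added above (and declared in trace.h below) are meant to be driven by a tracer or trigger: enable once with an update() callback, then call tracing_snapshot_cond() on each hit and let update() decide whether the max-buffer swap happens. A sketch of a hypothetical in-kernel user (the my_* names are assumptions; the real user is the hist-trigger snapshot action in trace_events_hist.c further down):

static u64 my_threshold = 1000000;	/* say, nanoseconds */

static bool my_update(struct trace_array *tr, void *cond_data)
{
	/* Runs under tr->max_lock from update_max_tr(); cond_data is whatever
	 * was passed to tracing_snapshot_cond() for this particular hit. */
	u64 *latency = cond_data;

	return *latency > my_threshold;	/* true => take the snapshot */
}

static int my_enable(struct trace_array *tr)
{
	return tracing_snapshot_cond_enable(tr, &my_threshold, my_update);
}

static void my_hit(struct trace_array *tr, u64 latency)
{
	tracing_snapshot_cond(tr, &latency);	/* snapshots only if my_update() returns true */
}

static void my_disable(struct trace_array *tr)
{
	tracing_snapshot_cond_disable(tr);
}
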
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 08900828d282..d80cee49e0eb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -194,6 +194,51 @@ struct trace_pid_list {
unsigned long *pids;
};
+typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data);
+
+/**
+ * struct cond_snapshot - conditional snapshot data and callback
+ *
+ * The cond_snapshot structure encapsulates a callback function and
+ * data associated with the snapshot for a given tracing instance.
+ *
+ * When a snapshot is taken conditionally, by invoking
+ * tracing_snapshot_cond(tr, cond_data), the cond_data passed in is
+ * passed in turn to the cond_snapshot.update() function. That data
+ * can be compared by the update() implementation with the cond_data
+ * contained wihin the struct cond_snapshot instance associated with
+ * the trace_array. Because the tr->max_lock is held throughout the
+ * update() call, the update() function can directly retrieve the
+ * cond_snapshot and cond_data associated with the per-instance
+ * snapshot associated with the trace_array.
+ *
+ * The cond_snapshot.update() implementation can save data to be
+ * associated with the snapshot if it decides to, and returns 'true'
+ * in that case, or it returns 'false' if the conditional snapshot
+ * shouldn't be taken.
+ *
+ * The cond_snapshot instance is created and associated with the
+ * user-defined cond_data by tracing_cond_snapshot_enable().
+ * Likewise, the cond_snapshot instance is destroyed and is no longer
+ * associated with the trace instance by
+ * tracing_cond_snapshot_disable().
+ *
+ * The method below is required.
+ *
+ * @update: When a conditional snapshot is invoked, the update()
+ * callback function is invoked with the tr->max_lock held. The
+ * update() implementation signals whether or not to actually
+ * take the snapshot, by returning 'true' if so, 'false' if no
+ * snapshot should be taken. Because the max_lock is held for
+ * the duration of update(), the implementation is safe to
+ * directly retrieven and save any implementation data it needs
+ * to in association with the snapshot.
+ */
+struct cond_snapshot {
+ void *cond_data;
+ cond_update_fn_t update;
+};
+
/*
* The trace array - an array of per-CPU trace arrays. This is the
* highest level data structure that individual tracers deal with.
@@ -277,6 +322,9 @@ struct trace_array {
#endif
int time_stamp_abs_ref;
struct list_head hist_vars;
+#ifdef CONFIG_TRACER_SNAPSHOT
+ struct cond_snapshot *cond_snapshot;
+#endif
};
enum {
@@ -727,7 +775,8 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
const char __user *ubuf, size_t cnt);
#ifdef CONFIG_TRACER_MAX_TRACE
-void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
+void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
+ void *cond_data);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
#endif /* CONFIG_TRACER_MAX_TRACE */
@@ -855,10 +904,11 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
#define TRACE_GRAPH_PRINT_PROC 0x8
#define TRACE_GRAPH_PRINT_DURATION 0x10
#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
-#define TRACE_GRAPH_PRINT_IRQS 0x40
-#define TRACE_GRAPH_PRINT_TAIL 0x80
-#define TRACE_GRAPH_SLEEP_TIME 0x100
-#define TRACE_GRAPH_GRAPH_TIME 0x200
+#define TRACE_GRAPH_PRINT_REL_TIME 0x40
+#define TRACE_GRAPH_PRINT_IRQS 0x80
+#define TRACE_GRAPH_PRINT_TAIL 0x100
+#define TRACE_GRAPH_SLEEP_TIME 0x200
+#define TRACE_GRAPH_GRAPH_TIME 0x400
#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
@@ -1458,6 +1508,7 @@ enum regex_type {
MATCH_MIDDLE_ONLY,
MATCH_END_ONLY,
MATCH_GLOB,
+ MATCH_INDEX,
};
struct regex {
@@ -1808,6 +1859,11 @@ static inline bool event_command_needs_rec(struct event_command *cmd_ops)
extern int trace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable);
extern int tracing_alloc_snapshot(void);
+extern void tracing_snapshot_cond(struct trace_array *tr, void *cond_data);
+extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update);
+
+extern int tracing_snapshot_cond_disable(struct trace_array *tr);
+extern void *tracing_cond_snapshot_data(struct trace_array *tr);
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index dd1f43588d70..fa100ed3b4de 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -74,7 +74,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
static int create_dyn_event(int argc, char **argv)
{
struct dyn_event_operations *ops;
- int ret;
+ int ret = -ENODEV;
if (argv[0][0] == '-' || argv[0][0] == '!')
return dyn_event_release(argc, argv, NULL);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 06bb2fd9a56c..fc8e97328e54 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -65,7 +65,8 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
__field( unsigned long, parent_ip )
),
- F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip),
+ F_printk(" %ps <-- %ps",
+ (void *)__entry->ip, (void *)__entry->parent_ip),
FILTER_TRACE_FN,
@@ -83,7 +84,7 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
__field_desc( int, graph_ent, depth )
),
- F_printk("--> %lx (%d)", __entry->func, __entry->depth),
+ F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth),
FILTER_OTHER
);
@@ -102,8 +103,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
__field_desc( int, ret, depth )
),
- F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
- __entry->func, __entry->depth,
+ F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
+ (void *)__entry->func, __entry->depth,
__entry->calltime, __entry->rettime,
__entry->depth),
@@ -167,12 +168,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
#define FTRACE_STACK_ENTRIES 8
-#ifndef CONFIG_64BIT
-# define IP_FMT "%08lx"
-#else
-# define IP_FMT "%016lx"
-#endif
-
FTRACE_ENTRY(kernel_stack, stack_entry,
TRACE_STACK,
@@ -182,12 +177,13 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
__dynamic_array(unsigned long, caller )
),
- F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
- "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
- "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
- __entry->caller[0], __entry->caller[1], __entry->caller[2],
- __entry->caller[3], __entry->caller[4], __entry->caller[5],
- __entry->caller[6], __entry->caller[7]),
+ F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n"
+ "\t=> %ps\n\t=> %ps\n\t=> %ps\n"
+ "\t=> %ps\n\t=> %ps\n",
+ (void *)__entry->caller[0], (void *)__entry->caller[1],
+ (void *)__entry->caller[2], (void *)__entry->caller[3],
+ (void *)__entry->caller[4], (void *)__entry->caller[5],
+ (void *)__entry->caller[6], (void *)__entry->caller[7]),
FILTER_OTHER
);
@@ -201,12 +197,13 @@ FTRACE_ENTRY(user_stack, userstack_entry,
__array( unsigned long, caller, FTRACE_STACK_ENTRIES )
),
- F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
- "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
- "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
- __entry->caller[0], __entry->caller[1], __entry->caller[2],
- __entry->caller[3], __entry->caller[4], __entry->caller[5],
- __entry->caller[6], __entry->caller[7]),
+ F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n"
+ "\t=> %ps\n\t=> %ps\n\t=> %ps\n"
+ "\t=> %ps\n\t=> %ps\n",
+ (void *)__entry->caller[0], (void *)__entry->caller[1],
+ (void *)__entry->caller[2], (void *)__entry->caller[3],
+ (void *)__entry->caller[4], (void *)__entry->caller[5],
+ (void *)__entry->caller[6], (void *)__entry->caller[7]),
FILTER_OTHER
);
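
The trace_entries.h change switches the F_printk() strings from raw hex (IP_FMT) to %ps, which makes the printed addresses resolve to symbol names; that is why each saved unsigned long is cast back to void *. The same specifier works in any printk-style call, sketched here with an illustrative helper:

#include <linux/kernel.h>
#include <linux/printk.h>

static void show_caller(void)
{
	/* %ps prints the symbol for a code address; %pS would also append +offset/size */
	pr_info("called from %ps\n", (void *)_RET_IP_);
}
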
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 76217bbef815..4629a6104474 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -299,15 +299,13 @@ int perf_uprobe_init(struct perf_event *p_event,
if (!p_event->attr.uprobe_path)
return -EINVAL;
- path = kzalloc(PATH_MAX, GFP_KERNEL);
- if (!path)
- return -ENOMEM;
- ret = strncpy_from_user(
- path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX);
- if (ret == PATH_MAX)
- return -E2BIG;
- if (ret < 0)
- goto out;
+
+ path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path),
+ PATH_MAX);
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
+ return (ret == -EINVAL) ? -E2BIG : ret;
+ }
if (path[0] == '\0') {
ret = -EINVAL;
goto out;
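
perf_uprobe_init() now uses strndup_user(), which allocates and copies in one step and reports an over-long string as ERR_PTR(-EINVAL), remapped above to the function's historical -E2BIG. The resulting pattern, sketched with an illustrative caller and __user pointer:

#include <linux/err.h>
#include <linux/limits.h>
#include <linux/slab.h>
#include <linux/string.h>

static int use_user_path(const char __user *user_path_ptr)	/* illustrative caller */
{
	char *path;

	path = strndup_user(user_path_ptr, PATH_MAX);
	if (IS_ERR(path))
		return PTR_ERR(path);	/* -EINVAL here means "longer than PATH_MAX" */

	/* ... use path ... */
	kfree(path);
	return 0;
}
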
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 27821480105e..05a66493a164 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -491,10 +491,12 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
break;
case '&':
case '|':
+ /* accepting only "&&" or "||" */
if (next[1] == next[0]) {
ptr++;
break;
}
+ /* fall through */
default:
parse_error(pe, FILT_ERR_TOO_MANY_PREDS,
next - str);
@@ -823,6 +825,9 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
*search = buff;
+ if (isdigit(buff[0]))
+ return MATCH_INDEX;
+
for (i = 0; i < len; i++) {
if (buff[i] == '*') {
if (!i) {
@@ -860,6 +865,8 @@ static void filter_build_regex(struct filter_pred *pred)
}
switch (type) {
+ /* MATCH_INDEX should not happen, but if it does, match full */
+ case MATCH_INDEX:
case MATCH_FULL:
r->match = regex_match_full;
break;
@@ -1301,7 +1308,7 @@ static int parse_pred(const char *str, void *data,
/* go past the last quote */
i++;
- } else if (isdigit(str[i])) {
+ } else if (isdigit(str[i]) || str[i] == '-') {
/* Make sure the field is not a string */
if (is_string_field(field)) {
@@ -1314,6 +1321,9 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
+ if (str[i] == '-')
+ i++;
+
/* We allow 0xDEADBEEF */
while (isalnum(str[i]))
i++;
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 449d90cfa151..795aa2038377 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -313,9 +313,9 @@ struct hist_trigger_data {
struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX];
unsigned int n_field_var_hists;
- struct field_var *max_vars[SYNTH_FIELDS_MAX];
- unsigned int n_max_vars;
- unsigned int n_max_var_str;
+ struct field_var *save_vars[SYNTH_FIELDS_MAX];
+ unsigned int n_save_vars;
+ unsigned int n_save_var_str;
};
static int synth_event_create(int argc, const char **argv);
@@ -383,41 +383,157 @@ struct action_data;
typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
struct tracing_map_elt *elt, void *rec,
- struct ring_buffer_event *rbe,
+ struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals);
+typedef bool (*check_track_val_fn_t) (u64 track_val, u64 var_val);
+
+enum handler_id {
+ HANDLER_ONMATCH = 1,
+ HANDLER_ONMAX,
+ HANDLER_ONCHANGE,
+};
+
+enum action_id {
+ ACTION_SAVE = 1,
+ ACTION_TRACE,
+ ACTION_SNAPSHOT,
+};
+
struct action_data {
+ enum handler_id handler;
+ enum action_id action;
+ char *action_name;
action_fn_t fn;
+
unsigned int n_params;
char *params[SYNTH_FIELDS_MAX];
+ /*
+ * When a histogram trigger is hit, the values of any
+ * references to variables, including variables being passed
+ * as parameters to synthetic events, are collected into a
+ * var_ref_vals array. This var_ref_idx is the index of the
+ * first param in the array to be passed to the synthetic
+ * event invocation.
+ */
+ unsigned int var_ref_idx;
+ struct synth_event *synth_event;
+ bool use_trace_keyword;
+ char *synth_event_name;
+
union {
struct {
- /*
- * When a histogram trigger is hit, the values of any
- * references to variables, including variables being passed
- * as parameters to synthetic events, are collected into a
- * var_ref_vals array. This var_ref_idx is the index of the
- * first param in the array to be passed to the synthetic
- * event invocation.
- */
- unsigned int var_ref_idx;
- char *match_event;
- char *match_event_system;
- char *synth_event_name;
- struct synth_event *synth_event;
- } onmatch;
+ char *event;
+ char *event_system;
+ } match_data;
struct {
+ /*
+ * var_str contains the $-unstripped variable
+ * name referenced by var_ref, and used when
+ * printing the action. Because var_ref
+ * creation is deferred to create_actions(),
+ * we need a per-action way to save it until
+ * then, thus var_str.
+ */
char *var_str;
- char *fn_name;
- unsigned int max_var_ref_idx;
- struct hist_field *max_var;
- struct hist_field *var;
- } onmax;
+
+ /*
+ * var_ref refers to the variable being
+ * tracked e.g onmax($var).
+ */
+ struct hist_field *var_ref;
+
+ /*
+ * track_var contains the 'invisible' tracking
+ * variable created to keep the current
+ * e.g. max value.
+ */
+ struct hist_field *track_var;
+
+ check_track_val_fn_t check_val;
+ action_fn_t save_data;
+ } track_data;
};
};
+struct track_data {
+ u64 track_val;
+ bool updated;
+
+ unsigned int key_len;
+ void *key;
+ struct tracing_map_elt elt;
+
+ struct action_data *action_data;
+ struct hist_trigger_data *hist_data;
+};
+
+struct hist_elt_data {
+ char *comm;
+ u64 *var_ref_vals;
+ char *field_var_str[SYNTH_FIELDS_MAX];
+};
+
+struct snapshot_context {
+ struct tracing_map_elt *elt;
+ void *key;
+};
+
+static void track_data_free(struct track_data *track_data)
+{
+ struct hist_elt_data *elt_data;
+
+ if (!track_data)
+ return;
+
+ kfree(track_data->key);
+
+ elt_data = track_data->elt.private_data;
+ if (elt_data) {
+ kfree(elt_data->comm);
+ kfree(elt_data);
+ }
+
+ kfree(track_data);
+}
+
+static struct track_data *track_data_alloc(unsigned int key_len,
+ struct action_data *action_data,
+ struct hist_trigger_data *hist_data)
+{
+ struct track_data *data = kzalloc(sizeof(*data), GFP_KERNEL);
+ struct hist_elt_data *elt_data;
+
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ data->key = kzalloc(key_len, GFP_KERNEL);
+ if (!data->key) {
+ track_data_free(data);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ data->key_len = key_len;
+ data->action_data = action_data;
+ data->hist_data = hist_data;
+
+ elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
+ if (!elt_data) {
+ track_data_free(data);
+ return ERR_PTR(-ENOMEM);
+ }
+ data->elt.private_data = elt_data;
+
+ elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL);
+ if (!elt_data->comm) {
+ track_data_free(data);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return data;
+}
static char last_hist_cmd[MAX_FILTER_STR_VAL];
static char hist_err_str[MAX_FILTER_STR_VAL];
@@ -1078,12 +1194,12 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields,
static void action_trace(struct hist_trigger_data *hist_data,
struct tracing_map_elt *elt, void *rec,
- struct ring_buffer_event *rbe,
+ struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals)
{
- struct synth_event *event = data->onmatch.synth_event;
+ struct synth_event *event = data->synth_event;
- trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx);
+ trace_synth(event, var_ref_vals, data->var_ref_idx);
}
struct hist_var_data {
@@ -1200,8 +1316,8 @@ static int synth_event_create(int argc, const char **argv)
/* This interface accepts group name prefix */
if (strchr(name, '/')) {
- len = sizeof(SYNTH_SYSTEM "/") - 1;
- if (strncmp(name, SYNTH_SYSTEM "/", len))
+ len = str_has_prefix(name, SYNTH_SYSTEM "/");
+ if (len == 0)
return -EINVAL;
name += len;
}
@@ -1644,9 +1760,9 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name)
for (i = 0; i < hist_data->n_actions; i++) {
struct action_data *data = hist_data->actions[i];
- if (data->fn == action_trace) {
- char *system = data->onmatch.match_event_system;
- char *event_name = data->onmatch.match_event;
+ if (data->handler == HANDLER_ONMATCH) {
+ char *system = data->match_data.event_system;
+ char *event_name = data->match_data.event;
file = find_var_file(tr, system, event_name, var_name);
if (!file)
@@ -1691,12 +1807,6 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
return hist_field;
}
-struct hist_elt_data {
- char *comm;
- u64 *var_ref_vals;
- char *field_var_str[SYNTH_FIELDS_MAX];
-};
-
static u64 hist_field_var_ref(struct hist_field *hist_field,
struct tracing_map_elt *elt,
struct ring_buffer_event *rbe,
@@ -1882,7 +1992,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs)
return ret;
if ((str_has_prefix(str, "onmatch(")) ||
- (str_has_prefix(str, "onmax("))) {
+ (str_has_prefix(str, "onmax(")) ||
+ (str_has_prefix(str, "onchange("))) {
attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
if (!attrs->action_str[attrs->n_actions]) {
ret = -ENOMEM;
@@ -2030,7 +2141,7 @@ static inline void save_comm(char *comm, struct task_struct *task)
return;
}
- memcpy(comm, task->comm, TASK_COMM_LEN);
+ strncpy(comm, task->comm, TASK_COMM_LEN);
}
static void hist_elt_data_free(struct hist_elt_data *elt_data)
@@ -2076,7 +2187,7 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
}
}
- n_str = hist_data->n_field_var_str + hist_data->n_max_var_str;
+ n_str = hist_data->n_field_var_str + hist_data->n_save_var_str;
size = STR_VAR_LEN_MAX;
@@ -3050,7 +3161,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
int ret;
if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) {
- hist_err_event("onmatch: Too many field variables defined: ",
+ hist_err_event("trace action: Too many field variables defined: ",
subsys_name, event_name, field_name);
return ERR_PTR(-EINVAL);
}
@@ -3058,7 +3169,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
file = event_file(tr, subsys_name, event_name);
if (IS_ERR(file)) {
- hist_err_event("onmatch: Event file not found: ",
+ hist_err_event("trace action: Event file not found: ",
subsys_name, event_name, field_name);
ret = PTR_ERR(file);
return ERR_PTR(ret);
@@ -3072,7 +3183,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
*/
hist_data = find_compatible_hist(target_hist_data, file);
if (!hist_data) {
- hist_err_event("onmatch: Matching event histogram not found: ",
+ hist_err_event("trace action: Matching event histogram not found: ",
subsys_name, event_name, field_name);
return ERR_PTR(-EINVAL);
}
@@ -3134,7 +3245,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
kfree(cmd);
kfree(var_hist->cmd);
kfree(var_hist);
- hist_err_event("onmatch: Couldn't create histogram for field: ",
+ hist_err_event("trace action: Couldn't create histogram for field: ",
subsys_name, event_name, field_name);
return ERR_PTR(ret);
}
@@ -3147,7 +3258,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
if (IS_ERR_OR_NULL(event_var)) {
kfree(var_hist->cmd);
kfree(var_hist);
- hist_err_event("onmatch: Couldn't find synthetic variable: ",
+ hist_err_event("trace action: Couldn't find synthetic variable: ",
subsys_name, event_name, field_name);
return ERR_PTR(-EINVAL);
}
@@ -3225,13 +3336,13 @@ static void update_field_vars(struct hist_trigger_data *hist_data,
hist_data->n_field_vars, 0);
}
-static void update_max_vars(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt,
- struct ring_buffer_event *rbe,
- void *rec)
+static void save_track_data_vars(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe, void *key,
+ struct action_data *data, u64 *var_ref_vals)
{
- __update_field_vars(elt, rbe, rec, hist_data->max_vars,
- hist_data->n_max_vars, hist_data->n_field_var_str);
+ __update_field_vars(elt, rbe, rec, hist_data->save_vars,
+ hist_data->n_save_vars, hist_data->n_field_var_str);
}
static struct hist_field *create_var(struct hist_trigger_data *hist_data,
@@ -3366,18 +3477,190 @@ create_target_field_var(struct hist_trigger_data *target_hist_data,
return create_field_var(target_hist_data, file, var_name);
}
-static void onmax_print(struct seq_file *m,
- struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt,
- struct action_data *data)
+static bool check_track_val_max(u64 track_val, u64 var_val)
{
- unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx;
+ if (var_val <= track_val)
+ return false;
+
+ return true;
+}
- seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx));
+static bool check_track_val_changed(u64 track_val, u64 var_val)
+{
+ if (var_val == track_val)
+ return false;
+
+ return true;
+}
+
+static u64 get_track_val(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt,
+ struct action_data *data)
+{
+ unsigned int track_var_idx = data->track_data.track_var->var.idx;
+ u64 track_val;
+
+ track_val = tracing_map_read_var(elt, track_var_idx);
+
+ return track_val;
+}
+
+static void save_track_val(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt,
+ struct action_data *data, u64 var_val)
+{
+ unsigned int track_var_idx = data->track_data.track_var->var.idx;
+
+ tracing_map_set_var(elt, track_var_idx, var_val);
+}
+
+static void save_track_data(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe, void *key,
+ struct action_data *data, u64 *var_ref_vals)
+{
+ if (data->track_data.save_data)
+ data->track_data.save_data(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+}
+
+static bool check_track_val(struct tracing_map_elt *elt,
+ struct action_data *data,
+ u64 var_val)
+{
+ struct hist_trigger_data *hist_data;
+ u64 track_val;
+
+ hist_data = data->track_data.track_var->hist_data;
+ track_val = get_track_val(hist_data, elt, data);
+
+ return data->track_data.check_val(track_val, var_val);
+}
- for (i = 0; i < hist_data->n_max_vars; i++) {
- struct hist_field *save_val = hist_data->max_vars[i]->val;
- struct hist_field *save_var = hist_data->max_vars[i]->var;
+#ifdef CONFIG_TRACER_SNAPSHOT
+static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
+{
+ /* called with tr->max_lock held */
+ struct track_data *track_data = tr->cond_snapshot->cond_data;
+ struct hist_elt_data *elt_data, *track_elt_data;
+ struct snapshot_context *context = cond_data;
+ u64 track_val;
+
+ if (!track_data)
+ return false;
+
+ track_val = get_track_val(track_data->hist_data, context->elt,
+ track_data->action_data);
+
+ track_data->track_val = track_val;
+ memcpy(track_data->key, context->key, track_data->key_len);
+
+ elt_data = context->elt->private_data;
+ track_elt_data = track_data->elt.private_data;
+ if (elt_data->comm)
+ strncpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN);
+
+ track_data->updated = true;
+
+ return true;
+}
+
+static void save_track_data_snapshot(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe, void *key,
+ struct action_data *data,
+ u64 *var_ref_vals)
+{
+ struct trace_event_file *file = hist_data->event_file;
+ struct snapshot_context context;
+
+ context.elt = elt;
+ context.key = key;
+
+ tracing_snapshot_cond(file->tr, &context);
+}
+
+static void hist_trigger_print_key(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ void *key,
+ struct tracing_map_elt *elt);
+
+static struct action_data *snapshot_action(struct hist_trigger_data *hist_data)
+{
+ unsigned int i;
+
+ if (!hist_data->n_actions)
+ return NULL;
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ struct action_data *data = hist_data->actions[i];
+
+ if (data->action == ACTION_SNAPSHOT)
+ return data;
+ }
+
+ return NULL;
+}
+
+static void track_data_snapshot_print(struct seq_file *m,
+ struct hist_trigger_data *hist_data)
+{
+ struct trace_event_file *file = hist_data->event_file;
+ struct track_data *track_data;
+ struct action_data *action;
+
+ track_data = tracing_cond_snapshot_data(file->tr);
+ if (!track_data)
+ return;
+
+ if (!track_data->updated)
+ return;
+
+ action = snapshot_action(hist_data);
+ if (!action)
+ return;
+
+ seq_puts(m, "\nSnapshot taken (see tracing/snapshot). Details:\n");
+ seq_printf(m, "\ttriggering value { %s(%s) }: %10llu",
+ action->handler == HANDLER_ONMAX ? "onmax" : "onchange",
+ action->track_data.var_str, track_data->track_val);
+
+ seq_puts(m, "\ttriggered by event with key: ");
+ hist_trigger_print_key(m, hist_data, track_data->key, &track_data->elt);
+ seq_putc(m, '\n');
+}
+#else
+static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
+{
+ return false;
+}
+static void save_track_data_snapshot(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe, void *key,
+ struct action_data *data,
+ u64 *var_ref_vals) {}
+static void track_data_snapshot_print(struct seq_file *m,
+ struct hist_trigger_data *hist_data) {}
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
+static void track_data_print(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt,
+ struct action_data *data)
+{
+ u64 track_val = get_track_val(hist_data, elt, data);
+ unsigned int i, save_var_idx;
+
+ if (data->handler == HANDLER_ONMAX)
+ seq_printf(m, "\n\tmax: %10llu", track_val);
+ else if (data->handler == HANDLER_ONCHANGE)
+ seq_printf(m, "\n\tchanged: %10llu", track_val);
+
+ if (data->action == ACTION_SNAPSHOT)
+ return;
+
+ for (i = 0; i < hist_data->n_save_vars; i++) {
+ struct hist_field *save_val = hist_data->save_vars[i]->val;
+ struct hist_field *save_var = hist_data->save_vars[i]->var;
u64 val;
save_var_idx = save_var->var.idx;
@@ -3392,64 +3675,81 @@ static void onmax_print(struct seq_file *m,
}
}
-static void onmax_save(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
- struct ring_buffer_event *rbe,
- struct action_data *data, u64 *var_ref_vals)
+static void ontrack_action(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe, void *key,
+ struct action_data *data, u64 *var_ref_vals)
{
- unsigned int max_idx = data->onmax.max_var->var.idx;
- unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx;
+ u64 var_val = var_ref_vals[data->track_data.var_ref->var_ref_idx];
- u64 var_val, max_val;
-
- var_val = var_ref_vals[max_var_ref_idx];
- max_val = tracing_map_read_var(elt, max_idx);
-
- if (var_val <= max_val)
- return;
-
- tracing_map_set_var(elt, max_idx, var_val);
-
- update_max_vars(hist_data, elt, rbe, rec);
+ if (check_track_val(elt, data, var_val)) {
+ save_track_val(hist_data, elt, data, var_val);
+ save_track_data(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+ }
}
-static void onmax_destroy(struct action_data *data)
+static void action_data_destroy(struct action_data *data)
{
unsigned int i;
- destroy_hist_field(data->onmax.max_var, 0);
- destroy_hist_field(data->onmax.var, 0);
+ lockdep_assert_held(&event_mutex);
- kfree(data->onmax.var_str);
- kfree(data->onmax.fn_name);
+ kfree(data->action_name);
for (i = 0; i < data->n_params; i++)
kfree(data->params[i]);
+ if (data->synth_event)
+ data->synth_event->ref--;
+
+ kfree(data->synth_event_name);
+
kfree(data);
}
-static int onmax_create(struct hist_trigger_data *hist_data,
- struct action_data *data)
+static void track_data_destroy(struct hist_trigger_data *hist_data,
+ struct action_data *data)
{
struct trace_event_file *file = hist_data->event_file;
- struct hist_field *var_field, *ref_field, *max_var;
- unsigned int var_ref_idx = hist_data->n_var_refs;
- struct field_var *field_var;
- char *onmax_var_str, *param;
- unsigned int i;
+
+ destroy_hist_field(data->track_data.track_var, 0);
+
+ if (data->action == ACTION_SNAPSHOT) {
+ struct track_data *track_data;
+
+ track_data = tracing_cond_snapshot_data(file->tr);
+ if (track_data && track_data->hist_data == hist_data) {
+ tracing_snapshot_cond_disable(file->tr);
+ track_data_free(track_data);
+ }
+ }
+
+ kfree(data->track_data.var_str);
+
+ action_data_destroy(data);
+}
+
+static int action_create(struct hist_trigger_data *hist_data,
+ struct action_data *data);
+
+static int track_data_create(struct hist_trigger_data *hist_data,
+ struct action_data *data)
+{
+ struct hist_field *var_field, *ref_field, *track_var = NULL;
+ struct trace_event_file *file = hist_data->event_file;
+ char *track_data_var_str;
int ret = 0;
- onmax_var_str = data->onmax.var_str;
- if (onmax_var_str[0] != '$') {
- hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str);
+ track_data_var_str = data->track_data.var_str;
+ if (track_data_var_str[0] != '$') {
+ hist_err("For onmax(x) or onchange(x), x must be a variable: ", track_data_var_str);
return -EINVAL;
}
- onmax_var_str++;
+ track_data_var_str++;
- var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str);
+ var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str);
if (!var_field) {
- hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str);
+ hist_err("Couldn't find onmax or onchange variable: ", track_data_var_str);
return -EINVAL;
}
@@ -3457,39 +3757,26 @@ static int onmax_create(struct hist_trigger_data *hist_data,
if (!ref_field)
return -ENOMEM;
- data->onmax.var = ref_field;
+ data->track_data.var_ref = ref_field;
- data->fn = onmax_save;
- data->onmax.max_var_ref_idx = var_ref_idx;
- max_var = create_var(hist_data, file, "max", sizeof(u64), "u64");
- if (IS_ERR(max_var)) {
- hist_err("onmax: Couldn't create onmax variable: ", "max");
- ret = PTR_ERR(max_var);
+ if (data->handler == HANDLER_ONMAX)
+ track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64");
+ if (IS_ERR(track_var)) {
+ hist_err("Couldn't create onmax variable: ", "__max");
+ ret = PTR_ERR(track_var);
goto out;
}
- data->onmax.max_var = max_var;
-
- for (i = 0; i < data->n_params; i++) {
- param = kstrdup(data->params[i], GFP_KERNEL);
- if (!param) {
- ret = -ENOMEM;
- goto out;
- }
-
- field_var = create_target_field_var(hist_data, NULL, NULL, param);
- if (IS_ERR(field_var)) {
- hist_err("onmax: Couldn't create field variable: ", param);
- ret = PTR_ERR(field_var);
- kfree(param);
- goto out;
- }
-
- hist_data->max_vars[hist_data->n_max_vars++] = field_var;
- if (field_var->val->flags & HIST_FIELD_FL_STRING)
- hist_data->n_max_var_str++;
- kfree(param);
+ if (data->handler == HANDLER_ONCHANGE)
+ track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64");
+ if (IS_ERR(track_var)) {
+ hist_err("Couldn't create onchange variable: ", "__change");
+ ret = PTR_ERR(track_var);
+ goto out;
}
+ data->track_data.track_var = track_var;
+
+ ret = action_create(hist_data, data);
out:
return ret;
}
@@ -3497,14 +3784,18 @@ static int onmax_create(struct hist_trigger_data *hist_data,
static int parse_action_params(char *params, struct action_data *data)
{
char *param, *saved_param;
+ bool first_param = true;
int ret = 0;
while (params) {
- if (data->n_params >= SYNTH_FIELDS_MAX)
+ if (data->n_params >= SYNTH_FIELDS_MAX) {
+ hist_err("Too many action params", "");
goto out;
+ }
param = strsep(&params, ",");
if (!param) {
+ hist_err("No action param found", "");
ret = -EINVAL;
goto out;
}
@@ -3522,86 +3813,164 @@ static int parse_action_params(char *params, struct action_data *data)
goto out;
}
+ if (first_param && data->use_trace_keyword) {
+ data->synth_event_name = saved_param;
+ first_param = false;
+ continue;
+ }
+ first_param = false;
+
data->params[data->n_params++] = saved_param;
}
out:
return ret;
}
-static struct action_data *onmax_parse(char *str)
+static int action_parse(char *str, struct action_data *data,
+ enum handler_id handler)
{
- char *onmax_fn_name, *onmax_var_str;
- struct action_data *data;
- int ret = -EINVAL;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- return ERR_PTR(-ENOMEM);
+ char *action_name;
+ int ret = 0;
- onmax_var_str = strsep(&str, ")");
- if (!onmax_var_str || !str) {
+ strsep(&str, ".");
+ if (!str) {
+ hist_err("action parsing: No action found", "");
ret = -EINVAL;
- goto free;
+ goto out;
}
- data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL);
- if (!data->onmax.var_str) {
- ret = -ENOMEM;
- goto free;
+ action_name = strsep(&str, "(");
+ if (!action_name || !str) {
+ hist_err("action parsing: No action found", "");
+ ret = -EINVAL;
+ goto out;
}
- strsep(&str, ".");
- if (!str)
- goto free;
-
- onmax_fn_name = strsep(&str, "(");
- if (!onmax_fn_name || !str)
- goto free;
-
- if (str_has_prefix(onmax_fn_name, "save")) {
+ if (str_has_prefix(action_name, "save")) {
char *params = strsep(&str, ")");
if (!params) {
+ hist_err("action parsing: No params found for %s", "save");
ret = -EINVAL;
- goto free;
+ goto out;
}
ret = parse_action_params(params, data);
if (ret)
- goto free;
- } else
+ goto out;
+
+ if (handler == HANDLER_ONMAX)
+ data->track_data.check_val = check_track_val_max;
+ else if (handler == HANDLER_ONCHANGE)
+ data->track_data.check_val = check_track_val_changed;
+ else {
+ hist_err("action parsing: Handler doesn't support action: ", action_name);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ data->track_data.save_data = save_track_data_vars;
+ data->fn = ontrack_action;
+ data->action = ACTION_SAVE;
+ } else if (str_has_prefix(action_name, "snapshot")) {
+ char *params = strsep(&str, ")");
+
+ if (!str) {
+ hist_err("action parsing: No closing paren found: %s", params);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (handler == HANDLER_ONMAX)
+ data->track_data.check_val = check_track_val_max;
+ else if (handler == HANDLER_ONCHANGE)
+ data->track_data.check_val = check_track_val_changed;
+ else {
+ hist_err("action parsing: Handler doesn't support action: ", action_name);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ data->track_data.save_data = save_track_data_snapshot;
+ data->fn = ontrack_action;
+ data->action = ACTION_SNAPSHOT;
+ } else {
+ char *params = strsep(&str, ")");
+
+ if (str_has_prefix(action_name, "trace"))
+ data->use_trace_keyword = true;
+
+ if (params) {
+ ret = parse_action_params(params, data);
+ if (ret)
+ goto out;
+ }
+
+ if (handler == HANDLER_ONMAX)
+ data->track_data.check_val = check_track_val_max;
+ else if (handler == HANDLER_ONCHANGE)
+ data->track_data.check_val = check_track_val_changed;
+
+ if (handler != HANDLER_ONMATCH) {
+ data->track_data.save_data = action_trace;
+ data->fn = ontrack_action;
+ } else
+ data->fn = action_trace;
+
+ data->action = ACTION_TRACE;
+ }
+
+ data->action_name = kstrdup(action_name, GFP_KERNEL);
+ if (!data->action_name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ data->handler = handler;
+ out:
+ return ret;
+}
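
Taken together with track_data_parse() below and onmatch_parse() further down, action_parse() accepts handler.action() strings of roughly the following shapes ($wakeup_lat, comm, prio and the synthetic event wakeup_latency are illustrative placeholders, not taken from this patch):

  onmax($wakeup_lat).save(comm,prio)
  onmax($wakeup_lat).snapshot()
  onchange($wakeup_lat).save(comm,prio)
  onchange($wakeup_lat).trace(wakeup_latency,$wakeup_lat,pid)
  onmatch(sched.sched_switch).trace(wakeup_latency,$wakeup_lat,pid)
  onmatch(sched.sched_switch).wakeup_latency($wakeup_lat,pid)

The last form is the pre-existing implicit-synthetic-event syntax; the new "trace" keyword variant sets use_trace_keyword and takes the synthetic event name as its first parameter.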
+
+static struct action_data *track_data_parse(struct hist_trigger_data *hist_data,
+ char *str, enum handler_id handler)
+{
+ struct action_data *data;
+ int ret = -EINVAL;
+ char *var_str;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ var_str = strsep(&str, ")");
+ if (!var_str || !str) {
+ ret = -EINVAL;
goto free;
+ }
- data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL);
- if (!data->onmax.fn_name) {
+ data->track_data.var_str = kstrdup(var_str, GFP_KERNEL);
+ if (!data->track_data.var_str) {
ret = -ENOMEM;
goto free;
}
+
+ ret = action_parse(str, data, handler);
+ if (ret)
+ goto free;
out:
return data;
free:
- onmax_destroy(data);
+ track_data_destroy(hist_data, data);
data = ERR_PTR(ret);
goto out;
}
static void onmatch_destroy(struct action_data *data)
{
- unsigned int i;
-
- lockdep_assert_held(&event_mutex);
-
- kfree(data->onmatch.match_event);
- kfree(data->onmatch.match_event_system);
- kfree(data->onmatch.synth_event_name);
-
- for (i = 0; i < data->n_params; i++)
- kfree(data->params[i]);
+ kfree(data->match_data.event);
+ kfree(data->match_data.event_system);
- if (data->onmatch.synth_event)
- data->onmatch.synth_event->ref--;
-
- kfree(data);
+ action_data_destroy(data);
}
static void destroy_field_var(struct field_var *field_var)
@@ -3651,8 +4020,9 @@ static int check_synth_field(struct synth_event *event,
}
static struct hist_field *
-onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data,
- char *system, char *event, char *var)
+trace_action_find_var(struct hist_trigger_data *hist_data,
+ struct action_data *data,
+ char *system, char *event, char *var)
{
struct hist_field *hist_field;
@@ -3660,24 +4030,24 @@ onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data,
hist_field = find_target_event_var(hist_data, system, event, var);
if (!hist_field) {
- if (!system) {
- system = data->onmatch.match_event_system;
- event = data->onmatch.match_event;
+ if (!system && data->handler == HANDLER_ONMATCH) {
+ system = data->match_data.event_system;
+ event = data->match_data.event;
}
hist_field = find_event_var(hist_data, system, event, var);
}
if (!hist_field)
- hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var);
+ hist_err_event("trace action: Couldn't find param: $", system, event, var);
return hist_field;
}
static struct hist_field *
-onmatch_create_field_var(struct hist_trigger_data *hist_data,
- struct action_data *data, char *system,
- char *event, char *var)
+trace_action_create_field_var(struct hist_trigger_data *hist_data,
+ struct action_data *data, char *system,
+ char *event, char *var)
{
struct hist_field *hist_field = NULL;
struct field_var *field_var;
@@ -3700,9 +4070,9 @@ onmatch_create_field_var(struct hist_trigger_data *hist_data,
* looking for fields on the onmatch(system.event.xxx)
* event.
*/
- if (!system) {
- system = data->onmatch.match_event_system;
- event = data->onmatch.match_event;
+ if (!system && data->handler == HANDLER_ONMATCH) {
+ system = data->match_data.event_system;
+ event = data->match_data.event;
}
/*
@@ -3724,24 +4094,30 @@ onmatch_create_field_var(struct hist_trigger_data *hist_data,
goto out;
}
-static int onmatch_create(struct hist_trigger_data *hist_data,
- struct trace_event_file *file,
- struct action_data *data)
+static int trace_action_create(struct hist_trigger_data *hist_data,
+ struct action_data *data)
{
char *event_name, *param, *system = NULL;
struct hist_field *hist_field, *var_ref;
unsigned int i, var_ref_idx;
unsigned int field_pos = 0;
struct synth_event *event;
+ char *synth_event_name;
int ret = 0;
lockdep_assert_held(&event_mutex);
- event = find_synth_event(data->onmatch.synth_event_name);
+ if (data->use_trace_keyword)
+ synth_event_name = data->synth_event_name;
+ else
+ synth_event_name = data->action_name;
+
+ event = find_synth_event(synth_event_name);
if (!event) {
- hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name);
+ hist_err("trace action: Couldn't find synthetic event: ", synth_event_name);
return -EINVAL;
}
+
event->ref++;
var_ref_idx = hist_data->n_var_refs;
@@ -3769,13 +4145,15 @@ static int onmatch_create(struct hist_trigger_data *hist_data,
}
if (param[0] == '$')
- hist_field = onmatch_find_var(hist_data, data, system,
- event_name, param);
+ hist_field = trace_action_find_var(hist_data, data,
+ system, event_name,
+ param);
else
- hist_field = onmatch_create_field_var(hist_data, data,
- system,
- event_name,
- param);
+ hist_field = trace_action_create_field_var(hist_data,
+ data,
+ system,
+ event_name,
+ param);
if (!hist_field) {
kfree(p);
@@ -3797,7 +4175,7 @@ static int onmatch_create(struct hist_trigger_data *hist_data,
continue;
}
- hist_err_event("onmatch: Param type doesn't match synthetic event field type: ",
+ hist_err_event("trace action: Param type doesn't match synthetic event field type: ",
system, event_name, param);
kfree(p);
ret = -EINVAL;
@@ -3805,14 +4183,13 @@ static int onmatch_create(struct hist_trigger_data *hist_data,
}
if (field_pos != event->n_fields) {
- hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name);
+ hist_err("trace action: Param count doesn't match synthetic event field count: ", event->name);
ret = -EINVAL;
goto err;
}
- data->fn = action_trace;
- data->onmatch.synth_event = event;
- data->onmatch.var_ref_idx = var_ref_idx;
+ data->synth_event = event;
+ data->var_ref_idx = var_ref_idx;
out:
return ret;
err:
@@ -3821,10 +4198,75 @@ static int onmatch_create(struct hist_trigger_data *hist_data,
goto out;
}
+static int action_create(struct hist_trigger_data *hist_data,
+ struct action_data *data)
+{
+ struct trace_event_file *file = hist_data->event_file;
+ struct track_data *track_data;
+ struct field_var *field_var;
+ unsigned int i;
+ char *param;
+ int ret = 0;
+
+ if (data->action == ACTION_TRACE)
+ return trace_action_create(hist_data, data);
+
+ if (data->action == ACTION_SNAPSHOT) {
+ track_data = track_data_alloc(hist_data->key_size, data, hist_data);
+ if (IS_ERR(track_data)) {
+ ret = PTR_ERR(track_data);
+ goto out;
+ }
+
+ ret = tracing_snapshot_cond_enable(file->tr, track_data,
+ cond_snapshot_update);
+ if (ret)
+ track_data_free(track_data);
+
+ goto out;
+ }
+
+ if (data->action == ACTION_SAVE) {
+ if (hist_data->n_save_vars) {
+ ret = -EEXIST;
+ hist_err("save action: Can't have more than one save() action per hist", "");
+ goto out;
+ }
+
+ for (i = 0; i < data->n_params; i++) {
+ param = kstrdup(data->params[i], GFP_KERNEL);
+ if (!param) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ field_var = create_target_field_var(hist_data, NULL, NULL, param);
+ if (IS_ERR(field_var)) {
+ hist_err("save action: Couldn't create field variable: ", param);
+ ret = PTR_ERR(field_var);
+ kfree(param);
+ goto out;
+ }
+
+ hist_data->save_vars[hist_data->n_save_vars++] = field_var;
+ if (field_var->val->flags & HIST_FIELD_FL_STRING)
+ hist_data->n_save_var_str++;
+ kfree(param);
+ }
+ }
+ out:
+ return ret;
+}
+
+static int onmatch_create(struct hist_trigger_data *hist_data,
+ struct action_data *data)
+{
+ return action_create(hist_data, data);
+}
+
static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
{
char *match_event, *match_event_system;
- char *synth_event_name, *params;
struct action_data *data;
int ret = -EINVAL;
@@ -3850,43 +4292,19 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
goto free;
}
- data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL);
- if (!data->onmatch.match_event) {
- ret = -ENOMEM;
- goto free;
- }
-
- data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL);
- if (!data->onmatch.match_event_system) {
+ data->match_data.event = kstrdup(match_event, GFP_KERNEL);
+ if (!data->match_data.event) {
ret = -ENOMEM;
goto free;
}
- strsep(&str, ".");
- if (!str) {
- hist_err("onmatch: Missing . after onmatch(): ", str);
- goto free;
- }
-
- synth_event_name = strsep(&str, "(");
- if (!synth_event_name || !str) {
- hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name);
- goto free;
- }
-
- data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL);
- if (!data->onmatch.synth_event_name) {
+ data->match_data.event_system = kstrdup(match_event_system, GFP_KERNEL);
+ if (!data->match_data.event_system) {
ret = -ENOMEM;
goto free;
}
- params = strsep(&str, ")");
- if (!params || !str || (str && strlen(str))) {
- hist_err("onmatch: Missing closing paramlist paren: ", params);
- goto free;
- }
-
- ret = parse_action_params(params, data);
+ ret = action_parse(str, data, HANDLER_ONMATCH);
if (ret)
goto free;
out:
@@ -4326,10 +4744,11 @@ static void destroy_actions(struct hist_trigger_data *hist_data)
for (i = 0; i < hist_data->n_actions; i++) {
struct action_data *data = hist_data->actions[i];
- if (data->fn == action_trace)
+ if (data->handler == HANDLER_ONMATCH)
onmatch_destroy(data);
- else if (data->fn == onmax_save)
- onmax_destroy(data);
+ else if (data->handler == HANDLER_ONMAX ||
+ data->handler == HANDLER_ONCHANGE)
+ track_data_destroy(hist_data, data);
else
kfree(data);
}
@@ -4355,16 +4774,24 @@ static int parse_actions(struct hist_trigger_data *hist_data)
ret = PTR_ERR(data);
break;
}
- data->fn = action_trace;
} else if ((len = str_has_prefix(str, "onmax("))) {
char *action_str = str + len;
- data = onmax_parse(action_str);
+ data = track_data_parse(hist_data, action_str,
+ HANDLER_ONMAX);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ break;
+ }
+ } else if ((len = str_has_prefix(str, "onchange("))) {
+ char *action_str = str + len;
+
+ data = track_data_parse(hist_data, action_str,
+ HANDLER_ONCHANGE);
if (IS_ERR(data)) {
ret = PTR_ERR(data);
break;
}
- data->fn = onmax_save;
} else {
ret = -EINVAL;
break;
@@ -4376,8 +4803,7 @@ static int parse_actions(struct hist_trigger_data *hist_data)
return ret;
}
-static int create_actions(struct hist_trigger_data *hist_data,
- struct trace_event_file *file)
+static int create_actions(struct hist_trigger_data *hist_data)
{
struct action_data *data;
unsigned int i;
@@ -4386,14 +4812,18 @@ static int create_actions(struct hist_trigger_data *hist_data,
for (i = 0; i < hist_data->attrs->n_actions; i++) {
data = hist_data->actions[i];
- if (data->fn == action_trace) {
- ret = onmatch_create(hist_data, file, data);
+ if (data->handler == HANDLER_ONMATCH) {
+ ret = onmatch_create(hist_data, data);
if (ret)
- return ret;
- } else if (data->fn == onmax_save) {
- ret = onmax_create(hist_data, data);
+ break;
+ } else if (data->handler == HANDLER_ONMAX ||
+ data->handler == HANDLER_ONCHANGE) {
+ ret = track_data_create(hist_data, data);
if (ret)
- return ret;
+ break;
+ } else {
+ ret = -EINVAL;
+ break;
}
}
@@ -4409,26 +4839,51 @@ static void print_actions(struct seq_file *m,
for (i = 0; i < hist_data->n_actions; i++) {
struct action_data *data = hist_data->actions[i];
- if (data->fn == onmax_save)
- onmax_print(m, hist_data, elt, data);
+ if (data->action == ACTION_SNAPSHOT)
+ continue;
+
+ if (data->handler == HANDLER_ONMAX ||
+ data->handler == HANDLER_ONCHANGE)
+ track_data_print(m, hist_data, elt, data);
}
}
-static void print_onmax_spec(struct seq_file *m,
- struct hist_trigger_data *hist_data,
- struct action_data *data)
+static void print_action_spec(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ struct action_data *data)
{
unsigned int i;
- seq_puts(m, ":onmax(");
- seq_printf(m, "%s", data->onmax.var_str);
- seq_printf(m, ").%s(", data->onmax.fn_name);
-
- for (i = 0; i < hist_data->n_max_vars; i++) {
- seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name);
- if (i < hist_data->n_max_vars - 1)
- seq_puts(m, ",");
+ if (data->action == ACTION_SAVE) {
+ for (i = 0; i < hist_data->n_save_vars; i++) {
+ seq_printf(m, "%s", hist_data->save_vars[i]->var->var.name);
+ if (i < hist_data->n_save_vars - 1)
+ seq_puts(m, ",");
+ }
+ } else if (data->action == ACTION_TRACE) {
+ if (data->use_trace_keyword)
+ seq_printf(m, "%s", data->synth_event_name);
+ for (i = 0; i < data->n_params; i++) {
+ if (i || data->use_trace_keyword)
+ seq_puts(m, ",");
+ seq_printf(m, "%s", data->params[i]);
+ }
}
+}
+
+static void print_track_data_spec(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ struct action_data *data)
+{
+ if (data->handler == HANDLER_ONMAX)
+ seq_puts(m, ":onmax(");
+ else if (data->handler == HANDLER_ONCHANGE)
+ seq_puts(m, ":onchange(");
+ seq_printf(m, "%s", data->track_data.var_str);
+ seq_printf(m, ").%s(", data->action_name);
+
+ print_action_spec(m, hist_data, data);
+
seq_puts(m, ")");
}
@@ -4436,18 +4891,12 @@ static void print_onmatch_spec(struct seq_file *m,
struct hist_trigger_data *hist_data,
struct action_data *data)
{
- unsigned int i;
-
- seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system,
- data->onmatch.match_event);
+ seq_printf(m, ":onmatch(%s.%s).", data->match_data.event_system,
+ data->match_data.event);
- seq_printf(m, "%s(", data->onmatch.synth_event->name);
+ seq_printf(m, "%s(", data->action_name);
- for (i = 0; i < data->n_params; i++) {
- if (i)
- seq_puts(m, ",");
- seq_printf(m, "%s", data->params[i]);
- }
+ print_action_spec(m, hist_data, data);
seq_puts(m, ")");
}
@@ -4463,8 +4912,11 @@ static bool actions_match(struct hist_trigger_data *hist_data,
for (i = 0; i < hist_data->n_actions; i++) {
struct action_data *data = hist_data->actions[i];
struct action_data *data_test = hist_data_test->actions[i];
+ char *action_name, *action_name_test;
- if (data->fn != data_test->fn)
+ if (data->handler != data_test->handler)
+ return false;
+ if (data->action != data_test->action)
return false;
if (data->n_params != data_test->n_params)
@@ -4475,22 +4927,30 @@ static bool actions_match(struct hist_trigger_data *hist_data,
return false;
}
- if (data->fn == action_trace) {
- if (strcmp(data->onmatch.synth_event_name,
- data_test->onmatch.synth_event_name) != 0)
- return false;
- if (strcmp(data->onmatch.match_event_system,
- data_test->onmatch.match_event_system) != 0)
- return false;
- if (strcmp(data->onmatch.match_event,
- data_test->onmatch.match_event) != 0)
+ if (data->use_trace_keyword)
+ action_name = data->synth_event_name;
+ else
+ action_name = data->action_name;
+
+ if (data_test->use_trace_keyword)
+ action_name_test = data_test->synth_event_name;
+ else
+ action_name_test = data_test->action_name;
+
+ if (strcmp(action_name, action_name_test) != 0)
+ return false;
+
+ if (data->handler == HANDLER_ONMATCH) {
+ if (strcmp(data->match_data.event_system,
+ data_test->match_data.event_system) != 0)
return false;
- } else if (data->fn == onmax_save) {
- if (strcmp(data->onmax.var_str,
- data_test->onmax.var_str) != 0)
+ if (strcmp(data->match_data.event,
+ data_test->match_data.event) != 0)
return false;
- if (strcmp(data->onmax.fn_name,
- data_test->onmax.fn_name) != 0)
+ } else if (data->handler == HANDLER_ONMAX ||
+ data->handler == HANDLER_ONCHANGE) {
+ if (strcmp(data->track_data.var_str,
+ data_test->track_data.var_str) != 0)
return false;
}
}
@@ -4507,10 +4967,11 @@ static void print_actions_spec(struct seq_file *m,
for (i = 0; i < hist_data->n_actions; i++) {
struct action_data *data = hist_data->actions[i];
- if (data->fn == action_trace)
+ if (data->handler == HANDLER_ONMATCH)
print_onmatch_spec(m, hist_data, data);
- else if (data->fn == onmax_save)
- print_onmax_spec(m, hist_data, data);
+ else if (data->handler == HANDLER_ONMAX ||
+ data->handler == HANDLER_ONCHANGE)
+ print_track_data_spec(m, hist_data, data);
}
}
@@ -4695,22 +5156,24 @@ static inline void add_to_key(char *compound_key, void *key,
/* ensure NULL-termination */
if (size > key_field->size - 1)
size = key_field->size - 1;
- }
- memcpy(compound_key + key_field->offset, key, size);
+ strncpy(compound_key + key_field->offset, (char *)key, size);
+ } else
+ memcpy(compound_key + key_field->offset, key, size);
}
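
The memcpy() to strncpy() switch for string keys matters because the source is a NUL-terminated string inside a key_field->size-wide field; a hypothetical 16-byte comm key illustrates the difference:

/*
 * comm = "bash", key_field->size = 16:
 *   memcpy copied size bytes regardless of the NUL, so whatever happened
 *   to sit after "bash" in the source buffer became part of the compound
 *   key (the same comm could hash to several different entries);
 *   strncpy stops at the NUL and the rest of the (pre-zeroed) compound
 *   key stays untouched, so each comm value maps to one stable key.
 */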
static void
hist_trigger_actions(struct hist_trigger_data *hist_data,
struct tracing_map_elt *elt, void *rec,
- struct ring_buffer_event *rbe, u64 *var_ref_vals)
+ struct ring_buffer_event *rbe, void *key,
+ u64 *var_ref_vals)
{
struct action_data *data;
unsigned int i;
for (i = 0; i < hist_data->n_actions; i++) {
data = hist_data->actions[i];
- data->fn(hist_data, elt, rec, rbe, data, var_ref_vals);
+ data->fn(hist_data, elt, rec, rbe, key, data, var_ref_vals);
}
}
@@ -4771,7 +5234,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
if (resolve_var_refs(hist_data, key, var_ref_vals, true))
- hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals);
+ hist_trigger_actions(hist_data, elt, rec, rbe, key, var_ref_vals);
}
static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -4792,10 +5255,10 @@ static void hist_trigger_stacktrace_print(struct seq_file *m,
}
}
-static void
-hist_trigger_entry_print(struct seq_file *m,
- struct hist_trigger_data *hist_data, void *key,
- struct tracing_map_elt *elt)
+static void hist_trigger_print_key(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ void *key,
+ struct tracing_map_elt *elt)
{
struct hist_field *key_field;
char str[KSYM_SYMBOL_LEN];
@@ -4871,6 +5334,17 @@ hist_trigger_entry_print(struct seq_file *m,
seq_puts(m, " ");
seq_puts(m, "}");
+}
+
+static void hist_trigger_entry_print(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ void *key,
+ struct tracing_map_elt *elt)
+{
+ const char *field_name;
+ unsigned int i;
+
+ hist_trigger_print_key(m, hist_data, key, elt);
seq_printf(m, " hitcount: %10llu",
tracing_map_read_sum(elt, HITCOUNT_IDX));
@@ -4937,6 +5411,8 @@ static void hist_trigger_show(struct seq_file *m,
if (n_entries < 0)
n_entries = 0;
+ track_data_snapshot_print(m, hist_data);
+
seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n",
(u64)atomic64_read(&hist_data->map->hits),
n_entries, (u64)atomic64_read(&hist_data->map->drops));
@@ -5683,7 +6159,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
if (has_hist_vars(hist_data))
save_hist_vars(hist_data);
- ret = create_actions(hist_data, file);
+ ret = create_actions(hist_data);
if (ret)
goto out_unreg;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index c2af1560e856..69ebf3c2f1b5 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -380,6 +380,7 @@ static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
{
trace_seq_putc(s, ' ');
trace_print_lat_fmt(s, entry);
+ trace_seq_puts(s, " | ");
}
/* If the pid changed since the last trace, output this event */
@@ -501,6 +502,17 @@ static void print_graph_abs_time(u64 t, struct trace_seq *s)
}
static void
+print_graph_rel_time(struct trace_iterator *iter, struct trace_seq *s)
+{
+ unsigned long long usecs;
+
+ usecs = iter->ts - iter->trace_buffer->time_start;
+ do_div(usecs, NSEC_PER_USEC);
+
+ trace_seq_printf(s, "%9llu us | ", usecs);
+}
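
A quick worked example of the new helper, for an event stamped 1,234,567 ns after the buffer's time_start:

	unsigned long long usecs = 1234567;	/* iter->ts - time_start, in ns */

	do_div(usecs, NSEC_PER_USEC);		/* usecs == 1234 */
	/* print_graph_rel_time() then emits "     1234 us | " */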
+
+static void
print_graph_irq(struct trace_iterator *iter, unsigned long addr,
enum trace_type type, int cpu, pid_t pid, u32 flags)
{
@@ -517,6 +529,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
print_graph_abs_time(iter->ts, s);
+ /* Relative time */
+ if (flags & TRACE_GRAPH_PRINT_REL_TIME)
+ print_graph_rel_time(iter, s);
+
/* Cpu */
if (flags & TRACE_GRAPH_PRINT_CPU)
print_graph_cpu(s, cpu);
@@ -725,6 +741,10 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
print_graph_abs_time(iter->ts, s);
+ /* Relative time */
+ if (flags & TRACE_GRAPH_PRINT_REL_TIME)
+ print_graph_rel_time(iter, s);
+
/* Cpu */
if (flags & TRACE_GRAPH_PRINT_CPU)
print_graph_cpu(s, cpu);
@@ -1101,6 +1121,8 @@ static void print_lat_header(struct seq_file *s, u32 flags)
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
size += 16;
+ if (flags & TRACE_GRAPH_PRINT_REL_TIME)
+ size += 16;
if (flags & TRACE_GRAPH_PRINT_CPU)
size += 4;
if (flags & TRACE_GRAPH_PRINT_PROC)
@@ -1125,12 +1147,14 @@ static void __print_graph_headers_flags(struct trace_array *tr,
seq_putc(s, '#');
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
seq_puts(s, " TIME ");
+ if (flags & TRACE_GRAPH_PRINT_REL_TIME)
+ seq_puts(s, " REL TIME ");
if (flags & TRACE_GRAPH_PRINT_CPU)
seq_puts(s, " CPU");
if (flags & TRACE_GRAPH_PRINT_PROC)
seq_puts(s, " TASK/PID ");
if (lat)
- seq_puts(s, "||||");
+ seq_puts(s, "|||| ");
if (flags & TRACE_GRAPH_PRINT_DURATION)
seq_puts(s, " DURATION ");
seq_puts(s, " FUNCTION CALLS\n");
@@ -1139,12 +1163,14 @@ static void __print_graph_headers_flags(struct trace_array *tr,
seq_putc(s, '#');
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
seq_puts(s, " | ");
+ if (flags & TRACE_GRAPH_PRINT_REL_TIME)
+ seq_puts(s, " | ");
if (flags & TRACE_GRAPH_PRINT_CPU)
seq_puts(s, " | ");
if (flags & TRACE_GRAPH_PRINT_PROC)
seq_puts(s, " | | ");
if (lat)
- seq_puts(s, "||||");
+ seq_puts(s, "|||| ");
if (flags & TRACE_GRAPH_PRINT_DURATION)
seq_puts(s, " | | ");
seq_puts(s, " | | | |\n");
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index d3294721f119..a745b0cee5d3 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -14,6 +14,7 @@
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ftrace.h>
+#include <linux/kprobes.h>
#include "trace.h"
@@ -238,7 +239,7 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
TRACE_GRAPH_PRINT_PROC | \
- TRACE_GRAPH_PRINT_ABS_TIME | \
+ TRACE_GRAPH_PRINT_REL_TIME | \
TRACE_GRAPH_PRINT_DURATION)
static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
@@ -365,7 +366,7 @@ out:
__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
}
-static inline void
+static nokprobe_inline void
start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
{
int cpu;
@@ -401,7 +402,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
atomic_dec(&data->disabled);
}
-static inline void
+static nokprobe_inline void
stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
{
int cpu;
@@ -443,6 +444,7 @@ void start_critical_timings(void)
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
}
EXPORT_SYMBOL_GPL(start_critical_timings);
+NOKPROBE_SYMBOL(start_critical_timings);
void stop_critical_timings(void)
{
@@ -452,6 +454,7 @@ void stop_critical_timings(void)
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
}
EXPORT_SYMBOL_GPL(stop_critical_timings);
+NOKPROBE_SYMBOL(stop_critical_timings);
#ifdef CONFIG_FUNCTION_TRACER
static bool function_enabled;
@@ -611,6 +614,7 @@ void tracer_hardirqs_on(unsigned long a0, unsigned long a1)
if (!preempt_trace(pc) && irq_trace())
stop_critical_timing(a0, a1, pc);
}
+NOKPROBE_SYMBOL(tracer_hardirqs_on);
void tracer_hardirqs_off(unsigned long a0, unsigned long a1)
{
@@ -619,6 +623,7 @@ void tracer_hardirqs_off(unsigned long a0, unsigned long a1)
if (!preempt_trace(pc) && irq_trace())
start_critical_timing(a0, a1, pc);
}
+NOKPROBE_SYMBOL(tracer_hardirqs_off);
static int irqsoff_tracer_init(struct trace_array *tr)
{
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index d953c163a079..810d78a8d14c 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -51,14 +51,16 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
if (cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter.buffer_iter[cpu] =
- ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);
+ ring_buffer_read_prepare(iter.trace_buffer->buffer,
+ cpu, GFP_ATOMIC);
ring_buffer_read_start(iter.buffer_iter[cpu]);
tracing_iter_reset(&iter, cpu);
}
} else {
iter.cpu_file = cpu_file;
iter.buffer_iter[cpu_file] =
- ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);
+ ring_buffer_read_prepare(iter.trace_buffer->buffer,
+ cpu_file, GFP_ATOMIC);
ring_buffer_read_start(iter.buffer_iter[cpu_file]);
tracing_iter_reset(&iter, cpu_file);
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9eaf07f99212..5d5129b05df7 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -35,7 +35,7 @@ static struct dyn_event_operations trace_kprobe_ops = {
.match = trace_kprobe_match,
};
-/**
+/*
* Kprobe event core functions
*/
struct trace_kprobe {
@@ -221,7 +221,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
tk->rp.maxactive = maxactive;
- if (!event || !is_good_name(event)) {
+ if (!event || !group) {
ret = -EINVAL;
goto error;
}
@@ -231,11 +231,6 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
if (!tk->tp.call.name)
goto error;
- if (!group || !is_good_name(group)) {
- ret = -EINVAL;
- goto error;
- }
-
tk->tp.class.system = kstrdup(group, GFP_KERNEL);
if (!tk->tp.class.system)
goto error;
@@ -624,7 +619,11 @@ static int trace_kprobe_create(int argc, const char *argv[])
if (event)
event++;
- if (is_return && isdigit(argv[0][1])) {
+ if (isdigit(argv[0][1])) {
+ if (!is_return) {
+ pr_info("Maxactive is not for kprobe");
+ return -EINVAL;
+ }
if (event)
len = event - &argv[0][1] - 1;
else
@@ -634,8 +633,8 @@ static int trace_kprobe_create(int argc, const char *argv[])
memcpy(buf, &argv[0][1], len);
buf[len] = '\0';
ret = kstrtouint(buf, 0, &maxactive);
- if (ret) {
- pr_info("Failed to parse maxactive.\n");
+ if (ret || !maxactive) {
+ pr_info("Invalid maxactive number\n");
return ret;
}
/* kretprobes instances are iterated over via a list. The
@@ -694,9 +693,9 @@ static int trace_kprobe_create(int argc, const char *argv[])
tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
argc, is_return);
if (IS_ERR(tk)) {
- pr_info("Failed to allocate trace_probe.(%d)\n",
- (int)PTR_ERR(tk));
ret = PTR_ERR(tk);
+ /* This must return -ENOMEM otherwise there is a bug */
+ WARN_ON_ONCE(ret != -ENOMEM);
goto out;
}
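
In kprobe_events terms the stricter parsing means (hypothetical probe definitions, shown only to illustrate the new checks):

  r100:my_kretprobe do_sys_open   accepted: kretprobe with maxactive 100
  r0:my_kretprobe do_sys_open     rejected: "Invalid maxactive number"
  p100:my_kprobe do_sys_open      rejected: "Maxactive is not for kprobe"

Previously a digit after 'p' was silently ignored and maxactive == 0 was accepted.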
@@ -865,7 +864,7 @@ fetch_store_strlen(unsigned long addr)
u8 c;
do {
- ret = probe_mem_read(&c, (u8 *)addr + len, 1);
+ ret = probe_kernel_read(&c, (u8 *)addr + len, 1);
len++;
} while (c && ret == 0 && len < MAX_STRING_SIZE);
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index 71f553cceb3c..4d8e99fdbbbe 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -9,6 +9,7 @@
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ftrace.h>
+#include <linux/kprobes.h>
#include "trace.h"
#define CREATE_TRACE_POINTS
@@ -30,6 +31,7 @@ void trace_hardirqs_on(void)
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
+NOKPROBE_SYMBOL(trace_hardirqs_on);
void trace_hardirqs_off(void)
{
@@ -43,6 +45,7 @@ void trace_hardirqs_off(void)
lockdep_hardirqs_off(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_off);
+NOKPROBE_SYMBOL(trace_hardirqs_off);
__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
{
@@ -56,6 +59,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
+NOKPROBE_SYMBOL(trace_hardirqs_on_caller);
__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
{
@@ -69,6 +73,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
lockdep_hardirqs_off(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_off_caller);
+NOKPROBE_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_TRACE_IRQFLAGS */
#ifdef CONFIG_TRACE_PREEMPT_TOGGLE
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 9962cb5da8ac..8f8411e7835f 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -13,7 +13,7 @@
#include "trace_probe.h"
-const char *reserved_field_names[] = {
+static const char *reserved_field_names[] = {
"common_type",
"common_flags",
"common_preempt_count",
@@ -159,6 +159,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
char *buf)
{
const char *slash, *event = *pevent;
+ int len;
slash = strchr(event, '/');
if (slash) {
@@ -171,12 +172,25 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
return -E2BIG;
}
strlcpy(buf, event, slash - event + 1);
+ if (!is_good_name(buf)) {
+ pr_info("Group name must follow the same rules as C identifiers\n");
+ return -EINVAL;
+ }
*pgroup = buf;
*pevent = slash + 1;
+ event = *pevent;
}
- if (strlen(event) == 0) {
+ len = strlen(event);
+ if (len == 0) {
pr_info("Event name is not specified\n");
return -EINVAL;
+ } else if (len > MAX_EVENT_NAME_LEN) {
+ pr_info("Event name is too long\n");
+ return -E2BIG;
+ }
+ if (!is_good_name(event)) {
+ pr_info("Event name must follow the same rules as C identifiers\n");
+ return -EINVAL;
}
return 0;
}
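
With the added is_good_name() and length checks, group and event names must both follow C-identifier rules and fit the existing size limits, e.g. (hypothetical names):

  mygroup/my_event_1      accepted
  2badgroup/my_event      rejected: group starts with a digit
  mygroup/bad-event       rejected: '-' is not a valid identifier character
  mygroup/<name longer than MAX_EVENT_NAME_LEN>   rejected with -E2BIG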
@@ -300,6 +314,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
case '+': /* deref memory */
arg++; /* Skip '+', because kstrtol() rejects it. */
+ /* fall through */
case '-':
tmp = strchr(arg, '(');
if (!tmp)
@@ -547,6 +562,8 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,
body = strchr(arg, '=');
if (body) {
+ if (body - arg > MAX_ARG_NAME_LEN || body == arg)
+ return -EINVAL;
parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL);
body++;
} else {
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 8a63f8bc01bc..2177c206de15 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -32,6 +32,7 @@
#define MAX_TRACE_ARGS 128
#define MAX_ARGSTR_LEN 63
#define MAX_ARRAY_LEN 64
+#define MAX_ARG_NAME_LEN 32
#define MAX_STRING_SIZE PATH_MAX
/* Reserved field names */
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 4ea7e6845efb..743b2b520d34 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -180,8 +180,11 @@ static void wakeup_trace_close(struct trace_iterator *iter)
}
#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \
- TRACE_GRAPH_PRINT_ABS_TIME | \
- TRACE_GRAPH_PRINT_DURATION)
+ TRACE_GRAPH_PRINT_CPU | \
+ TRACE_GRAPH_PRINT_REL_TIME | \
+ TRACE_GRAPH_PRINT_DURATION | \
+ TRACE_GRAPH_PRINT_OVERHEAD | \
+ TRACE_GRAPH_PRINT_IRQS)
static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
{
@@ -472,6 +475,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
+ __trace_stack(wakeup_trace, flags, 0, pc);
T0 = data->preempt_timestamp;
T1 = ftrace_now(cpu);
@@ -482,7 +486,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
if (likely(!is_tracing_stopped())) {
wakeup_trace->max_latency = delta;
- update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
+ update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu, NULL);
}
out_unlock:
@@ -583,6 +587,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
data->preempt_timestamp = ftrace_now(cpu);
tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
+ __trace_stack(wakeup_trace, flags, 0, pc);
/*
* We must be careful in using CALLER_ADDR2. But since wake_up
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f93a56d2db27..fa8fbff736d6 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -314,6 +314,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
struct ring_buffer_event *event;
struct ring_buffer *buffer;
unsigned long irq_flags;
+ unsigned long args[6];
int pc;
int syscall_nr;
int size;
@@ -347,7 +348,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
entry = ring_buffer_event_data(event);
entry->nr = syscall_nr;
- syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
+ syscall_get_arguments(current, regs, args);
+ memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
event_trigger_unlock_commit(trace_file, buffer, event, entry,
irq_flags, pc);
@@ -583,6 +585,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
struct hlist_head *head;
+ unsigned long args[6];
bool valid_prog_array;
int syscall_nr;
int rctx;
@@ -613,8 +616,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
return;
rec->nr = syscall_nr;
- syscall_get_arguments(current, regs, 0, sys_data->nb_args,
- (unsigned long *)&rec->args);
+ syscall_get_arguments(current, regs, args);
+ memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
if ((valid_prog_array &&
!perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9bde07c06362..be78d99ee6bc 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -273,10 +273,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
{
struct trace_uprobe *tu;
- if (!event || !is_good_name(event))
- return ERR_PTR(-EINVAL);
-
- if (!group || !is_good_name(group))
+ if (!event || !group)
return ERR_PTR(-EINVAL);
tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL);
@@ -524,8 +521,9 @@ static int trace_uprobe_create(int argc, const char **argv)
tu = alloc_trace_uprobe(group, event, argc, is_return);
if (IS_ERR(tu)) {
- pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
ret = PTR_ERR(tu);
+ /* This must return -ENOMEM otherwise there is a bug */
+ WARN_ON_ONCE(ret != -ENOMEM);
goto fail_address_parse;
}
tu->offset = offset;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 977918d5d350..6a5787233113 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -42,9 +42,9 @@ int __read_mostly watchdog_user_enabled = 1;
int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
int __read_mostly soft_watchdog_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
-int __read_mostly nmi_watchdog_available;
+static int __read_mostly nmi_watchdog_available;
-struct cpumask watchdog_allowed_mask __read_mostly;
+static struct cpumask watchdog_allowed_mask __read_mostly;
struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -199,6 +199,13 @@ static int __init nosoftlockup_setup(char *str)
}
__setup("nosoftlockup", nosoftlockup_setup);
+static int __init watchdog_thresh_setup(char *str)
+{
+ get_option(&str, &watchdog_thresh);
+ return 1;
+}
+__setup("watchdog_thresh=", watchdog_thresh_setup);
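
A minimal illustration of the new boot parameter, assuming the usual get_option() semantics: adding watchdog_thresh=30 to the kernel command line runs the hook during early __setup processing, i.e.

	/* cmdline contains "watchdog_thresh=30" */
	get_option(&str, &watchdog_thresh);	/* watchdog_thresh == 30 before the watchdog is set up */

so the threshold no longer has to be changed after boot via /proc/sys/kernel/watchdog_thresh.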
+
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
@@ -547,13 +554,15 @@ static void softlockup_start_all(void)
int lockup_detector_online_cpu(unsigned int cpu)
{
- watchdog_enable(cpu);
+ if (cpumask_test_cpu(cpu, &watchdog_allowed_mask))
+ watchdog_enable(cpu);
return 0;
}
int lockup_detector_offline_cpu(unsigned int cpu)
{
- watchdog_disable(cpu);
+ if (cpumask_test_cpu(cpu, &watchdog_allowed_mask))
+ watchdog_disable(cpu);
return 0;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fc5d23d752a5..ddee541ea97a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -259,6 +259,8 @@ struct workqueue_struct {
struct wq_device *wq_dev; /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
+ char *lock_name;
+ struct lock_class_key key;
struct lockdep_map lockdep_map;
#endif
char name[WQ_NAME_LEN]; /* I: workqueue name */
@@ -646,7 +648,7 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
* The following mb guarantees that previous clear of a PENDING bit
* will not be reordered with any speculative LOADS or STORES from
* work->current_func, which is executed afterwards. This possible
- * reordering can lead to a missed execution on attempt to qeueue
+ * reordering can lead to a missed execution on attempt to queue
* the same @work. E.g. consider this case:
*
* CPU#0 CPU#1
@@ -918,6 +920,16 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
* CONTEXT:
* spin_lock_irq(rq->lock)
*
+ * This function is called during schedule() when a kworker is going
+ * to sleep. It's used by psi to identify aggregation workers during
+ * dequeuing, to allow periodic aggregation to shut off when that
+ * worker is the last task in the system or cgroup to go to sleep.
+ *
+ * As this function doesn't involve any workqueue-related locking, it
+ * only returns stable values when called from inside the scheduler's
+ * queuing and dequeuing paths, when @task, which must be a kworker,
+ * is guaranteed to not be processing any works.
+ *
* Return:
* The last work function %current executed as a worker, NULL if it
* hasn't executed any work yet.
@@ -1341,7 +1353,7 @@ static bool is_chained_work(struct workqueue_struct *wq)
worker = current_wq_worker();
/*
- * Return %true iff I'm a worker execuing a work item on @wq. If
+ * Return %true iff I'm a worker executing a work item on @wq. If
* I'm @worker, it's safe to dereference it without locking.
*/
return worker && worker->current_pwq->wq == wq;
@@ -1512,6 +1524,90 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
}
EXPORT_SYMBOL(queue_work_on);
+/**
+ * workqueue_select_cpu_near - Select a CPU based on NUMA node
+ * @node: NUMA node ID that we want to select a CPU from
+ *
+ * This function will attempt to find a "random" cpu available on a given
+ * node. If there are no CPUs available on the given node it will return
+ * WORK_CPU_UNBOUND indicating that we should just schedule to any
+ * available CPU if we need to schedule this work.
+ */
+static int workqueue_select_cpu_near(int node)
+{
+ int cpu;
+
+ /* No point in doing this if NUMA isn't enabled for workqueues */
+ if (!wq_numa_enabled)
+ return WORK_CPU_UNBOUND;
+
+ /* Delay binding to CPU if node is not valid or online */
+ if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
+ return WORK_CPU_UNBOUND;
+
+ /* Use local node/cpu if we are already there */
+ cpu = raw_smp_processor_id();
+ if (node == cpu_to_node(cpu))
+ return cpu;
+
+ /* Use "random", otherwise known as "first", online CPU of node */
+ cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);
+
+ /* If CPU is valid return that, otherwise just defer */
+ return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND;
+}
+
+/**
+ * queue_work_node - queue work on a "random" cpu for a given NUMA node
+ * @node: NUMA node that we are targeting the work for
+ * @wq: workqueue to use
+ * @work: work to queue
+ *
+ * We queue the work to a "random" CPU within a given NUMA node. The basic
+ * idea here is to provide a way to somehow associate work with a given
+ * NUMA node.
+ *
+ * This function will only make a best effort attempt at getting this onto
+ * the right NUMA node. If no node is requested or the requested node is
+ * offline then we just fall back to standard queue_work behavior.
+ *
+ * Currently the "random" CPU ends up being the first available CPU in the
+ * intersection of cpu_online_mask and the cpumask of the node, unless we
+ * are running on the node. In that case we just use the current CPU.
+ *
+ * Return: %false if @work was already on a queue, %true otherwise.
+ */
+bool queue_work_node(int node, struct workqueue_struct *wq,
+ struct work_struct *work)
+{
+ unsigned long flags;
+ bool ret = false;
+
+ /*
+ * This current implementation is specific to unbound workqueues.
+ * Specifically we only return the first available CPU for a given
+ * node instead of cycling through individual CPUs within the node.
+ *
+ * If this is used with a per-cpu workqueue then the logic in
+ * workqueue_select_cpu_near would need to be updated to allow for
+ * some round robin type logic.
+ */
+ WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));
+
+ local_irq_save(flags);
+
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+ int cpu = workqueue_select_cpu_near(node);
+
+ __queue_work(cpu, wq, work);
+ ret = true;
+ }
+
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(queue_work_node);
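
A hypothetical caller sketch; example_dev, its work item and its WQ_UNBOUND workqueue are assumptions for illustration (the WARN_ON_ONCE above means the workqueue really must be unbound):

#include <linux/device.h>
#include <linux/workqueue.h>

struct example_dev {
	struct device *dev;
	struct workqueue_struct *wq;	/* allocated with WQ_UNBOUND */
	struct work_struct work;
};

static void example_submit(struct example_dev *ed)
{
	/* Prefer a CPU on the device's NUMA node; falls back to any CPU. */
	if (!queue_work_node(dev_to_node(ed->dev), ed->wq, &ed->work))
		pr_debug("example: work already pending\n");
}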
+
void delayed_work_timer_fn(struct timer_list *t)
{
struct delayed_work *dwork = from_timer(dwork, t, timer);
@@ -1639,7 +1735,7 @@ static void rcu_work_rcufn(struct rcu_head *rcu)
*
* Return: %false if @rwork was already pending, %true otherwise. Note
* that a full RCU grace period is guaranteed only after a %true return.
- * While @rwork is guarnateed to be executed after a %false return, the
+ * While @rwork is guaranteed to be executed after a %false return, the
* execution may happen before a full RCU grace period has passed.
*/
bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
@@ -2931,6 +3027,9 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
if (WARN_ON(!wq_online))
return false;
+ if (WARN_ON(!work->func))
+ return false;
+
if (!from_cancel) {
lock_map_acquire(&work->lockdep_map);
lock_map_release(&work->lockdep_map);
@@ -3337,11 +3436,51 @@ static int init_worker_pool(struct worker_pool *pool)
return 0;
}
+#ifdef CONFIG_LOCKDEP
+static void wq_init_lockdep(struct workqueue_struct *wq)
+{
+ char *lock_name;
+
+ lockdep_register_key(&wq->key);
+ lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
+ if (!lock_name)
+ lock_name = wq->name;
+
+ wq->lock_name = lock_name;
+ lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0);
+}
+
+static void wq_unregister_lockdep(struct workqueue_struct *wq)
+{
+ lockdep_unregister_key(&wq->key);
+}
+
+static void wq_free_lockdep(struct workqueue_struct *wq)
+{
+ if (wq->lock_name != wq->name)
+ kfree(wq->lock_name);
+}
+#else
+static void wq_init_lockdep(struct workqueue_struct *wq)
+{
+}
+
+static void wq_unregister_lockdep(struct workqueue_struct *wq)
+{
+}
+
+static void wq_free_lockdep(struct workqueue_struct *wq)
+{
+}
+#endif
+
static void rcu_free_wq(struct rcu_head *rcu)
{
struct workqueue_struct *wq =
container_of(rcu, struct workqueue_struct, rcu);
+ wq_free_lockdep(wq);
+
if (!(wq->flags & WQ_UNBOUND))
free_percpu(wq->cpu_pwqs);
else
@@ -3532,8 +3671,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
* If we're the last pwq going away, @wq is already dead and no one
* is gonna access it anymore. Schedule RCU free.
*/
- if (is_last)
+ if (is_last) {
+ wq_unregister_lockdep(wq);
call_rcu(&wq->rcu, rcu_free_wq);
+ }
}
/**
@@ -4067,11 +4208,9 @@ static int init_rescuer(struct workqueue_struct *wq)
return 0;
}
-struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
- unsigned int flags,
- int max_active,
- struct lock_class_key *key,
- const char *lock_name, ...)
+struct workqueue_struct *alloc_workqueue(const char *fmt,
+ unsigned int flags,
+ int max_active, ...)
{
size_t tbl_size = 0;
va_list args;
@@ -4106,7 +4245,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
goto err_free_wq;
}
- va_start(args, lock_name);
+ va_start(args, max_active);
vsnprintf(wq->name, sizeof(wq->name), fmt, args);
va_end(args);
@@ -4123,11 +4262,11 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
INIT_LIST_HEAD(&wq->flusher_overflow);
INIT_LIST_HEAD(&wq->maydays);
- lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
+ wq_init_lockdep(wq);
INIT_LIST_HEAD(&wq->list);
if (alloc_and_link_pwqs(wq) < 0)
- goto err_free_wq;
+ goto err_unreg_lockdep;
if (wq_online && init_rescuer(wq) < 0)
goto err_destroy;
@@ -4153,6 +4292,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
return wq;
+err_unreg_lockdep:
+ wq_unregister_lockdep(wq);
+ wq_free_lockdep(wq);
err_free_wq:
free_workqueue_attrs(wq->unbound_attrs);
kfree(wq);
@@ -4161,7 +4303,7 @@ err_destroy:
destroy_workqueue(wq);
return NULL;
}
-EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
+EXPORT_SYMBOL_GPL(alloc_workqueue);
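
For callers nothing changes syntactically: alloc_workqueue() keeps its (fmt, flags, max_active, ...) argument list, but it is now a real exported function that registers a dynamic per-workqueue lockdep key via wq_init_lockdep() instead of relying on the static key the old alloc_workqueue() macro planted at each call site. A minimal usage sketch (the name "example_wq" is arbitrary):

	struct workqueue_struct *wq;

	wq = alloc_workqueue("example_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!wq)
		return -ENOMEM;

	/* ... queue_work(wq, ...) ... */

	destroy_workqueue(wq);	/* key and lock_name are released when the wq is finally freed */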
/**
* destroy_workqueue - safely terminate a workqueue
@@ -4214,6 +4356,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
kthread_stop(wq->rescuer->task);
if (!(wq->flags & WQ_UNBOUND)) {
+ wq_unregister_lockdep(wq);
/*
* The base ref is never dropped on per-cpu pwqs. Directly
* schedule RCU free.