Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.kexec | 4
-rw-r--r--  kernel/async.c | 85
-rw-r--r--  kernel/audit.c | 31
-rw-r--r--  kernel/bpf/arraymap.c | 58
-rw-r--r--  kernel/bpf/core.c | 12
-rw-r--r--  kernel/bpf/memalloc.c | 2
-rw-r--r--  kernel/bpf/verifier.c | 402
-rw-r--r--  kernel/cgroup/cgroup-internal.h | 4
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 34
-rw-r--r--  kernel/cgroup/cgroup.c | 45
-rw-r--r--  kernel/cgroup/cpuset.c | 297
-rw-r--r--  kernel/cgroup/legacy_freezer.c | 8
-rw-r--r--  kernel/cgroup/rstat.c | 150
-rw-r--r--  kernel/cpu.c | 5
-rw-r--r--  kernel/crash_core.c | 101
-rw-r--r--  kernel/cred.c | 251
-rw-r--r--  kernel/dma/pool.c | 6
-rw-r--r--  kernel/dma/swiotlb.c | 4
-rw-r--r--  kernel/entry/common.c | 108
-rw-r--r--  kernel/events/core.c | 115
-rw-r--r--  kernel/events/ring_buffer.c | 10
-rw-r--r--  kernel/events/uprobes.c | 4
-rw-r--r--  kernel/exit.c | 3
-rw-r--r--  kernel/fork.c | 70
-rw-r--r--  kernel/freezer.c | 3
-rw-r--r--  kernel/kexec_core.c | 21
-rw-r--r--  kernel/kexec_file.c | 20
-rw-r--r--  kernel/kprobes.c | 4
-rw-r--r--  kernel/locking/lockdep.c | 3
-rw-r--r--  kernel/locking/mutex.c | 5
-rw-r--r--  kernel/locking/osq_lock.c | 37
-rw-r--r--  kernel/module/main.c | 3
-rw-r--r--  kernel/pid.c | 2
-rw-r--r--  kernel/power/hibernate.c | 10
-rw-r--r--  kernel/power/main.c | 16
-rw-r--r--  kernel/power/power.h | 2
-rw-r--r--  kernel/power/snapshot.c | 16
-rw-r--r--  kernel/power/swap.c | 41
-rw-r--r--  kernel/ptrace.c | 141
-rw-r--r--  kernel/reboot.c | 51
-rw-r--r--  kernel/relay.c | 162
-rw-r--r--  kernel/resource.c | 61
-rw-r--r--  kernel/sched/core.c | 140
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 90
-rw-r--r--  kernel/sched/deadline.c | 479
-rw-r--r--  kernel/sched/debug.c | 18
-rw-r--r--  kernel/sched/fair.c | 462
-rw-r--r--  kernel/sched/features.h | 1
-rw-r--r--  kernel/sched/idle.c | 30
-rw-r--r--  kernel/sched/pelt.h | 4
-rw-r--r--  kernel/sched/rt.c | 15
-rw-r--r--  kernel/sched/sched.h | 146
-rw-r--r--  kernel/sched/stop_task.c | 13
-rw-r--r--  kernel/seccomp.c | 2
-rw-r--r--  kernel/signal.c | 28
-rw-r--r--  kernel/stacktrace.c | 2
-rw-r--r--  kernel/sys_ni.c | 17
-rw-r--r--  kernel/time/posix-stubs.c | 45
-rw-r--r--  kernel/time/tick-internal.h | 3
-rw-r--r--  kernel/time/tick-sched.c | 25
-rw-r--r--  kernel/time/timer.c | 110
-rw-r--r--  kernel/trace/ftrace.c | 100
-rw-r--r--  kernel/trace/rethook.c | 23
-rw-r--r--  kernel/trace/ring_buffer.c | 209
-rw-r--r--  kernel/trace/synth_event_gen_test.c | 11
-rw-r--r--  kernel/trace/trace.c | 190
-rw-r--r--  kernel/trace/trace.h | 1
-rw-r--r--  kernel/trace/trace_events_hist.c | 12
-rw-r--r--  kernel/trace/trace_events_synth.c | 4
-rw-r--r--  kernel/trace/trace_events_user.c | 4
-rw-r--r--  kernel/trace/trace_output.c | 6
-rw-r--r--  kernel/user_namespace.c | 20
-rw-r--r--  kernel/watch_queue.c | 2
-rw-r--r--  kernel/watchdog.c | 40
-rw-r--r--  kernel/workqueue.c | 187
75 files changed, 2568 insertions(+), 2278 deletions(-)
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 7aff28ded2f4..946dffa048b7 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -36,6 +36,8 @@ config KEXEC
config KEXEC_FILE
bool "Enable kexec file based system call"
depends on ARCH_SUPPORTS_KEXEC_FILE
+ select CRYPTO
+ select CRYPTO_SHA256
select KEXEC_CORE
help
This is new version of kexec system call. This system call is
@@ -94,10 +96,8 @@ config KEXEC_JUMP
config CRASH_DUMP
bool "kernel crash dumps"
depends on ARCH_SUPPORTS_CRASH_DUMP
- depends on ARCH_SUPPORTS_KEXEC
select CRASH_CORE
select KEXEC_CORE
- select KEXEC
help
Generate crash dump after being started by kexec.
This should be normally only set in special crash dump kernels
diff --git a/kernel/async.c b/kernel/async.c
index b2c4ba5686ee..673bba6bdf3a 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -145,6 +145,39 @@ static void async_run_entry_fn(struct work_struct *work)
wake_up(&async_done);
}
+static async_cookie_t __async_schedule_node_domain(async_func_t func,
+ void *data, int node,
+ struct async_domain *domain,
+ struct async_entry *entry)
+{
+ async_cookie_t newcookie;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&entry->domain_list);
+ INIT_LIST_HEAD(&entry->global_list);
+ INIT_WORK(&entry->work, async_run_entry_fn);
+ entry->func = func;
+ entry->data = data;
+ entry->domain = domain;
+
+ spin_lock_irqsave(&async_lock, flags);
+
+ /* allocate cookie and queue */
+ newcookie = entry->cookie = next_cookie++;
+
+ list_add_tail(&entry->domain_list, &domain->pending);
+ if (domain->registered)
+ list_add_tail(&entry->global_list, &async_global_pending);
+
+ atomic_inc(&entry_count);
+ spin_unlock_irqrestore(&async_lock, flags);
+
+ /* schedule for execution */
+ queue_work_node(node, system_unbound_wq, &entry->work);
+
+ return newcookie;
+}
+
/**
* async_schedule_node_domain - NUMA specific version of async_schedule_domain
* @func: function to execute asynchronously
@@ -186,29 +219,8 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
func(data, newcookie);
return newcookie;
}
- INIT_LIST_HEAD(&entry->domain_list);
- INIT_LIST_HEAD(&entry->global_list);
- INIT_WORK(&entry->work, async_run_entry_fn);
- entry->func = func;
- entry->data = data;
- entry->domain = domain;
-
- spin_lock_irqsave(&async_lock, flags);
-
- /* allocate cookie and queue */
- newcookie = entry->cookie = next_cookie++;
-
- list_add_tail(&entry->domain_list, &domain->pending);
- if (domain->registered)
- list_add_tail(&entry->global_list, &async_global_pending);
-
- atomic_inc(&entry_count);
- spin_unlock_irqrestore(&async_lock, flags);
-
- /* schedule for execution */
- queue_work_node(node, system_unbound_wq, &entry->work);
- return newcookie;
+ return __async_schedule_node_domain(func, data, node, domain, entry);
}
EXPORT_SYMBOL_GPL(async_schedule_node_domain);
@@ -232,6 +244,35 @@ async_cookie_t async_schedule_node(async_func_t func, void *data, int node)
EXPORT_SYMBOL_GPL(async_schedule_node);
/**
+ * async_schedule_dev_nocall - A simplified variant of async_schedule_dev()
+ * @func: function to execute asynchronously
+ * @dev: device argument to be passed to function
+ *
+ * @dev is used as both the argument for the function and to provide NUMA
+ * context for where to run the function.
+ *
+ * If the asynchronous execution of @func is scheduled successfully, return
+ * true. Otherwise, do nothing and return false, unlike async_schedule_dev(),
+ * which will then run the function synchronously.
+ */
+bool async_schedule_dev_nocall(async_func_t func, struct device *dev)
+{
+ struct async_entry *entry;
+
+ entry = kzalloc(sizeof(struct async_entry), GFP_KERNEL);
+
+ /* Give up if there is no memory or too much work. */
+ if (!entry || atomic_read(&entry_count) > MAX_WORK) {
+ kfree(entry);
+ return false;
+ }
+
+ __async_schedule_node_domain(func, dev, dev_to_node(dev),
+ &async_dfl_domain, entry);
+ return true;
+}
+
+/**
* async_synchronize_full - synchronize all asynchronous function calls
*
* This function waits until all asynchronous function calls have been done.
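A note on the new helper: unlike async_schedule_dev(), which silently falls back to a synchronous call when scheduling fails, async_schedule_dev_nocall() reports failure and lets the caller decide. A minimal caller sketch with hypothetical driver names:

	/* Hypothetical caller: fall back to a synchronous call ourselves
	 * when async scheduling fails (no memory or too much work). */
	static void my_dev_resume_fn(void *data, async_cookie_t cookie)
	{
		struct device *dev = data;

		/* ... resume @dev here ... */
	}

	static void my_dev_resume(struct device *dev)
	{
		if (!async_schedule_dev_nocall(my_dev_resume_fn, dev))
			my_dev_resume_fn(dev, 0);	/* run it synchronously */
	}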
diff --git a/kernel/audit.c b/kernel/audit.c
index 16205dd29843..9c8e5f732c4c 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -487,15 +487,19 @@ static void auditd_conn_free(struct rcu_head *rcu)
* @pid: auditd PID
* @portid: auditd netlink portid
* @net: auditd network namespace pointer
+ * @skb: the netlink command from the audit daemon
+ * @ack: netlink ack flag, cleared if ack'd here
*
* Description:
* This function will obtain and drop network namespace references as
* necessary. Returns zero on success, negative values on failure.
*/
-static int auditd_set(struct pid *pid, u32 portid, struct net *net)
+static int auditd_set(struct pid *pid, u32 portid, struct net *net,
+ struct sk_buff *skb, bool *ack)
{
unsigned long flags;
struct auditd_connection *ac_old, *ac_new;
+ struct nlmsghdr *nlh;
if (!pid || !net)
return -EINVAL;
@@ -507,6 +511,13 @@ static int auditd_set(struct pid *pid, u32 portid, struct net *net)
ac_new->portid = portid;
ac_new->net = get_net(net);
+ /* send the ack now to avoid a race with the queue backlog */
+ if (*ack) {
+ nlh = nlmsg_hdr(skb);
+ netlink_ack(skb, nlh, 0, NULL);
+ *ack = false;
+ }
+
spin_lock_irqsave(&auditd_conn_lock, flags);
ac_old = rcu_dereference_protected(auditd_conn,
lockdep_is_held(&auditd_conn_lock));
@@ -1200,7 +1211,8 @@ static int audit_replace(struct pid *pid)
return auditd_send_unicast_skb(skb);
}
-static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
+ bool *ack)
{
u32 seq;
void *data;
@@ -1293,7 +1305,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* register a new auditd connection */
err = auditd_set(req_pid,
NETLINK_CB(skb).portid,
- sock_net(NETLINK_CB(skb).sk));
+ sock_net(NETLINK_CB(skb).sk),
+ skb, ack);
if (audit_enabled != AUDIT_OFF)
audit_log_config_change("audit_pid",
new_pid,
@@ -1538,9 +1551,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
* Parse the provided skb and deal with any messages that may be present,
* malformed skbs are discarded.
*/
-static void audit_receive(struct sk_buff  *skb)
+static void audit_receive(struct sk_buff *skb)
{
struct nlmsghdr *nlh;
+ bool ack;
/*
* len MUST be signed for nlmsg_next to be able to dec it below 0
* if the nlmsg_len was not aligned
@@ -1553,9 +1567,12 @@ static void audit_receive(struct sk_buff *skb)
audit_ctl_lock();
while (nlmsg_ok(nlh, len)) {
- err = audit_receive_msg(skb, nlh);
- /* if err or if this message says it wants a response */
- if (err || (nlh->nlmsg_flags & NLM_F_ACK))
+ ack = nlh->nlmsg_flags & NLM_F_ACK;
+ err = audit_receive_msg(skb, nlh, &ack);
+
+ /* send an ack if the user asked for one and audit_receive_msg
+ * didn't already do it, or if there was an error. */
+ if (ack || err)
netlink_ack(skb, nlh, err, NULL);
nlh = nlmsg_next(nlh, &len);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 2058e89b5ddd..c85ff9162a5c 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1012,11 +1012,16 @@ static void prog_array_map_poke_untrack(struct bpf_map *map,
mutex_unlock(&aux->poke_mutex);
}
+void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+ struct bpf_prog *new, struct bpf_prog *old)
+{
+ WARN_ON_ONCE(1);
+}
+
static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
struct bpf_prog *old,
struct bpf_prog *new)
{
- u8 *old_addr, *new_addr, *old_bypass_addr;
struct prog_poke_elem *elem;
struct bpf_array_aux *aux;
@@ -1025,7 +1030,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
list_for_each_entry(elem, &aux->poke_progs, list) {
struct bpf_jit_poke_descriptor *poke;
- int i, ret;
+ int i;
for (i = 0; i < elem->aux->size_poke_tab; i++) {
poke = &elem->aux->poke_tab[i];
@@ -1044,21 +1049,10 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
* activated, so tail call updates can arrive from here
* while JIT is still finishing its final fixup for
* non-activated poke entries.
- * 3) On program teardown, the program's kallsym entry gets
- * removed out of RCU callback, but we can only untrack
- * from sleepable context, therefore bpf_arch_text_poke()
- * might not see that this is in BPF text section and
- * bails out with -EINVAL. As these are unreachable since
- * RCU grace period already passed, we simply skip them.
- * 4) Also programs reaching refcount of zero while patching
+ * 3) Also programs reaching refcount of zero while patching
* is in progress is okay since we're protected under
* poke_mutex and untrack the programs before the JIT
- * buffer is freed. When we're still in the middle of
- * patching and suddenly kallsyms entry of the program
- * gets evicted, we just skip the rest which is fine due
- * to point 3).
- * 5) Any other error happening below from bpf_arch_text_poke()
- * is a unexpected bug.
+ * buffer is freed.
*/
if (!READ_ONCE(poke->tailcall_target_stable))
continue;
@@ -1068,39 +1062,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
poke->tail_call.key != key)
continue;
- old_bypass_addr = old ? NULL : poke->bypass_addr;
- old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
- new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
-
- if (new) {
- ret = bpf_arch_text_poke(poke->tailcall_target,
- BPF_MOD_JUMP,
- old_addr, new_addr);
- BUG_ON(ret < 0 && ret != -EINVAL);
- if (!old) {
- ret = bpf_arch_text_poke(poke->tailcall_bypass,
- BPF_MOD_JUMP,
- poke->bypass_addr,
- NULL);
- BUG_ON(ret < 0 && ret != -EINVAL);
- }
- } else {
- ret = bpf_arch_text_poke(poke->tailcall_bypass,
- BPF_MOD_JUMP,
- old_bypass_addr,
- poke->bypass_addr);
- BUG_ON(ret < 0 && ret != -EINVAL);
- /* let other CPUs finish the execution of program
- * so that it will not possible to expose them
- * to invalid nop, stack unwind, nop state
- */
- if (!ret)
- synchronize_rcu();
- ret = bpf_arch_text_poke(poke->tailcall_target,
- BPF_MOD_JUMP,
- old_addr, NULL);
- BUG_ON(ret < 0 && ret != -EINVAL);
- }
+ bpf_arch_poke_desc_update(poke, new, old);
}
}
}
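The hunk above routes tail-call patching through bpf_arch_poke_desc_update(), a __weak stub that a JIT-enabled architecture overrides with a strong definition. A minimal sketch of the weak/strong override pattern (the arch body is illustrative, not the actual x86 implementation):

	/* generic core: weak stub, only runs if no arch provides one */
	void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
					      struct bpf_prog *new, struct bpf_prog *old)
	{
		WARN_ON_ONCE(1);
	}

	/* arch JIT: a strong definition replaces the stub at link time */
	void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
				       struct bpf_prog *new, struct bpf_prog *old)
	{
		/* patch poke->tailcall_target / poke->tailcall_bypass here */
	}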
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index cd3afe57ece3..fe254ae035fe 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -371,14 +371,18 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
s32 end_new, s32 curr, const bool probe_pass)
{
- const s32 off_min = S16_MIN, off_max = S16_MAX;
+ s64 off_min, off_max, off;
s32 delta = end_new - end_old;
- s32 off;
- if (insn->code == (BPF_JMP32 | BPF_JA))
+ if (insn->code == (BPF_JMP32 | BPF_JA)) {
off = insn->imm;
- else
+ off_min = S32_MIN;
+ off_max = S32_MAX;
+ } else {
off = insn->off;
+ off_min = S16_MIN;
+ off_max = S16_MAX;
+ }
if (curr < pos && curr + off + 1 >= end_old)
off += delta;
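For context, the bounds now depend on the opcode because BPF has two unconditional-jump encodings of different widths: plain goto keeps a 16-bit offset in insn->off, while gotol (BPF_JMP32 | BPF_JA) keeps a 32-bit offset in insn->imm. Illustrative initializers (delta is a placeholder; field widths per struct bpf_insn):

	/* goto: 16-bit branch offset, insn->off is __s16 */
	struct bpf_insn ja = {
		.code = BPF_JMP | BPF_JA,
		.off  = delta,		/* S16_MIN..S16_MAX */
	};

	/* gotol: 32-bit branch offset, insn->imm is __s32 */
	struct bpf_insn gotol = {
		.code = BPF_JMP32 | BPF_JA,
		.imm  = delta,		/* S32_MIN..S32_MAX */
	};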
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 63b909d277d4..6a51cfe4c2d6 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -978,6 +978,8 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
memcg = get_memcg(c);
old_memcg = set_active_memcg(memcg);
ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
+ if (ret)
+ *(struct bpf_mem_cache **)ret = c;
set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
}
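The two added lines restore the allocator's invariant: an object returned by the GFP_KERNEL fallback path must carry its owning cache pointer in the hidden header word, just like objects taken from the per-CPU free lists, because the free path recovers the cache from there. A toy model of that pattern in plain C (not the kernel allocator):

	#include <stdlib.h>

	struct cache { int unit_size; /* per-cache state elided */ };

	static void *cache_alloc(struct cache *c)
	{
		void *obj = malloc(c->unit_size);

		if (obj)
			*(struct cache **)obj = c;	/* record the owner */
		return obj;
	}

	static void cache_free(void *obj)
	{
		struct cache *owner = *(struct cache **)obj; /* recover owner */

		(void)owner;	/* a real allocator returns obj to owner's list */
		free(obj);
	}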
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6da370a047fe..af2819d5c8ee 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -547,13 +547,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_dynptr_data;
}
-static bool is_callback_calling_kfunc(u32 btf_id);
+static bool is_sync_callback_calling_kfunc(u32 btf_id);
static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
-static bool is_callback_calling_function(enum bpf_func_id func_id)
+static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_for_each_map_elem ||
- func_id == BPF_FUNC_timer_set_callback ||
func_id == BPF_FUNC_find_vma ||
func_id == BPF_FUNC_loop ||
func_id == BPF_FUNC_user_ringbuf_drain;
@@ -564,6 +563,18 @@ static bool is_async_callback_calling_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_timer_set_callback;
}
+static bool is_callback_calling_function(enum bpf_func_id func_id)
+{
+ return is_sync_callback_calling_function(func_id) ||
+ is_async_callback_calling_function(func_id);
+}
+
+static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
+{
+ return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) ||
+ (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm));
+}
+
static bool is_storage_get_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_sk_storage_get ||
@@ -1808,6 +1819,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
dst_state->first_insn_idx = src->first_insn_idx;
dst_state->last_insn_idx = src->last_insn_idx;
dst_state->dfs_depth = src->dfs_depth;
+ dst_state->callback_unroll_depth = src->callback_unroll_depth;
dst_state->used_as_loop_entry = src->used_as_loop_entry;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
@@ -3439,13 +3451,11 @@ static void mark_insn_zext(struct bpf_verifier_env *env,
reg->subreg_def = DEF_NOT_SUBREG;
}
-static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
- enum reg_arg_type t)
+static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno,
+ enum reg_arg_type t)
{
- struct bpf_verifier_state *vstate = env->cur_state;
- struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
- struct bpf_reg_state *reg, *regs = state->regs;
+ struct bpf_reg_state *reg;
bool rw64;
if (regno >= MAX_BPF_REG) {
@@ -3486,6 +3496,15 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return 0;
}
+static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
+ enum reg_arg_type t)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+
+ return __check_reg_arg(env, state->regs, regno, t);
+}
+
static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
{
env->insn_aux_data[idx].jmp_point = true;
@@ -3724,6 +3743,8 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
}
}
+static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
+
/* For given verifier state backtrack_insn() is called from the last insn to
* the first insn. Its purpose is to compute a bitmask of registers and
* stack slots that needs precision in the parent verifier state.
@@ -3899,16 +3920,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
return -EFAULT;
return 0;
}
- } else if ((bpf_helper_call(insn) &&
- is_callback_calling_function(insn->imm) &&
- !is_async_callback_calling_function(insn->imm)) ||
- (bpf_pseudo_kfunc_call(insn) && is_callback_calling_kfunc(insn->imm))) {
- /* callback-calling helper or kfunc call, which means
- * we are exiting from subprog, but unlike the subprog
- * call handling above, we shouldn't propagate
- * precision of r1-r5 (if any requested), as they are
- * not actually arguments passed directly to callback
- * subprogs
+ } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) {
+ /* exit from callback subprog to callback-calling helper or
+ * kfunc call. Use idx/subseq_idx check to discern it from
+ * straight line code backtracking.
+ * Unlike the subprog call handling above, we shouldn't
+ * propagate precision of r1-r5 (if any requested), as they are
+ * not actually arguments passed directly to callback subprogs
*/
if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
@@ -3943,10 +3961,18 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
} else if (opcode == BPF_EXIT) {
bool r0_precise;
+ /* Backtracking through a nested function call: 'idx' is part of
+ * the inner frame, 'subseq_idx' is part of the outer frame.
+ * In case of a regular function call, instructions giving
+ * precision to registers R1-R5 should have been found already.
+ * In case of a callback, it is ok to have R1-R5 marked for
+ * backtracking, as these registers are set by the function
+ * invoking callback.
+ */
+ if (subseq_idx >= 0 && calls_callback(env, subseq_idx))
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ bt_clear_reg(bt, i);
if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- /* if backtracing was looking for registers R1-R5
- * they should have been found already.
- */
verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
@@ -9350,7 +9376,7 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env,
/* after the call registers r0 - r5 were scratched */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
- check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ __check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK);
}
}
@@ -9363,11 +9389,10 @@ static int set_callee_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee, int insn_idx);
-static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- int *insn_idx, int subprog,
- set_callee_state_fn set_callee_state_cb)
+static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite,
+ set_callee_state_fn set_callee_state_cb,
+ struct bpf_verifier_state *state)
{
- struct bpf_verifier_state *state = env->cur_state;
struct bpf_func_state *caller, *callee;
int err;
@@ -9377,54 +9402,72 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -E2BIG;
}
- caller = state->frame[state->curframe];
if (state->frame[state->curframe + 1]) {
verbose(env, "verifier bug. Frame %d already allocated\n",
state->curframe + 1);
return -EFAULT;
}
+ caller = state->frame[state->curframe];
+ callee = kzalloc(sizeof(*callee), GFP_KERNEL);
+ if (!callee)
+ return -ENOMEM;
+ state->frame[state->curframe + 1] = callee;
+
+ /* callee cannot access r0, r6 - r9 for reading and has to write
+ * into its own stack before reading from it.
+ * callee can read/write into caller's stack
+ */
+ init_func_state(env, callee,
+ /* remember the callsite, it will be used by bpf_exit */
+ callsite,
+ state->curframe + 1 /* frameno within this callchain */,
+ subprog /* subprog number within this prog */);
+ /* Transfer references to the callee */
+ err = copy_reference_state(callee, caller);
+ err = err ?: set_callee_state_cb(env, caller, callee, callsite);
+ if (err)
+ goto err_out;
+
+ /* only increment it after check_reg_arg() finished */
+ state->curframe++;
+
+ return 0;
+
+err_out:
+ free_func_state(callee);
+ state->frame[state->curframe + 1] = NULL;
+ return err;
+}
+
+static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int insn_idx, int subprog,
+ set_callee_state_fn set_callee_state_cb)
+{
+ struct bpf_verifier_state *state = env->cur_state, *callback_state;
+ struct bpf_func_state *caller, *callee;
+ int err;
+
+ caller = state->frame[state->curframe];
err = btf_check_subprog_call(env, subprog, caller->regs);
if (err == -EFAULT)
return err;
- if (subprog_is_global(env, subprog)) {
- if (err) {
- verbose(env, "Caller passes invalid args into func#%d\n",
- subprog);
- return err;
- } else {
- if (env->log.level & BPF_LOG_LEVEL)
- verbose(env,
- "Func#%d is global and valid. Skipping.\n",
- subprog);
- clear_caller_saved_regs(env, caller->regs);
-
- /* All global functions return a 64-bit SCALAR_VALUE */
- mark_reg_unknown(env, caller->regs, BPF_REG_0);
- caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
-
- /* continue with next insn after call */
- return 0;
- }
- }
/* set_callee_state is used for direct subprog calls, but we are
* interested in validating only BPF helpers that can call subprogs as
* callbacks
*/
- if (set_callee_state_cb != set_callee_state) {
- env->subprog_info[subprog].is_cb = true;
- if (bpf_pseudo_kfunc_call(insn) &&
- !is_callback_calling_kfunc(insn->imm)) {
- verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
- func_id_name(insn->imm), insn->imm);
- return -EFAULT;
- } else if (!bpf_pseudo_kfunc_call(insn) &&
- !is_callback_calling_function(insn->imm)) { /* helper */
- verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
- func_id_name(insn->imm), insn->imm);
- return -EFAULT;
- }
+ env->subprog_info[subprog].is_cb = true;
+ if (bpf_pseudo_kfunc_call(insn) &&
+ !is_sync_callback_calling_kfunc(insn->imm)) {
+ verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ } else if (!bpf_pseudo_kfunc_call(insn) &&
+ !is_callback_calling_function(insn->imm)) { /* helper */
+ verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
}
if (insn->code == (BPF_JMP | BPF_CALL) &&
@@ -9435,53 +9478,83 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* there is no real recursion here. timer callbacks are async */
env->subprog_info[subprog].is_async_cb = true;
async_cb = push_async_cb(env, env->subprog_info[subprog].start,
- *insn_idx, subprog);
+ insn_idx, subprog);
if (!async_cb)
return -EFAULT;
callee = async_cb->frame[0];
callee->async_entry_cnt = caller->async_entry_cnt + 1;
/* Convert bpf_timer_set_callback() args into timer callback args */
- err = set_callee_state_cb(env, caller, callee, *insn_idx);
+ err = set_callee_state_cb(env, caller, callee, insn_idx);
if (err)
return err;
+ return 0;
+ }
+
+ /* for callback functions enqueue entry to callback and
+ * proceed with next instruction within current frame.
+ */
+ callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false);
+ if (!callback_state)
+ return -ENOMEM;
+
+ err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb,
+ callback_state);
+ if (err)
+ return err;
+
+ callback_state->callback_unroll_depth++;
+ callback_state->frame[callback_state->curframe - 1]->callback_depth++;
+ caller->callback_depth = 0;
+ return 0;
+}
+
+static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_state *caller;
+ int err, subprog, target_insn;
+
+ target_insn = *insn_idx + insn->imm + 1;
+ subprog = find_subprog(env, target_insn);
+ if (subprog < 0) {
+ verbose(env, "verifier bug. No program starts at insn %d\n", target_insn);
+ return -EFAULT;
+ }
+
+ caller = state->frame[state->curframe];
+ err = btf_check_subprog_call(env, subprog, caller->regs);
+ if (err == -EFAULT)
+ return err;
+ if (subprog_is_global(env, subprog)) {
+ if (err) {
+ verbose(env, "Caller passes invalid args into func#%d\n", subprog);
+ return err;
+ }
+
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env, "Func#%d is global and valid. Skipping.\n", subprog);
clear_caller_saved_regs(env, caller->regs);
+
+ /* All global functions return a 64-bit SCALAR_VALUE */
mark_reg_unknown(env, caller->regs, BPF_REG_0);
caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+
/* continue with next insn after call */
return 0;
}
- callee = kzalloc(sizeof(*callee), GFP_KERNEL);
- if (!callee)
- return -ENOMEM;
- state->frame[state->curframe + 1] = callee;
-
- /* callee cannot access r0, r6 - r9 for reading and has to write
- * into its own stack before reading from it.
- * callee can read/write into caller's stack
+ /* for regular function entry setup new frame and continue
+ * from that frame.
*/
- init_func_state(env, callee,
- /* remember the callsite, it will be used by bpf_exit */
- *insn_idx /* callsite */,
- state->curframe + 1 /* frameno within this callchain */,
- subprog /* subprog number within this prog */);
-
- /* Transfer references to the callee */
- err = copy_reference_state(callee, caller);
+ err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state);
if (err)
- goto err_out;
-
- err = set_callee_state_cb(env, caller, callee, *insn_idx);
- if (err)
- goto err_out;
+ return err;
clear_caller_saved_regs(env, caller->regs);
- /* only increment it after check_reg_arg() finished */
- state->curframe++;
-
/* and go analyze first insn of the callee */
*insn_idx = env->subprog_info[subprog].start - 1;
@@ -9489,14 +9562,10 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
verbose(env, "caller:\n");
print_verifier_state(env, caller, true);
verbose(env, "callee:\n");
- print_verifier_state(env, callee, true);
+ print_verifier_state(env, state->frame[state->curframe], true);
}
- return 0;
-err_out:
- free_func_state(callee);
- state->frame[state->curframe + 1] = NULL;
- return err;
+ return 0;
}
int map_set_for_each_callback_args(struct bpf_verifier_env *env,
@@ -9540,22 +9609,6 @@ static int set_callee_state(struct bpf_verifier_env *env,
return 0;
}
-static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- int *insn_idx)
-{
- int subprog, target_insn;
-
- target_insn = *insn_idx + insn->imm + 1;
- subprog = find_subprog(env, target_insn);
- if (subprog < 0) {
- verbose(env, "verifier bug. No program starts at insn %d\n",
- target_insn);
- return -EFAULT;
- }
-
- return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
-}
-
static int set_map_elem_callback_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee,
@@ -9748,9 +9801,10 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
- struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state, *prev_st;
struct bpf_func_state *caller, *callee;
struct bpf_reg_state *r0;
+ bool in_callback_fn;
int err;
callee = state->frame[state->curframe];
@@ -9779,6 +9833,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
return -EINVAL;
}
+ if (!calls_callback(env, callee->callsite)) {
+ verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n",
+ *insn_idx, callee->callsite);
+ return -EFAULT;
+ }
} else {
/* return to the caller whatever r0 had in the callee */
caller->regs[BPF_REG_0] = *r0;
@@ -9796,7 +9855,16 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return err;
}
- *insn_idx = callee->callsite + 1;
+ /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to the
+ * callsite, where the function call logic will reschedule the callback
+ * visit. If the iteration converges, is_state_visited() will eventually
+ * prune that visit.
+ */
+ in_callback_fn = callee->in_callback_fn;
+ if (in_callback_fn)
+ *insn_idx = callee->callsite;
+ else
+ *insn_idx = callee->callsite + 1;
+
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "returning from callee:\n");
print_verifier_state(env, callee, true);
@@ -9807,6 +9875,24 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
* bpf_throw, this will be done by copy_verifier_state for extra frames. */
free_func_state(callee);
state->frame[state->curframe--] = NULL;
+
+ /* for callbacks widen imprecise scalars to make programs like below verify:
+ *
+ * struct ctx { int i; };
+ * void cb(int idx, struct ctx *ctx) { ctx->i++; ... }
+ * ...
+ * struct ctx ctx = { .i = 0 };
+ * bpf_loop(100, cb, &ctx, 0);
+ *
+ * This is similar to what is done in process_iter_next_call() for open
+ * coded iterators.
+ */
+ prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL;
+ if (prev_st) {
+ err = widen_imprecise_scalars(env, prev_st, state);
+ if (err)
+ return err;
+ }
return 0;
}
@@ -10209,24 +10295,37 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
break;
case BPF_FUNC_for_each_map_elem:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_map_elem_callback_state);
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_map_elem_callback_state);
break;
case BPF_FUNC_timer_set_callback:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_timer_callback_state);
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_timer_callback_state);
break;
case BPF_FUNC_find_vma:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_find_vma_callback_state);
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_find_vma_callback_state);
break;
case BPF_FUNC_snprintf:
err = check_bpf_snprintf_call(env, regs);
break;
case BPF_FUNC_loop:
update_loop_inline_state(env, meta.subprogno);
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_loop_callback_state);
+ /* Verifier relies on R1 value to determine if bpf_loop() iteration
+ * is finished, thus mark it precise.
+ */
+ err = mark_chain_precision(env, BPF_REG_1);
+ if (err)
+ return err;
+ if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_loop_callback_state);
+ } else {
+ cur_func(env)->callback_depth = 0;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "frame%d bpf_loop iteration limit reached\n",
+ env->cur_state->curframe);
+ }
break;
case BPF_FUNC_dynptr_from_mem:
if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
@@ -10322,8 +10421,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
break;
}
case BPF_FUNC_user_ringbuf_drain:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_user_ringbuf_callback_state);
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_user_ringbuf_callback_state);
break;
}
@@ -11211,7 +11310,7 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id)
btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
}
-static bool is_callback_calling_kfunc(u32 btf_id)
+static bool is_sync_callback_calling_kfunc(u32 btf_id)
{
return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
}
@@ -11963,6 +12062,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EACCES;
}
+ /* Check the arguments */
+ err = check_kfunc_args(env, &meta, insn_idx);
+ if (err < 0)
+ return err;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_rbtree_add_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
@@ -11998,10 +12112,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EINVAL;
}
- /* Check the arguments */
- err = check_kfunc_args(env, &meta, insn_idx);
- if (err < 0)
- return err;
/* In case of release function, we get register number of refcounted
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
@@ -12035,16 +12145,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
- if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_rbtree_add_callback_state);
- if (err) {
- verbose(env, "kfunc %s#%d failed callback verification\n",
- func_name, meta.func_id);
- return err;
- }
- }
-
if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
if (!bpf_jit_supports_exceptions()) {
verbose(env, "JIT does not support calling kfunc %s#%d\n",
@@ -15408,6 +15508,15 @@ static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
return env->insn_aux_data[insn_idx].force_checkpoint;
}
+static void mark_calls_callback(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].calls_callback = true;
+}
+
+static bool calls_callback(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].calls_callback;
+}
enum {
DONE_EXPLORING = 0,
@@ -15521,6 +15630,21 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
* async state will be pushed for further exploration.
*/
mark_prune_point(env, t);
+ /* For functions that invoke callbacks it is not known how many times
+ * the callback would be called. The verifier models callback-calling
+ * functions by repeatedly visiting the callback body and returning to
+ * the original call instruction.
+ * In order to stop such iteration the verifier needs to identify when
+ * a state identical to some state from a previous iteration is reached.
+ * The check below forces creation of a checkpoint before the
+ * callback-calling instruction to allow a search for such identical
+ * states.
+ */
+ if (is_sync_callback_calling_insn(insn)) {
+ mark_calls_callback(env, t);
+ mark_force_checkpoint(env, t);
+ mark_prune_point(env, t);
+ mark_jmp_point(env, t);
+ }
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
struct bpf_kfunc_call_arg_meta meta;
@@ -16990,10 +17114,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
}
goto skip_inf_loop_check;
}
+ if (calls_callback(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur, true))
+ goto hit;
+ goto skip_inf_loop_check;
+ }
/* attempt to detect infinite loop to avoid unnecessary doomed work */
if (states_maybe_looping(&sl->state, cur) &&
states_equal(env, &sl->state, cur, false) &&
- !iter_active_depths_differ(&sl->state, cur)) {
+ !iter_active_depths_differ(&sl->state, cur) &&
+ sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
verbose(env, "cur state:");
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index c56071f150f2..520b90dd97ec 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -164,13 +164,13 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
/* iterate across the hierarchies */
#define for_each_root(root) \
- list_for_each_entry((root), &cgroup_roots, root_list)
+ list_for_each_entry_rcu((root), &cgroup_roots, root_list, \
+ lockdep_is_held(&cgroup_mutex))
/**
* for_each_subsys - iterate all enabled cgroup subsystems
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 76db6c67e39a..04d11a7dd95f 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1262,6 +1262,40 @@ int cgroup1_get_tree(struct fs_context *fc)
return ret;
}
+/**
+ * task_get_cgroup1 - Acquires the associated cgroup of a task within a
+ * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its
+ * hierarchy ID.
+ * @tsk: The target task
+ * @hierarchy_id: The ID of a cgroup1 hierarchy
+ *
+ * On success, the cgroup is returned. On failure, ERR_PTR is returned.
+ * We limit it to cgroup1 only.
+ */
+struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id)
+{
+ struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ struct cgroup_root *root;
+ unsigned long flags;
+
+ rcu_read_lock();
+ for_each_root(root) {
+ /* cgroup1 only */
+ if (root == &cgrp_dfl_root)
+ continue;
+ if (root->hierarchy_id != hierarchy_id)
+ continue;
+ spin_lock_irqsave(&css_set_lock, flags);
+ cgrp = task_cgroup_from_root(tsk, root);
+ if (!cgrp || !cgroup_tryget(cgrp))
+ cgrp = ERR_PTR(-ENOENT);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+ break;
+ }
+ rcu_read_unlock();
+ return cgrp;
+}
+
static int __init cgroup1_wq_init(void)
{
/*
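A caller of the new task_get_cgroup1() receives a cgroup reference (taken via cgroup_tryget()) that it must drop. A minimal usage sketch (the helper name and hierarchy_id are illustrative):

	static void inspect_task_cgroup1(struct task_struct *tsk, int hierarchy_id)
	{
		struct cgroup *cgrp = task_get_cgroup1(tsk, hierarchy_id);

		if (IS_ERR(cgrp))
			return;
		/* ... inspect cgrp ... */
		cgroup_put(cgrp);	/* pairs with cgroup_tryget() above */
	}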
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4b9ff41ca603..8f3cef1a4d8a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1315,7 +1315,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
void cgroup_free_root(struct cgroup_root *root)
{
- kfree(root);
+ kfree_rcu(root, rcu);
}
static void cgroup_destroy_root(struct cgroup_root *root)
@@ -1347,10 +1347,9 @@ static void cgroup_destroy_root(struct cgroup_root *root)
spin_unlock_irq(&css_set_lock);
- if (!list_empty(&root->root_list)) {
- list_del(&root->root_list);
- cgroup_root_count--;
- }
+ WARN_ON_ONCE(list_empty(&root->root_list));
+ list_del_rcu(&root->root_list);
+ cgroup_root_count--;
if (!have_favordynmods)
cgroup_favor_dynmods(root, false);
@@ -1390,7 +1389,15 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
}
}
- BUG_ON(!res_cgroup);
+ /*
+ * If cgroup_mutex is not held, the cgrp_cset_link will be freed
+ * before we remove the cgroup root from the root_list. Consequently,
+ * when accessing a cgroup root, the cset_link may have already been
+ * freed, resulting in a NULL res_cgroup. However, by holding the
+ * cgroup_mutex, we ensure that res_cgroup can't be NULL.
+ * If we don't hold cgroup_mutex in the caller, we must do the NULL
+ * check.
+ */
return res_cgroup;
}
@@ -1413,6 +1420,11 @@ current_cgns_cgroup_from_root(struct cgroup_root *root)
rcu_read_unlock();
+ /*
+ * The namespace_sem is held by current, so the root cgroup can't
+ * be umounted. Therefore, we can ensure that the res is non-NULL.
+ */
+ WARN_ON_ONCE(!res);
return res;
}
@@ -1449,7 +1461,6 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
struct cgroup_root *root)
{
- lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
return __cset_cgroup_from_root(cset, root);
@@ -1457,7 +1468,9 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
/*
* Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_lock held.
+ * called with css_set_lock held to prevent task's groups from being modified.
+ * Must be called with either cgroup_mutex or rcu read lock to prevent the
+ * cgroup root from being destroyed.
*/
struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroup_root *root)
@@ -2032,7 +2045,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
struct cgroup_root *root = ctx->root;
struct cgroup *cgrp = &root->cgrp;
- INIT_LIST_HEAD(&root->root_list);
+ INIT_LIST_HEAD_RCU(&root->root_list);
atomic_set(&root->nr_cgrps, 1);
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
@@ -2115,7 +2128,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
* care of subsystems' refcounts, which are explicitly dropped in
* the failure exit path.
*/
- list_add(&root->root_list, &cgroup_roots);
+ list_add_rcu(&root->root_list, &cgroup_roots);
cgroup_root_count++;
/*
@@ -6265,7 +6278,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (!buf)
goto out;
- cgroup_lock();
+ rcu_read_lock();
spin_lock_irq(&css_set_lock);
for_each_root(root) {
@@ -6276,6 +6289,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
continue;
+ cgrp = task_cgroup_from_root(tsk, root);
+ /* The root has already been unmounted. */
+ if (!cgrp)
+ continue;
+
seq_printf(m, "%d:", root->hierarchy_id);
if (root != &cgrp_dfl_root)
for_each_subsys(ss, ssid)
@@ -6286,9 +6304,6 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':');
-
- cgrp = task_cgroup_from_root(tsk, root);
-
/*
* On traditional hierarchies, all zombie tasks show up as
* belonging to the root cgroup. On the default hierarchy,
@@ -6320,7 +6335,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
retval = 0;
out_unlock:
spin_unlock_irq(&css_set_lock);
- cgroup_unlock();
+ rcu_read_unlock();
kfree(buf);
out:
return retval;
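The locking rule after this change: walking cgroup_roots needs either cgroup_mutex or an RCU read lock, and RCU-side walkers must tolerate a NULL return from task_cgroup_from_root() because a root can be unmounted concurrently. A distilled sketch of the reader pattern, mirroring proc_cgroup_show() above (tsk and root as in that function):

	rcu_read_lock();
	spin_lock_irq(&css_set_lock);
	for_each_root(root) {
		struct cgroup *cgrp = task_cgroup_from_root(tsk, root);

		if (!cgrp)		/* root unmounted concurrently */
			continue;
		/* ... report cgrp ... */
	}
	spin_unlock_irq(&css_set_lock);
	rcu_read_unlock();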
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 615daaf87f1f..dfbb16aca9f4 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -25,6 +25,7 @@
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
+#include <linux/delay.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
@@ -43,6 +44,7 @@
#include <linux/sched/isolation.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
+#include <linux/workqueue.h>
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
@@ -204,6 +206,11 @@ struct cpuset {
*/
static cpumask_var_t subpartitions_cpus;
+/*
+ * Exclusive CPUs in isolated partitions
+ */
+static cpumask_var_t isolated_cpus;
+
/* List of remote partition root children */
static struct list_head remote_children;
@@ -1317,6 +1324,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus,
*/
enum partition_cmd {
partcmd_enable, /* Enable partition root */
+ partcmd_enablei, /* Enable isolated partition root */
partcmd_disable, /* Disable partition root */
partcmd_update, /* Update parent's effective_cpus */
partcmd_invalidate, /* Make partition invalid */
@@ -1419,6 +1427,109 @@ static void reset_partition_data(struct cpuset *cs)
}
/*
+ * partition_xcpus_newstate - Exclusive CPUs state change
+ * @old_prs: old partition_root_state
+ * @new_prs: new partition_root_state
+ * @xcpus: exclusive CPUs with state change
+ */
+static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
+{
+ WARN_ON_ONCE(old_prs == new_prs);
+ if (new_prs == PRS_ISOLATED)
+ cpumask_or(isolated_cpus, isolated_cpus, xcpus);
+ else
+ cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
+}
+
+/*
+ * partition_xcpus_add - Add new exclusive CPUs to partition
+ * @new_prs: new partition_root_state
+ * @parent: parent cpuset
+ * @xcpus: exclusive CPUs to be added
+ * Return: true if isolated_cpus modified, false otherwise
+ *
+ * Remote partition if parent == NULL
+ */
+static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
+ struct cpumask *xcpus)
+{
+ bool isolcpus_updated;
+
+ WARN_ON_ONCE(new_prs < 0);
+ lockdep_assert_held(&callback_lock);
+ if (!parent)
+ parent = &top_cpuset;
+
+ if (parent == &top_cpuset)
+ cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
+
+ isolcpus_updated = (new_prs != parent->partition_root_state);
+ if (isolcpus_updated)
+ partition_xcpus_newstate(parent->partition_root_state, new_prs,
+ xcpus);
+
+ cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
+ return isolcpus_updated;
+}
+
+/*
+ * partition_xcpus_del - Remove exclusive CPUs from partition
+ * @old_prs: old partition_root_state
+ * @parent: parent cpuset
+ * @xcpus: exclusive CPUs to be removed
+ * Return: true if isolated_cpus modified, false otherwise
+ *
+ * Remote partition if parent == NULL
+ */
+static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
+ struct cpumask *xcpus)
+{
+ bool isolcpus_updated;
+
+ WARN_ON_ONCE(old_prs < 0);
+ lockdep_assert_held(&callback_lock);
+ if (!parent)
+ parent = &top_cpuset;
+
+ if (parent == &top_cpuset)
+ cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
+
+ isolcpus_updated = (old_prs != parent->partition_root_state);
+ if (isolcpus_updated)
+ partition_xcpus_newstate(old_prs, parent->partition_root_state,
+ xcpus);
+
+ cpumask_and(xcpus, xcpus, cpu_active_mask);
+ cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
+ return isolcpus_updated;
+}
+
+static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
+{
+ int ret;
+
+ lockdep_assert_cpus_held();
+
+ if (!isolcpus_updated)
+ return;
+
+ ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
+ WARN_ON_ONCE(ret < 0);
+}
+
+/**
+ * cpuset_cpu_is_isolated - Check if the given CPU is isolated
+ * @cpu: the CPU number to be checked
+ * Return: true if CPU is used in an isolated partition, false otherwise
+ */
+bool cpuset_cpu_is_isolated(int cpu)
+{
+ return cpumask_test_cpu(cpu, isolated_cpus);
+}
+EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
+
+/*
* compute_effective_exclusive_cpumask - compute effective exclusive CPUs
* @cs: cpuset
* @xcpus: effective exclusive CPUs value to be set
@@ -1456,14 +1567,18 @@ static inline bool is_local_partition(struct cpuset *cs)
/*
* remote_partition_enable - Enable current cpuset as a remote partition root
* @cs: the cpuset to update
+ * @new_prs: new partition_root_state
* @tmp: temporary masks
* Return: 1 if successful, 0 if error
*
* Enable the current cpuset to become a remote partition root taking CPUs
* directly from the top cpuset. cpuset_mutex must be held by the caller.
*/
-static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp)
+static int remote_partition_enable(struct cpuset *cs, int new_prs,
+ struct tmpmasks *tmp)
{
+ bool isolcpus_updated;
+
/*
* The user must have sysadmin privilege.
*/
@@ -1485,26 +1600,22 @@ static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp)
return 0;
spin_lock_irq(&callback_lock);
- cpumask_andnot(top_cpuset.effective_cpus,
- top_cpuset.effective_cpus, tmp->new_cpus);
- cpumask_or(subpartitions_cpus,
- subpartitions_cpus, tmp->new_cpus);
-
+ isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
+ list_add(&cs->remote_sibling, &remote_children);
if (cs->use_parent_ecpus) {
struct cpuset *parent = parent_cs(cs);
cs->use_parent_ecpus = false;
parent->child_ecpus_count--;
}
- list_add(&cs->remote_sibling, &remote_children);
spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
/*
* Propagate changes in top_cpuset's effective_cpus down the hierarchy.
*/
update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
-
return 1;
}
@@ -1519,23 +1630,22 @@ static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp)
*/
static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
{
+ bool isolcpus_updated;
+
compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
WARN_ON_ONCE(!is_remote_partition(cs));
WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
spin_lock_irq(&callback_lock);
- cpumask_andnot(subpartitions_cpus,
- subpartitions_cpus, tmp->new_cpus);
- cpumask_and(tmp->new_cpus,
- tmp->new_cpus, cpu_active_mask);
- cpumask_or(top_cpuset.effective_cpus,
- top_cpuset.effective_cpus, tmp->new_cpus);
list_del_init(&cs->remote_sibling);
+ isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
+ NULL, tmp->new_cpus);
cs->partition_root_state = -cs->partition_root_state;
if (!cs->prs_err)
cs->prs_err = PERR_INVCPUS;
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
/*
* Propagate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1557,6 +1667,8 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
struct tmpmasks *tmp)
{
bool adding, deleting;
+ int prs = cs->partition_root_state;
+ int isolcpus_updated = 0;
if (WARN_ON_ONCE(!is_remote_partition(cs)))
return;
@@ -1580,21 +1692,12 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
goto invalidate;
spin_lock_irq(&callback_lock);
- if (adding) {
- cpumask_or(subpartitions_cpus,
- subpartitions_cpus, tmp->addmask);
- cpumask_andnot(top_cpuset.effective_cpus,
- top_cpuset.effective_cpus, tmp->addmask);
- }
- if (deleting) {
- cpumask_andnot(subpartitions_cpus,
- subpartitions_cpus, tmp->delmask);
- cpumask_and(tmp->delmask,
- tmp->delmask, cpu_active_mask);
- cpumask_or(top_cpuset.effective_cpus,
- top_cpuset.effective_cpus, tmp->delmask);
- }
+ if (adding)
+ isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
+ if (deleting)
+ isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
/*
* Propagate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1676,11 +1779,11 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
* @tmp: Temporary addmask and delmask
* Return: 0 or a partition root state error code
*
- * For partcmd_enable, the cpuset is being transformed from a non-partition
- * root to a partition root. The effective_xcpus (cpus_allowed if effective_xcpus
- * not set) mask of the given cpuset will be taken away from parent's
- * effective_cpus. The function will return 0 if all the CPUs listed in
- * effective_xcpus can be granted or an error code will be returned.
+ * For partcmd_enable*, the cpuset is being transformed from a non-partition
+ * root to a partition root. The effective_xcpus (cpus_allowed if
+ * effective_xcpus not set) mask of the given cpuset will be taken away from
+ * parent's effective_cpus. The function will return 0 if all the CPUs listed
+ * in effective_xcpus can be granted or an error code will be returned.
*
* For partcmd_disable, the cpuset is being transformed from a partition
* root back to a non-partition root. Any CPUs in effective_xcpus will be
@@ -1695,7 +1798,7 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
*
* For partcmd_invalidate, the current partition will be made invalid.
*
- * The partcmd_enable and partcmd_disable commands are used by
+ * The partcmd_enable* and partcmd_disable commands are used by
* update_prstate(). An error code may be returned and the caller will check
* for error.
*
@@ -1716,6 +1819,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
int part_error = PERR_NONE; /* Partition error? */
int subparts_delta = 0;
struct cpumask *xcpus; /* cs effective_xcpus */
+ int isolcpus_updated = 0;
bool nocpu;
lockdep_assert_held(&cpuset_mutex);
@@ -1760,7 +1864,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
nocpu = tasks_nocpu_error(parent, cs, xcpus);
- if (cmd == partcmd_enable) {
+ if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
/*
* Enabling partition root is not allowed if its
* effective_xcpus is empty or doesn't overlap with
@@ -1783,6 +1887,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
cpumask_copy(tmp->delmask, xcpus);
deleting = true;
subparts_delta++;
+ new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
} else if (cmd == partcmd_disable) {
/*
* May need to add cpus to parent's effective_cpus for
@@ -1792,6 +1897,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
if (adding)
subparts_delta--;
+ new_prs = PRS_MEMBER;
} else if (newmask) {
/*
* Empty cpumask is not allowed
@@ -1940,38 +2046,28 @@ write_error:
* newly deleted ones will be added back to effective_cpus.
*/
spin_lock_irq(&callback_lock);
- if (adding) {
- if (parent == &top_cpuset)
- cpumask_andnot(subpartitions_cpus,
- subpartitions_cpus, tmp->addmask);
- /*
- * Some of the CPUs in effective_xcpus might have been offlined.
- */
- cpumask_or(parent->effective_cpus,
- parent->effective_cpus, tmp->addmask);
- cpumask_and(parent->effective_cpus,
- parent->effective_cpus, cpu_active_mask);
- }
- if (deleting) {
- if (parent == &top_cpuset)
- cpumask_or(subpartitions_cpus,
- subpartitions_cpus, tmp->delmask);
- cpumask_andnot(parent->effective_cpus,
- parent->effective_cpus, tmp->delmask);
- }
-
- if (is_partition_valid(parent)) {
- parent->nr_subparts += subparts_delta;
- WARN_ON_ONCE(parent->nr_subparts < 0);
- }
-
if (old_prs != new_prs) {
cs->partition_root_state = new_prs;
if (new_prs <= 0)
cs->nr_subparts = 0;
}
+ /*
+ * Adding to parent's effective_cpus means deleting CPUs from cs
+ * and vice versa.
+ */
+ if (adding)
+ isolcpus_updated += partition_xcpus_del(old_prs, parent,
+ tmp->addmask);
+ if (deleting)
+ isolcpus_updated += partition_xcpus_add(new_prs, parent,
+ tmp->delmask);
+ if (is_partition_valid(parent)) {
+ parent->nr_subparts += subparts_delta;
+ WARN_ON_ONCE(parent->nr_subparts < 0);
+ }
spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
if ((old_prs != new_prs) && (cmd == partcmd_update))
update_partition_exclusive(cs, new_prs);
@@ -2948,6 +3044,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
int err = PERR_NONE, old_prs = cs->partition_root_state;
struct cpuset *parent = parent_cs(cs);
struct tmpmasks tmpmask;
+ bool new_xcpus_state = false;
if (old_prs == new_prs)
return 0;
@@ -2977,6 +3074,9 @@ static int update_prstate(struct cpuset *cs, int new_prs)
goto out;
if (!old_prs) {
+ enum partition_cmd cmd = (new_prs == PRS_ROOT)
+ ? partcmd_enable : partcmd_enablei;
+
/*
* cpus_allowed cannot be empty.
*/
@@ -2985,19 +3085,18 @@ static int update_prstate(struct cpuset *cs, int new_prs)
goto out;
}
- err = update_parent_effective_cpumask(cs, partcmd_enable,
- NULL, &tmpmask);
+ err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
/*
* If an attempt to become local partition root fails,
* try to become a remote partition root instead.
*/
- if (err && remote_partition_enable(cs, &tmpmask))
+ if (err && remote_partition_enable(cs, new_prs, &tmpmask))
err = 0;
} else if (old_prs && new_prs) {
/*
* A change in load balance state only, no change in cpumasks.
*/
- ;
+ new_xcpus_state = true;
} else {
/*
* Switching back to member is always allowed even if it
@@ -3029,7 +3128,10 @@ out:
WRITE_ONCE(cs->prs_err, err);
if (!is_partition_valid(cs))
reset_partition_data(cs);
+ else if (new_xcpus_state)
+ partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
spin_unlock_irq(&callback_lock);
+ update_unbound_workqueue_cpumask(new_xcpus_state);
/* Force update if switching back to member */
update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
@@ -3386,6 +3488,7 @@ typedef enum {
FILE_SUBPARTS_CPULIST,
FILE_EXCLUSIVE_CPULIST,
FILE_EFFECTIVE_XCPULIST,
+ FILE_ISOLATED_CPULIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
@@ -3582,6 +3685,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_SUBPARTS_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
break;
+ case FILE_ISOLATED_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
+ break;
default:
ret = -EINVAL;
}
@@ -3875,6 +3981,13 @@ static struct cftype dfl_files[] = {
.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
},
+ {
+ .name = "cpus.isolated",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_ISOLATED_CPULIST,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
{ } /* terminate */
};
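The new root-only cpuset.cpus.isolated file makes the isolated_cpus mask readable from user space. As a quick illustration, a minimal reader might look like the sketch below; the /sys/fs/cgroup mount point is an assumption about the local cgroup v2 setup, not something this patch mandates.

/* Sketch: read the root-only cpuset.cpus.isolated file (cgroup v2).
 * Assumes cgroup2 is mounted at /sys/fs/cgroup; adjust as needed.
 */
#include <stdio.h>

int main(void)
{
        char buf[256];
        FILE *f = fopen("/sys/fs/cgroup/cpuset.cpus.isolated", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("isolated CPUs: %s", buf);  /* e.g. "2-3,8\n" */
        fclose(f);
        return 0;
}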
@@ -4194,6 +4307,7 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
@@ -4306,6 +4420,30 @@ void cpuset_force_rebuild(void)
force_rebuild = true;
}
+/*
+ * Attempt to acquire cpus_read_lock while a hotplug operation may be in
+ * progress.
+ * Return: true if successful, false otherwise
+ *
+ * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
+ * cpus_read_trylock() is used here to acquire the lock.
+ */
+static bool cpuset_hotplug_cpus_read_trylock(void)
+{
+ int retries = 0;
+
+ while (!cpus_read_trylock()) {
+ /*
+ * CPU hotplug still in progress. Retry 5 times
+ * with a 10ms wait before bailing out.
+ */
+ if (++retries > 5)
+ return false;
+ msleep(10);
+ }
+ return true;
+}
+
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
@@ -4322,6 +4460,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
bool cpus_updated;
bool mems_updated;
bool remote;
+ int partcmd = -1;
struct cpuset *parent;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -4353,11 +4492,13 @@ retry:
compute_partition_effective_cpumask(cs, &new_cpus);
if (remote && cpumask_empty(&new_cpus) &&
- partition_is_populated(cs, NULL)) {
+ partition_is_populated(cs, NULL) &&
+ cpuset_hotplug_cpus_read_trylock()) {
remote_partition_disable(cs, tmp);
compute_effective_cpumask(&new_cpus, cs, parent);
remote = false;
cpuset_force_rebuild();
+ cpus_read_unlock();
}
/*
@@ -4368,18 +4509,28 @@ retry:
* partitions.
*/
if (is_local_partition(cs) && (!is_partition_valid(parent) ||
- tasks_nocpu_error(parent, cs, &new_cpus))) {
- update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp);
- compute_effective_cpumask(&new_cpus, cs, parent);
- cpuset_force_rebuild();
- }
+ tasks_nocpu_error(parent, cs, &new_cpus)))
+ partcmd = partcmd_invalidate;
/*
* On the other hand, an invalid partition root may be transitioned
* back to a regular one.
*/
- else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
- update_parent_effective_cpumask(cs, partcmd_update, NULL, tmp);
- if (is_partition_valid(cs)) {
+ else if (is_partition_valid(parent) && is_partition_invalid(cs))
+ partcmd = partcmd_update;
+
+ /*
+ * cpus_read_lock needs to be held before calling
+ * update_parent_effective_cpumask(). To avoid circular lock
+ * dependency between cpuset_mutex and cpus_read_lock,
+ * cpus_read_trylock() is used here to acquire the lock.
+ */
+ if (partcmd >= 0) {
+ if (!cpuset_hotplug_cpus_read_trylock())
+ goto update_tasks;
+
+ update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
+ cpus_read_unlock();
+ if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
compute_partition_effective_cpumask(cs, &new_cpus);
cpuset_force_rebuild();
}
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 122dacb3a443..66d1708042a7 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -66,9 +66,15 @@ static struct freezer *parent_freezer(struct freezer *freezer)
bool cgroup_freezing(struct task_struct *task)
{
bool ret;
+ unsigned int state;
rcu_read_lock();
- ret = task_freezer(task)->state & CGROUP_FREEZING;
+ /* Check if the cgroup is still FREEZING, but not FROZEN. The extra
+ * !FROZEN check is required, because the FREEZING bit is not cleared
+ * when the state FROZEN is reached.
+ */
+ state = task_freezer(task)->state;
+ ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN);
rcu_read_unlock();
return ret;
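The fix relies on the FREEZING bit staying set once FROZEN is reached, so both bits must be tested together. A stand-alone sketch of the same predicate, with illustrative flag values rather than the kernel's actual CGROUP_* encodings (CGROUP_FREEZING is really a composite of SELF and PARENT bits):

/* Illustrative flag values; not the kernel's real encodings. */
#include <assert.h>
#include <stdbool.h>

#define FREEZING 0x1
#define FROZEN   0x2

static bool cgroup_freezing_demo(unsigned int state)
{
        /* Still freezing only if FREEZING is set and FROZEN is not. */
        return (state & FREEZING) && !(state & FROZEN);
}

int main(void)
{
        assert(cgroup_freezing_demo(FREEZING));           /* in progress */
        assert(!cgroup_freezing_demo(FREEZING | FROZEN)); /* already frozen */
        assert(!cgroup_freezing_demo(0));                 /* thawed */
        return 0;
}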
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index c0adb7254b45..a8350d2d63e6 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -74,64 +74,109 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
}
/**
- * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
- * @pos: current position
- * @root: root of the tree to traversal
+ * cgroup_rstat_push_children - push children cgroups into the given list
+ * @head: current head of the list (= subtree root)
+ * @child: first child of the root
* @cpu: target cpu
+ * Return: A new singly linked list of cgroups to be flushed
*
- * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
- * the traversal and %NULL return indicates the end. During traversal,
- * each returned cgroup is unlinked from the tree. Must be called with the
- * matching cgroup_rstat_cpu_lock held.
+ * Iteratively traverse down the cgroup_rstat_cpu updated tree level by
+ * level, pushing the parents of each level before their next-level
+ * children into a singly linked list built from the tail backward, as
+ * if "pushing" cgroups onto a stack. The root is pushed by the caller.
+ */
+static struct cgroup *cgroup_rstat_push_children(struct cgroup *head,
+ struct cgroup *child, int cpu)
+{
+ struct cgroup *chead = child; /* Head of child cgroup level */
+ struct cgroup *ghead = NULL; /* Head of grandchild cgroup level */
+ struct cgroup *parent, *grandchild;
+ struct cgroup_rstat_cpu *crstatc;
+
+ child->rstat_flush_next = NULL;
+
+next_level:
+ while (chead) {
+ child = chead;
+ chead = child->rstat_flush_next;
+ parent = cgroup_parent(child);
+
+ /* The updated_next list is terminated by the parent cgroup */
+ while (child != parent) {
+ child->rstat_flush_next = head;
+ head = child;
+ crstatc = cgroup_rstat_cpu(child, cpu);
+ grandchild = crstatc->updated_children;
+ if (grandchild != child) {
+ /* Push the grandchild to the next level */
+ crstatc->updated_children = child;
+ grandchild->rstat_flush_next = ghead;
+ ghead = grandchild;
+ }
+ child = crstatc->updated_next;
+ crstatc->updated_next = NULL;
+ }
+ }
+
+ if (ghead) {
+ chead = ghead;
+ ghead = NULL;
+ goto next_level;
+ }
+ return head;
+}
+
+/**
+ * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed
+ * @root: root of the cgroup subtree to traverse
+ * @cpu: target cpu
+ * Return: A singly linked list of cgroups to be flushed
+ *
+ * Walks the updated rstat_cpu tree on @cpu from @root. During traversal,
+ * each returned cgroup is unlinked from the updated tree.
*
* The only ordering guarantee is that, for a parent and a child pair
- * covered by a given traversal, if a child is visited, its parent is
- * guaranteed to be visited afterwards.
+ * covered by a given traversal, the child is before its parent in
+ * the list.
+ *
+ * Note that updated_children is self-terminated and points to a list of
+ * child cgroups if not empty, whereas updated_next is like a sibling link
+ * within the children list and is terminated by the parent cgroup. The one
+ * exception is the cgroup root, whose updated_next can be self-terminated.
*/
-static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
- struct cgroup *root, int cpu)
+static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
{
- struct cgroup_rstat_cpu *rstatc;
- struct cgroup *parent;
-
- if (pos == root)
- return NULL;
+ raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
+ struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
+ struct cgroup *head = NULL, *parent, *child;
+ unsigned long flags;
/*
- * We're gonna walk down to the first leaf and visit/remove it. We
- * can pick whatever unvisited node as the starting point.
+ * The _irqsave() is needed because cgroup_rstat_lock is
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+ * this lock with the _irq() suffix only disables interrupts on
+ * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+ * interrupts on both configurations. The _irqsave() ensures
+ * that interrupts are always disabled and later restored.
*/
- if (!pos) {
- pos = root;
- /* return NULL if this subtree is not on-list */
- if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
- return NULL;
- } else {
- pos = cgroup_parent(pos);
- }
+ raw_spin_lock_irqsave(cpu_lock, flags);
- /* walk down to the first leaf */
- while (true) {
- rstatc = cgroup_rstat_cpu(pos, cpu);
- if (rstatc->updated_children == pos)
- break;
- pos = rstatc->updated_children;
- }
+ /* Return NULL if this subtree is not on-list */
+ if (!rstatc->updated_next)
+ goto unlock_ret;
/*
- * Unlink @pos from the tree. As the updated_children list is
+ * Unlink @root from its parent. As the updated_children list is
* singly linked, we have to walk it to find the removal point.
- * However, due to the way we traverse, @pos will be the first
- * child in most cases. The only exception is @root.
*/
- parent = cgroup_parent(pos);
+ parent = cgroup_parent(root);
if (parent) {
struct cgroup_rstat_cpu *prstatc;
struct cgroup **nextp;
prstatc = cgroup_rstat_cpu(parent, cpu);
nextp = &prstatc->updated_children;
- while (*nextp != pos) {
+ while (*nextp != root) {
struct cgroup_rstat_cpu *nrstatc;
nrstatc = cgroup_rstat_cpu(*nextp, cpu);
@@ -142,7 +187,17 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
}
rstatc->updated_next = NULL;
- return pos;
+
+ /* Push @root to the list first before pushing the children */
+ head = root;
+ root->rstat_flush_next = NULL;
+ child = rstatc->updated_children;
+ rstatc->updated_children = root;
+ if (child != root)
+ head = cgroup_rstat_push_children(head, child, cpu);
+unlock_ret:
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
+ return head;
}
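The flush side can now consume the result with a plain pointer walk, and the construction guarantees children appear before their parents. A toy user-space model of that ordering (names and structure are hypothetical; only the head-insertion idea carries over):

/* Toy model: build a flush list by pushing parents before children per
 * level; consuming it head-first then yields children before parents. */
#include <stdio.h>

struct cg {
        const char *name;
        struct cg *rstat_flush_next;
};

static struct cg *push(struct cg *head, struct cg *cg)
{
        cg->rstat_flush_next = head;    /* prepend, stack-style */
        return cg;
}

int main(void)
{
        struct cg root = { "root" }, child = { "child" }, gc = { "gc" };
        struct cg *head = NULL, *pos;

        head = push(head, &root);       /* root first ... */
        head = push(head, &child);      /* ... then each level below it */
        head = push(head, &gc);

        for (pos = head; pos; pos = pos->rstat_flush_next)
                printf("%s\n", pos->name);      /* gc, child, root */
        return 0;
}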
/*
@@ -176,21 +231,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_rstat_lock);
for_each_possible_cpu(cpu) {
- raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
- cpu);
- struct cgroup *pos = NULL;
- unsigned long flags;
+ struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);
- /*
- * The _irqsave() is needed because cgroup_rstat_lock is
- * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
- * this lock with the _irq() suffix only disables interrupts on
- * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
- * interrupts on both configurations. The _irqsave() ensures
- * that interrupts are always disabled and later restored.
- */
- raw_spin_lock_irqsave(cpu_lock, flags);
- while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
+ for (; pos; pos = pos->rstat_flush_next) {
struct cgroup_subsys_state *css;
cgroup_base_stat_flush(pos, cpu);
@@ -202,7 +245,6 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
- raw_spin_unlock_irqrestore(cpu_lock, flags);
/* play nice and yield if necessary */
if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a86972a91991..e6ec3ba4950b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2125,11 +2125,6 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = relay_prepare_cpu,
.teardown.single = NULL,
},
- [CPUHP_SLAB_PREPARE] = {
- .name = "slab:prepare",
- .startup.single = slab_prepare_cpu,
- .teardown.single = slab_dead_cpu,
- },
[CPUHP_RCUTREE_PREP] = {
.name = "RCU/tree:prepare",
.startup.single = rcutree_prepare_cpu,
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index efe87d501c8c..d48315667752 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -13,7 +13,6 @@
#include <linux/memory.h>
#include <linux/cpuhotplug.h>
#include <linux/memblock.h>
-#include <linux/kexec.h>
#include <linux/kmemleak.h>
#include <asm/page.h>
@@ -199,7 +198,7 @@ static __initdata char *suffix_tbl[] = {
* It returns 0 on success and -EINVAL on failure.
*/
static int __init parse_crashkernel_suffix(char *cmdline,
- unsigned long long *crash_size,
+ unsigned long long *crash_size,
const char *suffix)
{
char *cur = cmdline;
@@ -268,9 +267,9 @@ static int __init __parse_crashkernel(char *cmdline,
unsigned long long *crash_base,
const char *suffix)
{
- char *first_colon, *first_space;
- char *ck_cmdline;
- char *name = "crashkernel=";
+ char *first_colon, *first_space;
+ char *ck_cmdline;
+ char *name = "crashkernel=";
BUG_ON(!crash_size || !crash_base);
*crash_size = 0;
@@ -440,7 +439,7 @@ retry:
return;
}
- if ((crash_base > CRASH_ADDR_LOW_MAX) &&
+ if ((crash_base >= CRASH_ADDR_LOW_MAX) &&
crash_low_size && reserve_crashkernel_low(crash_low_size)) {
memblock_phys_free(crash_base, crash_size);
return;
@@ -551,9 +550,11 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
phdr->p_align = 0;
ehdr->e_phnum++;
- pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
- phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
- ehdr->e_phnum, phdr->p_offset);
+#ifdef CONFIG_KEXEC_FILE
+ kexec_dprintk("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+ ehdr->e_phnum, phdr->p_offset);
+#endif
phdr++;
}
@@ -565,9 +566,8 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map,
int crash_exclude_mem_range(struct crash_mem *mem,
unsigned long long mstart, unsigned long long mend)
{
- int i, j;
+ int i;
unsigned long long start, end, p_start, p_end;
- struct range temp_range = {0, 0};
for (i = 0; i < mem->nr_ranges; i++) {
start = mem->ranges[i].start;
@@ -575,72 +575,51 @@ int crash_exclude_mem_range(struct crash_mem *mem,
p_start = mstart;
p_end = mend;
- if (mstart > end || mend < start)
+ if (p_start > end)
continue;
+ /*
+ * Because the memory ranges in mem->ranges are stored in
+ * ascending order, when we detect `p_end < start`, we can
+ * immediately exit the for loop, as the subsequent memory
+ * ranges will definitely be outside the range we are looking
+ * for.
+ */
+ if (p_end < start)
+ break;
+
/* Truncate any area outside of range */
- if (mstart < start)
+ if (p_start < start)
p_start = start;
- if (mend > end)
+ if (p_end > end)
p_end = end;
/* Found completely overlapping range */
if (p_start == start && p_end == end) {
- mem->ranges[i].start = 0;
- mem->ranges[i].end = 0;
- if (i < mem->nr_ranges - 1) {
- /* Shift rest of the ranges to left */
- for (j = i; j < mem->nr_ranges - 1; j++) {
- mem->ranges[j].start =
- mem->ranges[j+1].start;
- mem->ranges[j].end =
- mem->ranges[j+1].end;
- }
-
- /*
- * Continue to check if there are another overlapping ranges
- * from the current position because of shifting the above
- * mem ranges.
- */
- i--;
- mem->nr_ranges--;
- continue;
- }
+ memmove(&mem->ranges[i], &mem->ranges[i + 1],
+ (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i]));
+ i--;
mem->nr_ranges--;
- return 0;
- }
-
- if (p_start > start && p_end < end) {
+ } else if (p_start > start && p_end < end) {
/* Split original range */
+ if (mem->nr_ranges >= mem->max_nr_ranges)
+ return -ENOMEM;
+
+ memmove(&mem->ranges[i + 2], &mem->ranges[i + 1],
+ (mem->nr_ranges - (i + 1)) * sizeof(mem->ranges[i]));
+
mem->ranges[i].end = p_start - 1;
- temp_range.start = p_end + 1;
- temp_range.end = end;
+ mem->ranges[i + 1].start = p_end + 1;
+ mem->ranges[i + 1].end = end;
+
+ i++;
+ mem->nr_ranges++;
} else if (p_start != start)
mem->ranges[i].end = p_start - 1;
else
mem->ranges[i].start = p_end + 1;
- break;
- }
-
- /* If a split happened, add the split to array */
- if (!temp_range.end)
- return 0;
-
- /* Split happened */
- if (i == mem->max_nr_ranges - 1)
- return -ENOMEM;
-
- /* Location where new range should go */
- j = i + 1;
- if (j < mem->nr_ranges) {
- /* Move over all ranges one slot towards the end */
- for (i = mem->nr_ranges - 1; i >= j; i--)
- mem->ranges[i + 1] = mem->ranges[i];
}
- mem->ranges[j].start = temp_range.start;
- mem->ranges[j].end = temp_range.end;
- mem->nr_ranges++;
return 0;
}
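The rewritten crash_exclude_mem_range() handles the three overlap cases — drop, split, truncate — in a single pass over the ascending array, with memmove() replacing the hand-rolled shifting loops. A simplified user-space analogue of that logic (the struct and bounds handling are assumptions, not the kernel's crash_mem API):

#include <stdio.h>
#include <string.h>

struct range { unsigned long long start, end; };

/* Remove [mstart, mend] from a sorted, non-overlapping range array.
 * Returns -1 if a split is needed but the array is full. */
static int exclude_range(struct range *r, int *nr, int max,
                         unsigned long long mstart, unsigned long long mend)
{
        for (int i = 0; i < *nr; i++) {
                unsigned long long s = r[i].start, e = r[i].end;
                unsigned long long ps = mstart < s ? s : mstart;
                unsigned long long pe = mend > e ? e : mend;

                if (mstart > e)
                        continue;
                if (mend < s)
                        break;  /* ascending order: nothing further overlaps */

                if (ps == s && pe == e) {       /* full overlap: drop entry */
                        memmove(&r[i], &r[i + 1],
                                (*nr - (i + 1)) * sizeof(*r));
                        i--; (*nr)--;
                } else if (ps > s && pe < e) {  /* interior: split in two */
                        if (*nr >= max)
                                return -1;
                        memmove(&r[i + 2], &r[i + 1],
                                (*nr - (i + 1)) * sizeof(*r));
                        r[i].end = ps - 1;
                        r[i + 1].start = pe + 1;
                        r[i + 1].end = e;
                        i++; (*nr)++;
                } else if (ps != s) {           /* clip the tail */
                        r[i].end = ps - 1;
                } else {                        /* clip the head */
                        r[i].start = pe + 1;
                }
        }
        return 0;
}

int main(void)
{
        struct range r[4] = { { 0, 99 }, { 200, 299 } };
        int nr = 2;

        exclude_range(r, &nr, 4, 40, 59);       /* splits the first range */
        for (int i = 0; i < nr; i++)
                printf("[%llu-%llu]\n", r[i].start, r[i].end);
        return 0;
}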
@@ -802,7 +781,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(list_head, prev);
VMCOREINFO_OFFSET(vmap_area, va_start);
VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1);
+ VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS);
log_buf_vmcoreinfo_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
VMCOREINFO_NUMBER(NR_FREE_PAGES);
diff --git a/kernel/cred.c b/kernel/cred.c
index 3c714cb31660..c033a201c808 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -43,10 +43,6 @@ static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
*/
struct cred init_cred = {
.usage = ATOMIC_INIT(4),
-#ifdef CONFIG_DEBUG_CREDENTIALS
- .subscribers = ATOMIC_INIT(2),
- .magic = CRED_MAGIC,
-#endif
.uid = GLOBAL_ROOT_UID,
.gid = GLOBAL_ROOT_GID,
.suid = GLOBAL_ROOT_UID,
@@ -66,31 +62,6 @@ struct cred init_cred = {
.ucounts = &init_ucounts,
};
-static inline void set_cred_subscribers(struct cred *cred, int n)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
- atomic_set(&cred->subscribers, n);
-#endif
-}
-
-static inline int read_cred_subscribers(const struct cred *cred)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
- return atomic_read(&cred->subscribers);
-#else
- return 0;
-#endif
-}
-
-static inline void alter_cred_subscribers(const struct cred *_cred, int n)
-{
-#ifdef CONFIG_DEBUG_CREDENTIALS
- struct cred *cred = (struct cred *) _cred;
-
- atomic_add(n, &cred->subscribers);
-#endif
-}
-
/*
* The RCU callback to actually dispose of a set of credentials
*/
@@ -100,20 +71,9 @@ static void put_cred_rcu(struct rcu_head *rcu)
kdebug("put_cred_rcu(%p)", cred);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- if (cred->magic != CRED_MAGIC_DEAD ||
- atomic_read(&cred->usage) != 0 ||
- read_cred_subscribers(cred) != 0)
- panic("CRED: put_cred_rcu() sees %p with"
- " mag %x, put %p, usage %d, subscr %d\n",
- cred, cred->magic, cred->put_addr,
- atomic_read(&cred->usage),
- read_cred_subscribers(cred));
-#else
- if (atomic_read(&cred->usage) != 0)
- panic("CRED: put_cred_rcu() sees %p with usage %d\n",
- cred, atomic_read(&cred->usage));
-#endif
+ if (atomic_long_read(&cred->usage) != 0)
+ panic("CRED: put_cred_rcu() sees %p with usage %ld\n",
+ cred, atomic_long_read(&cred->usage));
security_cred_free(cred);
key_put(cred->session_keyring);
@@ -137,16 +97,10 @@ static void put_cred_rcu(struct rcu_head *rcu)
*/
void __put_cred(struct cred *cred)
{
- kdebug("__put_cred(%p{%d,%d})", cred,
- atomic_read(&cred->usage),
- read_cred_subscribers(cred));
-
- BUG_ON(atomic_read(&cred->usage) != 0);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- BUG_ON(read_cred_subscribers(cred) != 0);
- cred->magic = CRED_MAGIC_DEAD;
- cred->put_addr = __builtin_return_address(0);
-#endif
+ kdebug("__put_cred(%p{%ld})", cred,
+ atomic_long_read(&cred->usage));
+
+ BUG_ON(atomic_long_read(&cred->usage) != 0);
BUG_ON(cred == current->cred);
BUG_ON(cred == current->real_cred);
@@ -164,9 +118,8 @@ void exit_creds(struct task_struct *tsk)
{
struct cred *real_cred, *cred;
- kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
- atomic_read(&tsk->cred->usage),
- read_cred_subscribers(tsk->cred));
+ kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred,
+ atomic_long_read(&tsk->cred->usage));
real_cred = (struct cred *) tsk->real_cred;
tsk->real_cred = NULL;
@@ -174,15 +127,10 @@ void exit_creds(struct task_struct *tsk)
cred = (struct cred *) tsk->cred;
tsk->cred = NULL;
- validate_creds(cred);
if (real_cred == cred) {
- alter_cred_subscribers(cred, -2);
put_cred_many(cred, 2);
} else {
- validate_creds(real_cred);
- alter_cred_subscribers(real_cred, -1);
put_cred(real_cred);
- alter_cred_subscribers(cred, -1);
put_cred(cred);
}
@@ -230,10 +178,7 @@ struct cred *cred_alloc_blank(void)
if (!new)
return NULL;
- atomic_set(&new->usage, 1);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- new->magic = CRED_MAGIC;
-#endif
+ atomic_long_set(&new->usage, 1);
if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
@@ -264,8 +209,6 @@ struct cred *prepare_creds(void)
const struct cred *old;
struct cred *new;
- validate_process_creds();
-
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
@@ -276,8 +219,7 @@ struct cred *prepare_creds(void)
memcpy(new, old, sizeof(struct cred));
new->non_rcu = 0;
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
+ atomic_long_set(&new->usage, 1);
get_group_info(new->group_info);
get_uid(new->user);
get_user_ns(new->user_ns);
@@ -300,7 +242,6 @@ struct cred *prepare_creds(void)
if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error;
- validate_creds(new);
return new;
error:
@@ -362,10 +303,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
clone_flags & CLONE_THREAD
) {
p->real_cred = get_cred_many(p->cred, 2);
- alter_cred_subscribers(p->cred, 2);
- kdebug("share_creds(%p{%d,%d})",
- p->cred, atomic_read(&p->cred->usage),
- read_cred_subscribers(p->cred));
+ kdebug("share_creds(%p{%ld})",
+ p->cred, atomic_long_read(&p->cred->usage));
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
return 0;
}
@@ -404,8 +343,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
p->cred = p->real_cred = get_cred(new);
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
- alter_cred_subscribers(new, 2);
- validate_creds(new);
return 0;
error_put:
@@ -457,17 +394,11 @@ int commit_creds(struct cred *new)
struct task_struct *task = current;
const struct cred *old = task->real_cred;
- kdebug("commit_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
- read_cred_subscribers(new));
+ kdebug("commit_creds(%p{%ld})", new,
+ atomic_long_read(&new->usage));
BUG_ON(task->cred != old);
-#ifdef CONFIG_DEBUG_CREDENTIALS
- BUG_ON(read_cred_subscribers(old) < 2);
- validate_creds(old);
- validate_creds(new);
-#endif
- BUG_ON(atomic_read(&new->usage) < 1);
+ BUG_ON(atomic_long_read(&new->usage) < 1);
get_cred(new); /* we will require a ref for the subj creds too */
@@ -502,14 +433,12 @@ int commit_creds(struct cred *new)
* RLIMIT_NPROC limits on user->processes have already been checked
* in set_user().
*/
- alter_cred_subscribers(new, 2);
if (new->user != old->user || new->user_ns != old->user_ns)
inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user || new->user_ns != old->user_ns)
dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
- alter_cred_subscribers(old, -2);
/* send notifications */
if (!uid_eq(new->uid, old->uid) ||
@@ -539,14 +468,10 @@ EXPORT_SYMBOL(commit_creds);
*/
void abort_creds(struct cred *new)
{
- kdebug("abort_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
- read_cred_subscribers(new));
+ kdebug("abort_creds(%p{%ld})", new,
+ atomic_long_read(&new->usage));
-#ifdef CONFIG_DEBUG_CREDENTIALS
- BUG_ON(read_cred_subscribers(new) != 0);
-#endif
- BUG_ON(atomic_read(&new->usage) < 1);
+ BUG_ON(atomic_long_read(&new->usage) < 1);
put_cred(new);
}
EXPORT_SYMBOL(abort_creds);
@@ -562,12 +487,8 @@ const struct cred *override_creds(const struct cred *new)
{
const struct cred *old = current->cred;
- kdebug("override_creds(%p{%d,%d})", new,
- atomic_read(&new->usage),
- read_cred_subscribers(new));
-
- validate_creds(old);
- validate_creds(new);
+ kdebug("override_creds(%p{%ld})", new,
+ atomic_long_read(&new->usage));
/*
* NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
@@ -576,18 +497,12 @@ const struct cred *override_creds(const struct cred *new)
* we are only installing the cred into the thread-synchronous
* '->cred' pointer, not the '->real_cred' pointer that is
* visible to other threads under RCU.
- *
- * Also note that we did validate_creds() manually, not depending
- * on the validation in 'get_cred()'.
*/
get_new_cred((struct cred *)new);
- alter_cred_subscribers(new, 1);
rcu_assign_pointer(current->cred, new);
- alter_cred_subscribers(old, -1);
- kdebug("override_creds() = %p{%d,%d}", old,
- atomic_read(&old->usage),
- read_cred_subscribers(old));
+ kdebug("override_creds() = %p{%ld}", old,
+ atomic_long_read(&old->usage));
return old;
}
EXPORT_SYMBOL(override_creds);
@@ -603,15 +518,10 @@ void revert_creds(const struct cred *old)
{
const struct cred *override = current->cred;
- kdebug("revert_creds(%p{%d,%d})", old,
- atomic_read(&old->usage),
- read_cred_subscribers(old));
+ kdebug("revert_creds(%p{%ld})", old,
+ atomic_long_read(&old->usage));
- validate_creds(old);
- validate_creds(override);
- alter_cred_subscribers(old, 1);
rcu_assign_pointer(current->cred, old);
- alter_cred_subscribers(override, -1);
put_cred(override);
}
EXPORT_SYMBOL(revert_creds);
@@ -731,12 +641,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
kdebug("prepare_kernel_cred() alloc %p", new);
old = get_task_cred(daemon);
- validate_creds(old);
*new = *old;
new->non_rcu = 0;
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
+ atomic_long_set(&new->usage, 1);
get_uid(new->user);
get_user_ns(new->user_ns);
get_group_info(new->group_info);
@@ -760,7 +668,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
goto error;
put_cred(old);
- validate_creds(new);
return new;
error:
@@ -825,109 +732,3 @@ int set_create_files_as(struct cred *new, struct inode *inode)
return security_kernel_create_files_as(new, inode);
}
EXPORT_SYMBOL(set_create_files_as);
-
-#ifdef CONFIG_DEBUG_CREDENTIALS
-
-bool creds_are_invalid(const struct cred *cred)
-{
- if (cred->magic != CRED_MAGIC)
- return true;
- return false;
-}
-EXPORT_SYMBOL(creds_are_invalid);
-
-/*
- * dump invalid credentials
- */
-static void dump_invalid_creds(const struct cred *cred, const char *label,
- const struct task_struct *tsk)
-{
- pr_err("%s credentials: %p %s%s%s\n",
- label, cred,
- cred == &init_cred ? "[init]" : "",
- cred == tsk->real_cred ? "[real]" : "",
- cred == tsk->cred ? "[eff]" : "");
- pr_err("->magic=%x, put_addr=%p\n",
- cred->magic, cred->put_addr);
- pr_err("->usage=%d, subscr=%d\n",
- atomic_read(&cred->usage),
- read_cred_subscribers(cred));
- pr_err("->*uid = { %d,%d,%d,%d }\n",
- from_kuid_munged(&init_user_ns, cred->uid),
- from_kuid_munged(&init_user_ns, cred->euid),
- from_kuid_munged(&init_user_ns, cred->suid),
- from_kuid_munged(&init_user_ns, cred->fsuid));
- pr_err("->*gid = { %d,%d,%d,%d }\n",
- from_kgid_munged(&init_user_ns, cred->gid),
- from_kgid_munged(&init_user_ns, cred->egid),
- from_kgid_munged(&init_user_ns, cred->sgid),
- from_kgid_munged(&init_user_ns, cred->fsgid));
-#ifdef CONFIG_SECURITY
- pr_err("->security is %p\n", cred->security);
- if ((unsigned long) cred->security >= PAGE_SIZE &&
- (((unsigned long) cred->security & 0xffffff00) !=
- (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
- pr_err("->security {%x, %x}\n",
- ((u32*)cred->security)[0],
- ((u32*)cred->security)[1]);
-#endif
-}
-
-/*
- * report use of invalid credentials
- */
-void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line)
-{
- pr_err("Invalid credentials\n");
- pr_err("At %s:%u\n", file, line);
- dump_invalid_creds(cred, "Specified", current);
- BUG();
-}
-EXPORT_SYMBOL(__invalid_creds);
-
-/*
- * check the credentials on a process
- */
-void __validate_process_creds(struct task_struct *tsk,
- const char *file, unsigned line)
-{
- if (tsk->cred == tsk->real_cred) {
- if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
- creds_are_invalid(tsk->cred)))
- goto invalid_creds;
- } else {
- if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
- read_cred_subscribers(tsk->cred) < 1 ||
- creds_are_invalid(tsk->real_cred) ||
- creds_are_invalid(tsk->cred)))
- goto invalid_creds;
- }
- return;
-
-invalid_creds:
- pr_err("Invalid process credentials\n");
- pr_err("At %s:%u\n", file, line);
-
- dump_invalid_creds(tsk->real_cred, "Real", tsk);
- if (tsk->cred != tsk->real_cred)
- dump_invalid_creds(tsk->cred, "Effective", tsk);
- else
- pr_err("Effective creds == Real creds\n");
- BUG();
-}
-EXPORT_SYMBOL(__validate_process_creds);
-
-/*
- * check creds for do_exit()
- */
-void validate_creds_for_do_exit(struct task_struct *tsk)
-{
- kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
- tsk->real_cred, tsk->cred,
- atomic_read(&tsk->cred->usage),
- read_cred_subscribers(tsk->cred));
-
- __validate_process_creds(tsk, __FILE__, __LINE__);
-}
-
-#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index b481c48a31a6..d10613eb0f63 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -84,8 +84,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
void *addr;
int ret = -ENOMEM;
- /* Cannot allocate larger than MAX_ORDER */
- order = min(get_order(pool_size), MAX_ORDER);
+ /* Cannot allocate larger than MAX_PAGE_ORDER */
+ order = min(get_order(pool_size), MAX_PAGE_ORDER);
do {
pool_size = 1 << (PAGE_SHIFT + order);
@@ -190,7 +190,7 @@ static int __init dma_atomic_pool_init(void)
/*
* If coherent_pool was not used on the command line, default the pool
- * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER.
+ * sizes to 128KB per 1GB of memory, min 128KB, max MAX_PAGE_ORDER.
*/
if (!atomic_pool_size) {
unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 33d942615be5..176078bf2215 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -686,8 +686,8 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
size_t pool_size;
size_t tlb_size;
- if (nslabs > SLABS_PER_PAGE << MAX_ORDER) {
- nslabs = SLABS_PER_PAGE << MAX_ORDER;
+ if (nslabs > SLABS_PER_PAGE << MAX_PAGE_ORDER) {
+ nslabs = SLABS_PER_PAGE << MAX_PAGE_ORDER;
nareas = limit_nareas(nareas, nslabs);
}
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index d7ee4bc3f2ba..88cb3c88aaa5 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -15,26 +15,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
-/* See comment for enter_from_user_mode() in entry-common.h */
-static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
-{
- arch_enter_from_user_mode(regs);
- lockdep_hardirqs_off(CALLER_ADDR0);
-
- CT_WARN_ON(__ct_state() != CONTEXT_USER);
- user_exit_irqoff();
-
- instrumentation_begin();
- kmsan_unpoison_entry_regs(regs);
- trace_hardirqs_off_finish();
- instrumentation_end();
-}
-
-void noinstr enter_from_user_mode(struct pt_regs *regs)
-{
- __enter_from_user_mode(regs);
-}
-
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
if (unlikely(audit_context())) {
@@ -45,7 +25,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
}
}
-static long syscall_trace_enter(struct pt_regs *regs, long syscall,
+long syscall_trace_enter(struct pt_regs *regs, long syscall,
unsigned long work)
{
long ret = 0;
@@ -85,67 +65,24 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
return ret ? : syscall;
}
-static __always_inline long
-__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
-{
- unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
-
- if (work & SYSCALL_WORK_ENTER)
- syscall = syscall_trace_enter(regs, syscall, work);
-
- return syscall;
-}
-
-long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
-{
- return __syscall_enter_from_user_work(regs, syscall);
-}
-
-noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
-{
- long ret;
-
- __enter_from_user_mode(regs);
-
- instrumentation_begin();
- local_irq_enable();
- ret = __syscall_enter_from_user_work(regs, syscall);
- instrumentation_end();
-
- return ret;
-}
-
noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
- __enter_from_user_mode(regs);
+ enter_from_user_mode(regs);
instrumentation_begin();
local_irq_enable();
instrumentation_end();
}
-/* See comment for exit_to_user_mode() in entry-common.h */
-static __always_inline void __exit_to_user_mode(void)
-{
- instrumentation_begin();
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare();
- instrumentation_end();
-
- user_enter_irqoff();
- arch_exit_to_user_mode();
- lockdep_hardirqs_on(CALLER_ADDR0);
-}
-
-void noinstr exit_to_user_mode(void)
-{
- __exit_to_user_mode();
-}
-
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
-static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work)
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ * @regs: Pointer to pt_regs on entry stack
+ * @ti_work: TIF work flags as read by the caller
+ */
+__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
{
/*
* Before returning to user space ensure that all pending work
@@ -190,27 +127,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
return ti_work;
}
-static void exit_to_user_mode_prepare(struct pt_regs *regs)
-{
- unsigned long ti_work;
-
- lockdep_assert_irqs_disabled();
-
- /* Flush pending rcuog wakeup before the last need_resched() check */
- tick_nohz_user_enter_prepare();
-
- ti_work = read_thread_flags();
- if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
- ti_work = exit_to_user_mode_loop(regs, ti_work);
-
- arch_exit_to_user_mode_prepare(regs, ti_work);
-
- /* Ensure that kernel state is sane for a return to userspace */
- kmap_assert_nomap();
- lockdep_assert_irqs_disabled();
- lockdep_sys_exit();
-}
-
/*
* If SYSCALL_EMU is set, then the only reason to report is when
* SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
@@ -295,12 +211,12 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
instrumentation_begin();
__syscall_exit_to_user_mode_work(regs);
instrumentation_end();
- __exit_to_user_mode();
+ exit_to_user_mode();
}
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
- __enter_from_user_mode(regs);
+ enter_from_user_mode(regs);
}
noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
@@ -308,7 +224,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
instrumentation_begin();
exit_to_user_mode_prepare(regs);
instrumentation_end();
- __exit_to_user_mode();
+ exit_to_user_mode();
}
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b704d83a28b2..f0f0f71213a1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1814,31 +1814,34 @@ static inline void perf_event__state_init(struct perf_event *event)
PERF_EVENT_STATE_INACTIVE;
}
-static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
+static int __perf_event_read_size(u64 read_format, int nr_siblings)
{
int entry = sizeof(u64); /* value */
int size = 0;
int nr = 1;
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
size += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
size += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_ID)
+ if (read_format & PERF_FORMAT_ID)
entry += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_LOST)
+ if (read_format & PERF_FORMAT_LOST)
entry += sizeof(u64);
- if (event->attr.read_format & PERF_FORMAT_GROUP) {
+ if (read_format & PERF_FORMAT_GROUP) {
nr += nr_siblings;
size += sizeof(u64);
}
- size += entry * nr;
- event->read_size = size;
+ /*
+ * Since perf_event_validate_size() limits this to 16k and inhibits
+ * adding more siblings, this will never overflow.
+ */
+ return size + nr * entry;
}
static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
@@ -1888,8 +1891,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
*/
static void perf_event__header_size(struct perf_event *event)
{
- __perf_event_read_size(event,
- event->group_leader->nr_siblings);
+ event->read_size =
+ __perf_event_read_size(event->attr.read_format,
+ event->group_leader->nr_siblings);
__perf_event_header_size(event, event->attr.sample_type);
}
@@ -1920,23 +1924,44 @@ static void perf_event__id_header_size(struct perf_event *event)
event->id_header_size = size;
}
+/*
+ * Check that adding an event to the group does not result in anybody
+ * overflowing the 64k event limit imposed by the output buffer.
+ *
+ * Specifically, check that the read_size for the event does not exceed 16k,
+ * read_size being the one term that grows with group size. Since read_size
+ * depends on per-event read_format, also (re)check the existing events.
+ *
+ * This leaves 48k for the constant size fields and things like callchains,
+ * branch stacks and register sets.
+ */
static bool perf_event_validate_size(struct perf_event *event)
{
- /*
- * The values computed here will be over-written when we actually
- * attach the event.
- */
- __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
- __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
- perf_event__id_header_size(event);
+ struct perf_event *sibling, *group_leader = event->group_leader;
+
+ if (__perf_event_read_size(event->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
+
+ if (__perf_event_read_size(group_leader->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
/*
- * Sum the lot; should not exceed the 64k limit we have on records.
- * Conservative limit to allow for callchains and other variable fields.
+ * When creating a new group leader, group_leader->ctx is initialized
+ * after the size has been validated, but we cannot safely use
+ * for_each_sibling_event() until group_leader->ctx is set. A new group
+ * leader cannot have any siblings yet, so we can safely skip checking
+ * the non-existent siblings.
*/
- if (event->read_size + event->header_size +
- event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
- return false;
+ if (event == group_leader)
+ return true;
+
+ for_each_sibling_event(sibling, group_leader) {
+ if (__perf_event_read_size(sibling->attr.read_format,
+ group_leader->nr_siblings + 1) > 16*1024)
+ return false;
+ }
return true;
}
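To see where the 16k ceiling bites: with PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_LOST each member costs a 24-byte entry on top of a fixed 8-byte nr field, so a group crosses 16k at 683 members. A sketch of the same arithmetic (the FMT_* values mirror the uapi read_format bits):

#include <stdio.h>

#define FMT_TOTAL_TIME_ENABLED (1ULL << 0)
#define FMT_TOTAL_TIME_RUNNING (1ULL << 1)
#define FMT_ID                 (1ULL << 2)
#define FMT_GROUP              (1ULL << 3)
#define FMT_LOST               (1ULL << 4)

static int read_size(unsigned long long read_format, int nr_siblings)
{
        int entry = 8;                  /* u64 value */
        int size = 0, nr = 1;

        if (read_format & FMT_TOTAL_TIME_ENABLED)
                size += 8;
        if (read_format & FMT_TOTAL_TIME_RUNNING)
                size += 8;
        if (read_format & FMT_ID)
                entry += 8;
        if (read_format & FMT_LOST)
                entry += 8;
        if (read_format & FMT_GROUP) {
                nr += nr_siblings;
                size += 8;              /* u64 nr */
        }
        return size + nr * entry;
}

int main(void)
{
        unsigned long long fmt = FMT_GROUP | FMT_ID | FMT_LOST;

        /* 8 + 24 * 683 = 16400 > 16k: the 683rd member is rejected. */
        printf("682 members -> %d bytes\n", read_size(fmt, 681)); /* 16376 */
        printf("683 members -> %d bytes\n", read_size(fmt, 682)); /* 16400 */
        return 0;
}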
@@ -7372,6 +7397,14 @@ void perf_output_sample(struct perf_output_handle *handle,
if (branch_sample_hw_index(event))
perf_output_put(handle, data->br_stack->hw_idx);
perf_output_copy(handle, data->br_stack->entries, size);
+ /*
+ * Add the extension space which is appended
+ * right after the struct perf_branch_stack.
+ */
+ if (data->br_stack_cntr) {
+ size = data->br_stack->nr * sizeof(u64);
+ perf_output_copy(handle, data->br_stack_cntr, size);
+ }
} else {
/*
* we always store at least the value of nr
@@ -11400,9 +11433,30 @@ static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
+ &dev_attr_nr_addr_filters.attr,
+ NULL,
+};
+
+static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ if (n == 2 && !pmu->nr_addr_filters)
+ return 0;
+
+ return a->mode;
+}
+
+static struct attribute_group pmu_dev_attr_group = {
+ .is_visible = pmu_dev_is_visible,
+ .attrs = pmu_dev_attrs,
+};
+
+static const struct attribute_group *pmu_dev_groups[] = {
+ &pmu_dev_attr_group,
NULL,
};
-ATTRIBUTE_GROUPS(pmu_dev);
static int pmu_bus_running;
static struct bus_type pmu_bus = {
@@ -11439,18 +11493,11 @@ static int pmu_dev_alloc(struct pmu *pmu)
if (ret)
goto free_dev;
- /* For PMUs with address filters, throw in an extra attribute: */
- if (pmu->nr_addr_filters)
- ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
-
- if (ret)
- goto del_dev;
-
- if (pmu->attr_update)
+ if (pmu->attr_update) {
ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
-
- if (ret)
- goto del_dev;
+ if (ret)
+ goto del_dev;
+ }
out:
return ret;
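Replacing device_create_file() with a static attribute list plus an is_visible() callback is the standard sysfs idiom for conditional attributes; the n == 2 check above works because nr_addr_filters is the third entry in pmu_dev_attrs. The shape of the idiom, sketched outside perf — demo_*, has_extra and the DEVICE_ATTR_*() declarations they imply are hypothetical, and this is not a self-contained module:

/* Hypothetical driver: hide "extra" unless the device supports it. */
static struct attribute *demo_attrs[] = {
        &dev_attr_mode.attr,    /* index 0 */
        &dev_attr_extra.attr,   /* index 1, conditional */
        NULL,
};

static umode_t demo_is_visible(struct kobject *kobj,
                               struct attribute *attr, int n)
{
        struct device *dev = kobj_to_dev(kobj);
        struct demo_priv *priv = dev_get_drvdata(dev);

        if (n == 1 && !priv->has_extra)
                return 0;       /* omit the attribute entirely */
        return attr->mode;
}

static const struct attribute_group demo_group = {
        .is_visible = demo_is_visible,
        .attrs = demo_attrs,
};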
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index e8d82c2f07d0..60ed43d1c29e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -610,8 +610,8 @@ static struct page *rb_alloc_aux_page(int node, int order)
{
struct page *page;
- if (order > MAX_ORDER)
- order = MAX_ORDER;
+ if (order > MAX_PAGE_ORDER)
+ order = MAX_PAGE_ORDER;
do {
page = alloc_pages_node(node, PERF_AUX_GFP, order);
@@ -702,9 +702,9 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
/*
* kcalloc_node() is unable to allocate buffer if the size is larger
- * than: PAGE_SIZE << MAX_ORDER; directly bail out in this case.
+ * than: PAGE_SIZE << MAX_PAGE_ORDER; directly bail out in this case.
*/
- if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER)
+ if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_PAGE_ORDER)
return -ENOMEM;
rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
node);
@@ -821,7 +821,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
size = sizeof(struct perf_buffer);
size += nr_pages * sizeof(void *);
- if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER)
+ if (order_base_2(size) > PAGE_SHIFT+MAX_PAGE_ORDER)
goto fail;
node = (cpu == -1) ? cpu : cpu_to_node(cpu);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 435aac1d8c27..485bb0389b48 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -181,7 +181,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
if (new_page) {
folio_get(new_folio);
- page_add_new_anon_rmap(new_page, vma, addr);
+ folio_add_new_anon_rmap(new_folio, vma, addr);
folio_add_lru_vma(new_folio, vma);
} else
/* no new page, just dec_mm_counter for old_page */
@@ -198,7 +198,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
set_pte_at_notify(mm, addr, pvmw.pte,
mk_pte(new_page, vma->vm_page_prot));
- page_remove_rmap(old_page, vma, false);
+ folio_remove_rmap_pte(old_folio, old_page, vma);
if (!folio_mapped(old_folio))
folio_free_swap(old_folio);
page_vma_mapped_walk_done(&pvmw);
diff --git a/kernel/exit.c b/kernel/exit.c
index ee9f43bed49a..aedc0832c9f4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -824,8 +824,6 @@ void __noreturn do_exit(long code)
ptrace_event(PTRACE_EVENT_EXIT, code);
user_events_exit(tsk);
- validate_creds_for_do_exit(tsk);
-
io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
@@ -909,7 +907,6 @@ void __noreturn do_exit(long code)
if (tsk->task_frag.page)
put_page(tsk->task_frag.page);
- validate_creds_for_do_exit(tsk);
exit_task_stack_account(tsk);
check_stack_usage();
diff --git a/kernel/fork.c b/kernel/fork.c
index 10917c3e1f03..b32e323adbbf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -165,7 +165,6 @@ void __weak arch_release_task_struct(struct task_struct *tsk)
{
}
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;
static inline struct task_struct *alloc_task_struct_node(int node)
@@ -177,9 +176,6 @@ static inline void free_task_struct(struct task_struct *tsk)
{
kmem_cache_free(task_struct_cachep, tsk);
}
-#endif
-
-#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
/*
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
@@ -412,24 +408,6 @@ void thread_stack_cache_init(void)
}
# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
-#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
-
-static int alloc_thread_stack_node(struct task_struct *tsk, int node)
-{
- unsigned long *stack;
-
- stack = arch_alloc_thread_stack_node(tsk, node);
- tsk->stack = stack;
- return stack ? 0 : -ENOMEM;
-}
-
-static void free_thread_stack(struct task_struct *tsk)
-{
- arch_free_thread_stack(tsk);
- tsk->stack = NULL;
-}
-
-#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -650,7 +628,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
int retval;
unsigned long charge = 0;
LIST_HEAD(uf);
- VMA_ITERATOR(old_vmi, oldmm, 0);
VMA_ITERATOR(vmi, mm, 0);
uprobe_start_dup_mmap();
@@ -678,16 +655,22 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
goto out;
khugepaged_fork(mm, oldmm);
- retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
- if (retval)
+ /* Use __mt_dup() to efficiently build an identical maple tree. */
+ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+ if (unlikely(retval))
goto out;
mt_clear_in_rcu(vmi.mas.tree);
- for_each_vma(old_vmi, mpnt) {
+ for_each_vma(vmi, mpnt) {
struct file *file;
vma_start_write(mpnt);
if (mpnt->vm_flags & VM_DONTCOPY) {
+ retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+ mpnt->vm_end, GFP_KERNEL);
+ if (retval)
+ goto loop_out;
+
vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
continue;
}
@@ -749,9 +732,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (is_vm_hugetlb_page(tmp))
hugetlb_dup_vma_private(tmp);
- /* Link the vma into the MT */
- if (vma_iter_bulk_store(&vmi, tmp))
- goto fail_nomem_vmi_store;
+ /*
+ * Link the vma into the MT. After using __mt_dup(), memory
+ * allocation is not necessary here, so it cannot fail.
+ */
+ vma_iter_bulk_store(&vmi, tmp);
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
@@ -760,15 +745,28 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
- if (retval)
+ if (retval) {
+ mpnt = vma_next(&vmi);
goto loop_out;
+ }
}
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
loop_out:
vma_iter_free(&vmi);
- if (!retval)
+ if (!retval) {
mt_set_in_rcu(vmi.mas.tree);
+ } else if (mpnt) {
+ /*
+ * The entire maple tree has already been duplicated. If the
+ * mmap duplication fails, mark the failure point with
+ * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
+ * stop releasing VMAs that have not been duplicated after this
+ * point.
+ */
+ mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+ mas_store(&vmi.mas, XA_ZERO_ENTRY);
+ }
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
@@ -778,8 +776,6 @@ fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
-fail_nomem_vmi_store:
- unlink_anon_vmas(tmp);
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
fail_nomem_policy:
@@ -1021,7 +1017,6 @@ static void set_max_threads(unsigned int max_threads_suggested)
int arch_task_struct_size __read_mostly;
#endif
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
/* Fetch thread_struct whitelist for the architecture. */
@@ -1036,12 +1031,10 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
else
*offset += offsetof(struct task_struct, thread);
}
-#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
void __init fork_init(void)
{
int i;
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN 0
#endif
@@ -1054,7 +1047,6 @@ void __init fork_init(void)
arch_task_struct_size, align,
SLAB_PANIC|SLAB_ACCOUNT,
useroffset, usersize, NULL);
-#endif
/* do the arch specific task caches init */
arch_task_cache_init();
@@ -1588,7 +1580,7 @@ static void complete_vfork_done(struct task_struct *tsk)
static int wait_for_vfork_done(struct task_struct *child,
struct completion *vfork)
{
- unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE;
+ unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
int killed;
cgroup_enter_frozen();
@@ -2928,7 +2920,7 @@ pid_t kernel_clone(struct kernel_clone_args *args)
get_task_struct(p);
}
- if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+ if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
/* lock the task to synchronize with memcg migration */
task_lock(p);
lru_gen_add_mm(p->mm);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index c450fa8b8b5e..f57aaf96b829 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -187,6 +187,7 @@ static int __restore_freezer_state(struct task_struct *p, void *arg)
if (state != TASK_RUNNING) {
WRITE_ONCE(p->__state, state);
+ p->saved_state = TASK_RUNNING;
return 1;
}
@@ -201,7 +202,7 @@ void __thaw_task(struct task_struct *p)
if (WARN_ON_ONCE(freezing(p)))
goto unlock;
- if (task_call_func(p, __restore_freezer_state, NULL))
+ if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL))
goto unlock;
wake_up_state(p, TASK_FROZEN);
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index be5642a4ec49..a08031b57a61 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -52,6 +52,8 @@ atomic_t __kexec_lock = ATOMIC_INIT(0);
/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;
+bool kexec_file_dbg_print;
+
int kexec_should_crash(struct task_struct *p)
{
/*
@@ -276,8 +278,8 @@ int kimage_is_destination_range(struct kimage *image,
unsigned long mstart, mend;
mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- if ((end > mstart) && (start < mend))
+ mend = mstart + image->segment[i].memsz - 1;
+ if ((end >= mstart) && (start <= mend))
return 1;
}
@@ -370,7 +372,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
pfn = page_to_boot_pfn(pages);
epfn = pfn + count;
addr = pfn << PAGE_SHIFT;
- eaddr = epfn << PAGE_SHIFT;
+ eaddr = (epfn << PAGE_SHIFT) - 1;
if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
kimage_is_destination_range(image, addr, eaddr)) {
list_add(&pages->lru, &extra_pages);
@@ -430,7 +432,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
pages = NULL;
size = (1 << order) << PAGE_SHIFT;
- hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+ hole_start = ALIGN(image->control_page, size);
hole_end = hole_start + size - 1;
while (hole_end <= crashk_res.end) {
unsigned long i;
@@ -447,7 +449,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
mend = mstart + image->segment[i].memsz - 1;
if ((hole_end >= mstart) && (hole_start <= mend)) {
/* Advance the hole to the end of the segment */
- hole_start = (mend + (size - 1)) & ~(size - 1);
+ hole_start = ALIGN(mend, size);
hole_end = hole_start + size - 1;
break;
}
@@ -455,7 +457,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
/* If I don't overlap any segments I have found my hole! */
if (i == image->nr_segments) {
pages = pfn_to_page(hole_start >> PAGE_SHIFT);
- image->control_page = hole_end;
+ image->control_page = hole_end + 1;
break;
}
}
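ALIGN() and ALIGN_DOWN() are exactly the open-coded mask arithmetic they replace, for power-of-two sizes. A quick user-space check of the equivalence (macro definitions inlined here for the demo; the kernel's are spelled slightly differently but compute the same values):

#include <assert.h>
#include <stdio.h>

#define ALIGN(x, a)      (((x) + ((a) - 1)) & ~((a) - 1))
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

int main(void)
{
        unsigned long size = 1UL << 14; /* e.g. an order-2 hole, 16 KiB */

        assert(ALIGN(0x12345UL, size) == 0x14000UL);
        assert(ALIGN(0x14000UL, size) == 0x14000UL);    /* already aligned */
        assert(ALIGN_DOWN(0x147ffUL, size) == 0x14000UL);

        /* Identical to the open-coded form being replaced: */
        assert(ALIGN(0x12345UL, size) ==
               ((0x12345UL + (size - 1)) & ~(size - 1)));
        printf("ok\n");
        return 0;
}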
@@ -716,7 +718,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
/* If the page is not a destination page use it */
if (!kimage_is_destination_range(image, addr,
- addr + PAGE_SIZE))
+ addr + PAGE_SIZE - 1))
break;
/*
@@ -1063,9 +1065,10 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs)
* panic(). Otherwise parallel calls of panic() and crash_kexec()
* may stop each other. To exclude them, we use panic_cpu here too.
*/
+ old_cpu = PANIC_CPU_INVALID;
this_cpu = raw_smp_processor_id();
- old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
- if (old_cpu == PANIC_CPU_INVALID) {
+
+ if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) {
/* This is the 1st CPU which comes here, so go ahead. */
__crash_kexec(regs);
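atomic_try_cmpxchg() folds the compare against the returned old value into the call itself, updating the expected value in place on failure. The shape of the conversion, sketched with C11 atomics as a user-space analogue:

#include <stdatomic.h>
#include <stdio.h>

#define PANIC_CPU_INVALID -1

static atomic_int panic_cpu = PANIC_CPU_INVALID;

int main(void)
{
        int this_cpu = 0;

        /* Old style: fetch the old value, then compare it by hand:
         *   old = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
         *   if (old == PANIC_CPU_INVALID) ...
         * New style: 'expected' is updated in place on failure. */
        int expected = PANIC_CPU_INVALID;

        if (atomic_compare_exchange_strong(&panic_cpu, &expected, this_cpu))
                printf("first CPU in, proceeding\n");
        else
                printf("CPU %d is already panicking\n", expected);
        return 0;
}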
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index f9a419cd22d4..bef2f6f2571b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -123,6 +123,8 @@ void kimage_file_post_load_cleanup(struct kimage *image)
*/
kfree(image->image_loader_data);
image->image_loader_data = NULL;
+
+ kexec_file_dbg_print = false;
}
#ifdef CONFIG_KEXEC_SIG
@@ -202,6 +204,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
if (ret < 0)
return ret;
image->kernel_buf_len = ret;
+ kexec_dprintk("kernel: %p kernel_size: %#lx\n",
+ image->kernel_buf, image->kernel_buf_len);
/* Call arch image probe handlers */
ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
@@ -278,6 +282,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
if (!image)
return -ENOMEM;
+ kexec_file_dbg_print = !!(flags & KEXEC_FILE_DEBUG);
image->file_mode = 1;
if (kexec_on_panic) {
@@ -384,13 +389,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
if (ret)
goto out;
+ kexec_dprintk("nr_segments = %lu\n", image->nr_segments);
for (i = 0; i < image->nr_segments; i++) {
struct kexec_segment *ksegment;
ksegment = &image->segment[i];
- pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
- i, ksegment->buf, ksegment->bufsz, ksegment->mem,
- ksegment->memsz);
+ kexec_dprintk("segment[%d]: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+ i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+ ksegment->memsz);
ret = kimage_load_segment(image, &image->segment[i]);
if (ret)
@@ -403,6 +409,8 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
if (ret)
goto out;
+ kexec_dprintk("kexec_file_load: type:%u, start:0x%lx head:0x%lx flags:0x%lx\n",
+ image->type, image->start, image->head, flags);
/*
* Free up any temporary buffers allocated which are not needed
* after image has been loaded
@@ -426,11 +434,11 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
unsigned long temp_start, temp_end;
temp_end = min(end, kbuf->buf_max);
- temp_start = temp_end - kbuf->memsz;
+ temp_start = temp_end - kbuf->memsz + 1;
do {
/* align down start */
- temp_start = temp_start & (~(kbuf->buf_align - 1));
+ temp_start = ALIGN_DOWN(temp_start, kbuf->buf_align);
if (temp_start < start || temp_start < kbuf->buf_min)
return 0;
@@ -592,6 +600,8 @@ static int kexec_walk_resources(struct kexec_buf *kbuf,
IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
crashk_res.start, crashk_res.end,
kbuf, func);
+ else if (kbuf->top_down)
+ return walk_system_ram_res_rev(0, ULONG_MAX, kbuf, func);
else
return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 075a632e6c7c..d5a0ee40bf66 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2252,7 +2252,7 @@ int register_kretprobe(struct kretprobe *rp)
rp->rph = NULL;
return -ENOMEM;
}
- rp->rph->rp = rp;
+ rcu_assign_pointer(rp->rph->rp, rp);
rp->nmissed = 0;
/* Establish function entry probe point */
ret = register_kprobe(&rp->kp);
@@ -2300,7 +2300,7 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
rethook_free(rps[i]->rh);
#else
- rps[i]->rph->rp = NULL;
+ rcu_assign_pointer(rps[i]->rph->rp, NULL);
#endif
}
mutex_unlock(&kprobe_mutex);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e85b5ad3e206..151bd3de5936 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3497,7 +3497,8 @@ static int alloc_chain_hlocks(int req)
size = chain_block_size(curr);
if (likely(size >= req)) {
del_chain_block(0, size, chain_block_next(curr));
- add_chain_block(curr + req, size - req);
+ if (size > req)
+ add_chain_block(curr + req, size - req);
return curr;
}
}
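The added check prevents a zero-size remainder from being returned to the freelist when a block matches the request exactly. The split logic in isolation, as a toy freelist rather than lockdep's real bucket structures:

#include <stdio.h>

/* Toy: carve 'req' slots out of a free block of 'size' slots starting
 * at 'base'; return the remainder to the freelist only if non-empty. */
static void add_free_block(int base, int size)
{
        printf("freelist += [%d, %d)\n", base, base + size);
}

static int alloc_from_block(int base, int size, int req)
{
        if (size < req)
                return -1;
        if (size > req)         /* the fix: skip zero-size remainders */
                add_free_block(base + req, size - req);
        return base;
}

int main(void)
{
        alloc_from_block(100, 8, 8);    /* exact fit: nothing re-added */
        alloc_from_block(100, 12, 8);   /* remainder [108, 112) re-added */
        return 0;
}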
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 2deeeca3e71b..cbae8c0b89ab 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -532,6 +532,11 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
* This function must not be used in interrupt context. Unlocking
* of a not locked mutex is not allowed.
*
+ * The caller must ensure that the mutex stays alive until this function has
+ * returned - mutex_unlock() can NOT directly be used to release an object such
+ * that another concurrent task can free it.
+ * Mutexes are different from spinlocks & refcounts in this aspect.
+ *
* This function is similar to (but not equivalent to) up().
*/
void __sched mutex_unlock(struct mutex *lock)
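The same lifetime rule holds for pthread mutexes in user space: the unlocker must not race against a free performed by whoever acquires the lock next. A hedged sketch of one conventional fix — pinning the enclosing object with a reference count so the lock itself is never what keeps the object alive:

/* Hazard: if another task may free 'obj' as soon as it takes the lock,
 * this task's unlock could still be touching freed memory. Here each
 * user holds a reference (obj_get() omitted for brevity) and the free
 * happens only on the final obj_put(), after the unlock has returned. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct obj {
        pthread_mutex_t lock;
        atomic_int refs;
};

static void obj_put(struct obj *o)
{
        if (atomic_fetch_sub(&o->refs, 1) == 1) {
                pthread_mutex_destroy(&o->lock);
                free(o);
        }
}

static void use_and_release(struct obj *o)
{
        pthread_mutex_lock(&o->lock);
        /* ... operate on o ... */
        pthread_mutex_unlock(&o->lock);
        obj_put(o);     /* last ref frees; the unlock has already finished */
}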
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index d5610ad52b92..75a6f6133866 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -11,6 +11,13 @@
* called from interrupt context and we have preemption disabled while
* spinning.
*/
+
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
+ int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # + 1 value */
+};
+
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
/*
@@ -37,32 +44,28 @@ static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
+ *
+ * If osq_lock() is being cancelled there must be a previous node
+ * and 'old_cpu' is its CPU #.
+ * For osq_unlock() there is never a previous node and old_cpu is
+ * set to OSQ_UNLOCKED_VAL.
*/
static inline struct optimistic_spin_node *
osq_wait_next(struct optimistic_spin_queue *lock,
struct optimistic_spin_node *node,
- struct optimistic_spin_node *prev)
+ int old_cpu)
{
- struct optimistic_spin_node *next = NULL;
int curr = encode_cpu(smp_processor_id());
- int old;
-
- /*
- * If there is a prev node in queue, then the 'old' value will be
- * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
- * we're currently last in queue, then the queue will then become empty.
- */
- old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
for (;;) {
if (atomic_read(&lock->tail) == curr &&
- atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
+ atomic_cmpxchg_acquire(&lock->tail, curr, old_cpu) == curr) {
/*
* We were the last queued, we moved @lock back. @prev
* will now observe @lock and will complete its
* unlock()/unqueue().
*/
- break;
+ return NULL;
}
/*
@@ -76,15 +79,15 @@ osq_wait_next(struct optimistic_spin_queue *lock,
* wait for a new @node->next from its Step-C.
*/
if (node->next) {
+ struct optimistic_spin_node *next;
+
next = xchg(&node->next, NULL);
if (next)
- break;
+ return next;
}
cpu_relax();
}
-
- return next;
}
bool osq_lock(struct optimistic_spin_queue *lock)
@@ -186,7 +189,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* back to @prev.
*/
- next = osq_wait_next(lock, node, prev);
+ next = osq_wait_next(lock, node, prev->cpu);
if (!next)
return false;
@@ -226,7 +229,7 @@ void osq_unlock(struct optimistic_spin_queue *lock)
return;
}
- next = osq_wait_next(lock, node, NULL);
+ next = osq_wait_next(lock, node, OSQ_UNLOCKED_VAL);
if (next)
WRITE_ONCE(next->locked, 1);
}
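
The now-local struct optimistic_spin_node stores cpu as "CPU# + 1" so that 0 stays reserved for OSQ_UNLOCKED_VAL; that is what lets osq_wait_next() take a plain old_cpu integer instead of a prev-node pointer. The encoding in a runnable nutshell (encode_cpu mirrors this file; the decode helper here is illustrative, since the kernel's decode_cpu() maps back to a per-CPU node):

#include <stdio.h>

#define OSQ_UNLOCKED_VAL 0	/* tail == 0 means nobody is queued */

static int encode_cpu(int cpu_nr)	{ return cpu_nr + 1; }
static int decoded_cpu_nr(int enc)	{ return enc - 1; }

int main(void)
{
	int curr = encode_cpu(0);	/* CPU 0 encodes to 1, never 0 */

	printf("encoded=%d decoded=%d unlocked=%d\n",
	       curr, decoded_cpu_nr(curr), OSQ_UNLOCKED_VAL);
	return 0;
}
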
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 98fedfdb8db5..36681911c05a 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2199,6 +2199,9 @@ static int find_module_sections(struct module *mod, struct load_info *info)
mod->kunit_suites = section_objs(info, ".kunit_test_suites",
sizeof(*mod->kunit_suites),
&mod->num_kunit_suites);
+ mod->kunit_init_suites = section_objs(info, ".kunit_init_test_suites",
+ sizeof(*mod->kunit_init_suites),
+ &mod->num_kunit_init_suites);
#endif
mod->extable = section_objs(info, "__ex_table",
diff --git a/kernel/pid.c b/kernel/pid.c
index 6500ef956f2f..b52b10865454 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -700,7 +700,7 @@ static int pidfd_getfd(struct pid *pid, int fd)
if (IS_ERR(file))
return PTR_ERR(file);
- ret = receive_fd(file, O_CLOEXEC);
+ ret = receive_fd(file, NULL, O_CLOEXEC);
fput(file);
return ret;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index dee341ae4ace..4b0b7cf2e019 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -642,9 +642,9 @@ int hibernation_platform_enter(void)
*/
static void power_down(void)
{
-#ifdef CONFIG_SUSPEND
int error;
+#ifdef CONFIG_SUSPEND
if (hibernation_mode == HIBERNATION_SUSPEND) {
error = suspend_devices_and_enter(mem_sleep_current);
if (error) {
@@ -667,7 +667,13 @@ static void power_down(void)
kernel_restart(NULL);
break;
case HIBERNATION_PLATFORM:
- hibernation_platform_enter();
+ error = hibernation_platform_enter();
+ if (error == -EAGAIN || error == -EBUSY) {
+ swsusp_unmark();
+ events_check_enabled = false;
+ pr_info("Wakeup event detected during hibernation, rolling back.\n");
+ return;
+ }
fallthrough;
case HIBERNATION_SHUTDOWN:
if (kernel_can_power_off())
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f6425ae3e8b0..b1ae9b677d03 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -60,22 +60,6 @@ EXPORT_SYMBOL_GPL(lock_system_sleep);
void unlock_system_sleep(unsigned int flags)
{
- /*
- * Don't use freezer_count() because we don't want the call to
- * try_to_freeze() here.
- *
- * Reason:
- * Fundamentally, we just don't need it, because freezing condition
- * doesn't come into effect until we release the
- * system_transition_mutex lock, since the freezer always works with
- * system_transition_mutex held.
- *
- * More importantly, in the case of hibernation,
- * unlock_system_sleep() gets called in snapshot_read() and
- * snapshot_write() when the freezing condition is still in effect.
- * Which means, if we use try_to_freeze() here, it would make them
- * enter the refrigerator, thus causing hibernation to lockup.
- */
if (!(flags & PF_NOFREEZE))
current->flags &= ~PF_NOFREEZE;
mutex_unlock(&system_transition_mutex);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 17fd9aaaf084..8499a39c62f4 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -175,6 +175,8 @@ extern int swsusp_write(unsigned int flags);
void swsusp_close(void);
#ifdef CONFIG_SUSPEND
extern int swsusp_unmark(void);
+#else
+static inline int swsusp_unmark(void) { return 0; }
#endif
struct __kernel_old_timeval;
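
power_down() above now reaches swsusp_unmark() even in !CONFIG_SUSPEND builds, so the header grows a no-op stub. This is the standard kernel idiom for keeping call sites free of conditional compilation; a generic sketch with hypothetical names:

/* In the header: */
#ifdef CONFIG_FOO
extern int foo_unmark(void);				/* real implementation */
#else
static inline int foo_unmark(void) { return 0; }	/* compiles away */
#endif

/* At the call site, no #ifdef clutter is needed: */
static void foo_rollback(void)
{
	foo_unmark();
}
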
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 50a15408c3fc..5c96ff067c64 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1119,7 +1119,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
int create_basic_memory_bitmaps(void)
{
struct memory_bitmap *bm1, *bm2;
- int error = 0;
+ int error;
if (forbidden_pages_map && free_pages_map)
return 0;
@@ -1487,11 +1487,11 @@ static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
s_page = pfn_to_page(src_pfn);
d_page = pfn_to_page(dst_pfn);
if (PageHighMem(s_page)) {
- src = kmap_atomic(s_page);
- dst = kmap_atomic(d_page);
+ src = kmap_local_page(s_page);
+ dst = kmap_local_page(d_page);
zeros_only = do_copy_page(dst, src);
- kunmap_atomic(dst);
- kunmap_atomic(src);
+ kunmap_local(dst);
+ kunmap_local(src);
} else {
if (PageHighMem(d_page)) {
/*
@@ -1499,9 +1499,9 @@ static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
* data modified by kmap_atomic()
*/
zeros_only = safe_copy_page(buffer, s_page);
- dst = kmap_atomic(d_page);
+ dst = kmap_local_page(d_page);
copy_page(dst, buffer);
- kunmap_atomic(dst);
+ kunmap_local(dst);
} else {
zeros_only = safe_copy_page(page_address(d_page), s_page);
}
@@ -2778,7 +2778,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
int snapshot_write_next(struct snapshot_handle *handle)
{
static struct chain_allocator ca;
- int error = 0;
+ int error;
next:
/* Check if we have already loaded the entire image */
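
The kmap_atomic() to kmap_local_page() conversion drops the implicit preemption/pagefault disabling; the rule that carries over is that local mappings nest like a stack and must be released in reverse order of creation, as copy_data_page() does above. A minimal kernel-style fragment of the pattern (copy_highpage_sketch is a hypothetical name):

#include <linux/highmem.h>
#include <linux/string.h>

static void copy_highpage_sketch(struct page *dst_page, struct page *src_page)
{
	void *src = kmap_local_page(src_page);
	void *dst = kmap_local_page(dst_page);

	memcpy(dst, src, PAGE_SIZE);

	kunmap_local(dst);	/* mapped last, so unmapped first */
	kunmap_local(src);
}
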
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a2cb0babb5ec..6053ddddaf65 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -451,7 +451,7 @@ err_close:
static int swap_write_page(struct swap_map_handle *handle, void *buf,
struct hib_bio_batch *hb)
{
- int error = 0;
+ int error;
sector_t offset;
if (!handle->cur)
@@ -606,11 +606,11 @@ static int crc32_threadfn(void *data)
unsigned i;
while (1) {
- wait_event(d->go, atomic_read(&d->ready) ||
+ wait_event(d->go, atomic_read_acquire(&d->ready) ||
kthread_should_stop());
if (kthread_should_stop()) {
d->thr = NULL;
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
break;
}
@@ -619,7 +619,7 @@ static int crc32_threadfn(void *data)
for (i = 0; i < d->run_threads; i++)
*d->crc32 = crc32_le(*d->crc32,
d->unc[i], *d->unc_len[i]);
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
return 0;
@@ -649,12 +649,12 @@ static int lzo_compress_threadfn(void *data)
struct cmp_data *d = data;
while (1) {
- wait_event(d->go, atomic_read(&d->ready) ||
+ wait_event(d->go, atomic_read_acquire(&d->ready) ||
kthread_should_stop());
if (kthread_should_stop()) {
d->thr = NULL;
d->ret = -1;
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
break;
}
@@ -663,7 +663,7 @@ static int lzo_compress_threadfn(void *data)
d->ret = lzo1x_1_compress(d->unc, d->unc_len,
d->cmp + LZO_HEADER, &d->cmp_len,
d->wrk);
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
return 0;
@@ -798,7 +798,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
data[thr].unc_len = off;
- atomic_set(&data[thr].ready, 1);
+ atomic_set_release(&data[thr].ready, 1);
wake_up(&data[thr].go);
}
@@ -806,12 +806,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
break;
crc->run_threads = thr;
- atomic_set(&crc->ready, 1);
+ atomic_set_release(&crc->ready, 1);
wake_up(&crc->go);
for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
wait_event(data[thr].done,
- atomic_read(&data[thr].stop));
+ atomic_read_acquire(&data[thr].stop));
atomic_set(&data[thr].stop, 0);
ret = data[thr].ret;
@@ -850,7 +850,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
}
}
- wait_event(crc->done, atomic_read(&crc->stop));
+ wait_event(crc->done, atomic_read_acquire(&crc->stop));
atomic_set(&crc->stop, 0);
}
@@ -1132,12 +1132,12 @@ static int lzo_decompress_threadfn(void *data)
struct dec_data *d = data;
while (1) {
- wait_event(d->go, atomic_read(&d->ready) ||
+ wait_event(d->go, atomic_read_acquire(&d->ready) ||
kthread_should_stop());
if (kthread_should_stop()) {
d->thr = NULL;
d->ret = -1;
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
break;
}
@@ -1150,7 +1150,7 @@ static int lzo_decompress_threadfn(void *data)
flush_icache_range((unsigned long)d->unc,
(unsigned long)d->unc + d->unc_len);
- atomic_set(&d->stop, 1);
+ atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
return 0;
@@ -1335,7 +1335,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
if (crc->run_threads) {
- wait_event(crc->done, atomic_read(&crc->stop));
+ wait_event(crc->done, atomic_read_acquire(&crc->stop));
atomic_set(&crc->stop, 0);
crc->run_threads = 0;
}
@@ -1371,7 +1371,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
pg = 0;
}
- atomic_set(&data[thr].ready, 1);
+ atomic_set_release(&data[thr].ready, 1);
wake_up(&data[thr].go);
}
@@ -1390,7 +1390,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
wait_event(data[thr].done,
- atomic_read(&data[thr].stop));
+ atomic_read_acquire(&data[thr].stop));
atomic_set(&data[thr].stop, 0);
ret = data[thr].ret;
@@ -1421,7 +1421,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
ret = snapshot_write_next(snapshot);
if (ret <= 0) {
crc->run_threads = thr + 1;
- atomic_set(&crc->ready, 1);
+ atomic_set_release(&crc->ready, 1);
wake_up(&crc->go);
goto out_finish;
}
@@ -1429,13 +1429,13 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
crc->run_threads = thr;
- atomic_set(&crc->ready, 1);
+ atomic_set_release(&crc->ready, 1);
wake_up(&crc->go);
}
out_finish:
if (crc->run_threads) {
- wait_event(crc->done, atomic_read(&crc->stop));
+ wait_event(crc->done, atomic_read_acquire(&crc->stop));
atomic_set(&crc->stop, 0);
}
stop = ktime_get();
@@ -1566,7 +1566,6 @@ put:
/**
* swsusp_close - close resume device.
- * @exclusive: Close the resume device which is exclusively opened.
*/
void swsusp_close(void)
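
Every ready/stop flag in swap.c is a hand-off between a producer filling buffers and a consumer thread woken through wait_event(); plain atomic_set()/atomic_read() order only the flag itself, so the patch upgrades them to release/acquire pairs that also publish the buffer contents. The same pairing in runnable userspace C11 (pthreads stand in for the kthreads):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int payload;		/* plays the role of d->unc[] */
static atomic_int ready;	/* plays the role of d->ready */

static void *worker(void *arg)
{
	/* atomic_read_acquire(): once ready reads as 1, the producer's
	 * payload writes are guaranteed visible. */
	while (!atomic_load_explicit(&ready, memory_order_acquire))
		;
	printf("saw payload=%d\n", payload);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	payload = 42;
	/* atomic_set_release(): orders the payload store before the flag. */
	atomic_store_explicit(&ready, 1, memory_order_release);
	pthread_join(t, NULL);
	return 0;
}
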
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d8b5e13a2229..2fabd497d659 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -145,20 +145,9 @@ void __ptrace_unlink(struct task_struct *child)
*/
if (!(child->flags & PF_EXITING) &&
(child->signal->flags & SIGNAL_STOP_STOPPED ||
- child->signal->group_stop_count)) {
+ child->signal->group_stop_count))
child->jobctl |= JOBCTL_STOP_PENDING;
- /*
- * This is only possible if this thread was cloned by the
- * traced task running in the stopped group, set the signal
- * for the future reports.
- * FIXME: we should change ptrace_init_task() to handle this
- * case.
- */
- if (!(child->jobctl & JOBCTL_STOP_SIGMASK))
- child->jobctl |= SIGSTOP;
- }
-
/*
* If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
* @child in the butt. Note that @resume should be used iff @child
@@ -386,6 +375,34 @@ static int check_ptrace_options(unsigned long data)
return 0;
}
+static inline void ptrace_set_stopped(struct task_struct *task)
+{
+ guard(spinlock)(&task->sighand->siglock);
+
+ /*
+ * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
+ * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
+ * will be cleared if the child completes the transition or any
+ * event which clears the group stop states happens. We'll wait
+ * for the transition to complete before returning from this
+ * function.
+ *
+ * This hides STOPPED -> RUNNING -> TRACED transition from the
+ * attaching thread but a different thread in the same group can
+ * still observe the transient RUNNING state. IOW, if another
+ * thread's WNOHANG wait(2) on the stopped tracee races against
+ * ATTACH, the wait(2) may fail due to the transient RUNNING.
+ *
+ * The following task_is_stopped() test is safe as both transitions
+ * in and out of STOPPED are protected by siglock.
+ */
+ if (task_is_stopped(task) &&
+ task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
+ task->jobctl &= ~JOBCTL_STOPPED;
+ signal_wake_up_state(task, __TASK_STOPPED);
+ }
+}
+
static int ptrace_attach(struct task_struct *task, long request,
unsigned long addr,
unsigned long flags)
@@ -393,17 +410,17 @@ static int ptrace_attach(struct task_struct *task, long request,
bool seize = (request == PTRACE_SEIZE);
int retval;
- retval = -EIO;
if (seize) {
if (addr != 0)
- goto out;
+ return -EIO;
/*
* This duplicates the check in check_ptrace_options() because
* ptrace_attach() and ptrace_setoptions() have historically
* used different error codes for unknown ptrace options.
*/
if (flags & ~(unsigned long)PTRACE_O_MASK)
- goto out;
+ return -EIO;
+
retval = check_ptrace_options(flags);
if (retval)
return retval;
@@ -414,88 +431,54 @@ static int ptrace_attach(struct task_struct *task, long request,
audit_ptrace(task);
- retval = -EPERM;
if (unlikely(task->flags & PF_KTHREAD))
- goto out;
+ return -EPERM;
if (same_thread_group(task, current))
- goto out;
+ return -EPERM;
/*
* Protect exec's credential calculations against our interference;
* SUID, SGID and LSM creds get determined differently
* under ptrace.
*/
- retval = -ERESTARTNOINTR;
- if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
- goto out;
+ scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR,
+ &task->signal->cred_guard_mutex) {
- task_lock(task);
- retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
- task_unlock(task);
- if (retval)
- goto unlock_creds;
+ scoped_guard (task_lock, task) {
+ retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+ if (retval)
+ return retval;
+ }
- write_lock_irq(&tasklist_lock);
- retval = -EPERM;
- if (unlikely(task->exit_state))
- goto unlock_tasklist;
- if (task->ptrace)
- goto unlock_tasklist;
+ scoped_guard (write_lock_irq, &tasklist_lock) {
+ if (unlikely(task->exit_state))
+ return -EPERM;
+ if (task->ptrace)
+ return -EPERM;
- task->ptrace = flags;
+ task->ptrace = flags;
- ptrace_link(task, current);
+ ptrace_link(task, current);
- /* SEIZE doesn't trap tracee on attach */
- if (!seize)
- send_sig_info(SIGSTOP, SEND_SIG_PRIV, task);
+ /* SEIZE doesn't trap tracee on attach */
+ if (!seize)
+ send_sig_info(SIGSTOP, SEND_SIG_PRIV, task);
- spin_lock(&task->sighand->siglock);
+ ptrace_set_stopped(task);
+ }
+ }
/*
- * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
- * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
- * will be cleared if the child completes the transition or any
- * event which clears the group stop states happens. We'll wait
- * for the transition to complete before returning from this
- * function.
- *
- * This hides STOPPED -> RUNNING -> TRACED transition from the
- * attaching thread but a different thread in the same group can
- * still observe the transient RUNNING state. IOW, if another
- * thread's WNOHANG wait(2) on the stopped tracee races against
- * ATTACH, the wait(2) may fail due to the transient RUNNING.
- *
- * The following task_is_stopped() test is safe as both transitions
- * in and out of STOPPED are protected by siglock.
+ * We do not bother to change retval or clear JOBCTL_TRAPPING
+ * if wait_on_bit() was interrupted by SIGKILL. The tracer will
+ * not return to user-mode, it will exit and clear this bit in
+ * __ptrace_unlink() if it wasn't already cleared by the tracee;
+ * and until then nobody can ptrace this task.
*/
- if (task_is_stopped(task) &&
- task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) {
- task->jobctl &= ~JOBCTL_STOPPED;
- signal_wake_up_state(task, __TASK_STOPPED);
- }
-
- spin_unlock(&task->sighand->siglock);
-
- retval = 0;
-unlock_tasklist:
- write_unlock_irq(&tasklist_lock);
-unlock_creds:
- mutex_unlock(&task->signal->cred_guard_mutex);
-out:
- if (!retval) {
- /*
- * We do not bother to change retval or clear JOBCTL_TRAPPING
- * if wait_on_bit() was interrupted by SIGKILL. The tracer will
- * not return to user-mode, it will exit and clear this bit in
- * __ptrace_unlink() if it wasn't already cleared by the tracee;
- * and until then nobody can ptrace this task.
- */
- wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
- proc_ptrace_connector(task, PTRACE_ATTACH);
- }
+ wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
+ proc_ptrace_connector(task, PTRACE_ATTACH);
- return retval;
+ return 0;
}
/**
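
The ptrace_attach() rewrite is a conversion to the scope-based cleanup helpers from <linux/cleanup.h>: guard() holds a lock until the enclosing function returns, scoped_guard() holds it for one attached block, and scoped_cond_guard() runs a fail-statement when the conditional lock cannot be taken. A condensed kernel-style sketch of the three forms (locks a and b are hypothetical):

#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>

static DEFINE_MUTEX(a);
static DEFINE_SPINLOCK(b);

static int demo(void)
{
	scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR, &a) {
		/* runs only if mutex_lock_interruptible(&a) succeeded;
		 * on failure the fail-statement executes instead */
		scoped_guard (spinlock, &b) {
			/* &b is held for this block only */
		}
	}	/* &a released here */
	return 0;
}

static void demo_guard(void)
{
	guard(mutex)(&a);	/* released when demo_guard() returns */
	/* ... */
}
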
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 395a0ea3c7a8..22c16e2564cc 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -59,6 +59,14 @@ struct sys_off_handler {
};
/*
+ * This variable is used to indicate if a halt was initiated instead of a
+ * reboot when the reboot call was invoked with LINUX_REBOOT_CMD_POWER_OFF, but
+ * the system cannot be powered off. This allows kernel_halt() to notify users
+ * of that.
+ */
+static bool poweroff_fallback_to_halt;
+
+/*
* Temporary stub that prevents linkage failure while we're in process
* of removing all uses of legacy pm_power_off() around the kernel.
*/
@@ -297,7 +305,10 @@ void kernel_halt(void)
kernel_shutdown_prepare(SYSTEM_HALT);
migrate_to_reboot_cpu();
syscore_shutdown();
- pr_emerg("System halted\n");
+ if (poweroff_fallback_to_halt)
+ pr_emerg("Power off not available: System halted instead\n");
+ else
+ pr_emerg("System halted\n");
kmsg_dump(KMSG_DUMP_SHUTDOWN);
machine_halt();
}
@@ -732,8 +743,10 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
/* Instead of trying to make the power_off code look like
* halt when pm_power_off is not set do it the easy way.
*/
- if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off())
+ if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off()) {
+ poweroff_fallback_to_halt = true;
cmd = LINUX_REBOOT_CMD_HALT;
+ }
mutex_lock(&system_transition_mutex);
switch (cmd) {
@@ -957,21 +970,24 @@ static void hw_failure_emergency_poweroff(int poweroff_delay_ms)
}
/**
- * hw_protection_shutdown - Trigger an emergency system poweroff
+ * __hw_protection_shutdown - Trigger an emergency system shutdown or reboot
*
- * @reason: Reason of emergency shutdown to be printed.
- * @ms_until_forced: Time to wait for orderly shutdown before tiggering a
- * forced shudown. Negative value disables the forced
- * shutdown.
+ * @reason: Reason of emergency shutdown or reboot to be printed.
+ * @ms_until_forced: Time to wait for orderly shutdown or reboot before
+ * triggering it. Negative value disables the forced
+ * shutdown or reboot.
+ * @shutdown: If true, indicates that a shutdown will happen
+ * after the critical temperature is reached.
+ * If false, indicates that a reboot will happen
+ * after the critical temperature is reached.
*
- * Initiate an emergency system shutdown in order to protect hardware from
- * further damage. Usage examples include a thermal protection or a voltage or
- * current regulator failures.
- * NOTE: The request is ignored if protection shutdown is already pending even
- * if the previous request has given a large timeout for forced shutdown.
- * Can be called from any context.
+ * Initiate an emergency system shutdown or reboot in order to protect
+ * hardware from further damage. Usage examples include thermal protection.
+ * NOTE: The request is ignored if protection shutdown or reboot is already
+ * pending even if the previous request has given a large timeout for forced
+ * shutdown/reboot.
*/
-void hw_protection_shutdown(const char *reason, int ms_until_forced)
+void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown)
{
static atomic_t allow_proceed = ATOMIC_INIT(1);
@@ -986,9 +1002,12 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced)
* orderly_poweroff failure
*/
hw_failure_emergency_poweroff(ms_until_forced);
- orderly_poweroff(true);
+ if (shutdown)
+ orderly_poweroff(true);
+ else
+ orderly_reboot();
}
-EXPORT_SYMBOL_GPL(hw_protection_shutdown);
+EXPORT_SYMBOL_GPL(__hw_protection_shutdown);
static int __init reboot_setup(char *str)
{
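
With the new bool parameter, the old hw_protection_shutdown() entry point presumably becomes a thin wrapper beside a new hw_protection_reboot(); the sketch below shows the shape such header wrappers would take (an assumption about include/linux/reboot.h, not quoted from it):

/* Hypothetical header-side wrappers around the renamed export: */
static inline void hw_protection_shutdown(const char *reason, int ms_until_forced)
{
	__hw_protection_shutdown(reason, ms_until_forced, true);
}

static inline void hw_protection_reboot(const char *reason, int ms_until_forced)
{
	__hw_protection_shutdown(reason, ms_until_forced, false);
}
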
diff --git a/kernel/relay.c b/kernel/relay.c
index 83fe0325cde1..a8e90e98bf2c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1073,167 +1073,6 @@ static ssize_t relay_file_read(struct file *filp,
return written;
}
-static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
-{
- rbuf->bytes_consumed += bytes_consumed;
-
- if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) {
- relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1);
- rbuf->bytes_consumed %= rbuf->chan->subbuf_size;
- }
-}
-
-static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
-{
- struct rchan_buf *rbuf;
-
- rbuf = (struct rchan_buf *)page_private(buf->page);
- relay_consume_bytes(rbuf, buf->private);
-}
-
-static const struct pipe_buf_operations relay_pipe_buf_ops = {
- .release = relay_pipe_buf_release,
- .try_steal = generic_pipe_buf_try_steal,
- .get = generic_pipe_buf_get,
-};
-
-static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
-{
-}
-
-/*
- * subbuf_splice_actor - splice up to one subbuf's worth of data
- */
-static ssize_t subbuf_splice_actor(struct file *in,
- loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len,
- unsigned int flags,
- int *nonpad_ret)
-{
- unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
- struct rchan_buf *rbuf = in->private_data;
- unsigned int subbuf_size = rbuf->chan->subbuf_size;
- uint64_t pos = (uint64_t) *ppos;
- uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size;
- size_t read_start = (size_t) do_div(pos, alloc_size);
- size_t read_subbuf = read_start / subbuf_size;
- size_t padding = rbuf->padding[read_subbuf];
- size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
- struct page *pages[PIPE_DEF_BUFFERS];
- struct partial_page partial[PIPE_DEF_BUFFERS];
- struct splice_pipe_desc spd = {
- .pages = pages,
- .nr_pages = 0,
- .nr_pages_max = PIPE_DEF_BUFFERS,
- .partial = partial,
- .ops = &relay_pipe_buf_ops,
- .spd_release = relay_page_release,
- };
- ssize_t ret;
-
- if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
- return 0;
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
-
- /*
- * Adjust read len, if longer than what is available
- */
- if (len > (subbuf_size - read_start % subbuf_size))
- len = subbuf_size - read_start % subbuf_size;
-
- subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
- pidx = (read_start / PAGE_SIZE) % subbuf_pages;
- poff = read_start & ~PAGE_MASK;
- nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max);
-
- for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
- unsigned int this_len, this_end, private;
- unsigned int cur_pos = read_start + total_len;
-
- if (!len)
- break;
-
- this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
- private = this_len;
-
- spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
- spd.partial[spd.nr_pages].offset = poff;
-
- this_end = cur_pos + this_len;
- if (this_end >= nonpad_end) {
- this_len = nonpad_end - cur_pos;
- private = this_len + padding;
- }
- spd.partial[spd.nr_pages].len = this_len;
- spd.partial[spd.nr_pages].private = private;
-
- len -= this_len;
- total_len += this_len;
- poff = 0;
- pidx = (pidx + 1) % subbuf_pages;
-
- if (this_end >= nonpad_end) {
- spd.nr_pages++;
- break;
- }
- }
-
- ret = 0;
- if (!spd.nr_pages)
- goto out;
-
- ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
- if (ret < 0 || ret < total_len)
- goto out;
-
- if (read_start + ret == nonpad_end)
- ret += padding;
-
-out:
- splice_shrink_spd(&spd);
- return ret;
-}
-
-static ssize_t relay_file_splice_read(struct file *in,
- loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len,
- unsigned int flags)
-{
- ssize_t spliced;
- int ret;
- int nonpad_ret = 0;
-
- ret = 0;
- spliced = 0;
-
- while (len && !spliced) {
- ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
- if (ret < 0)
- break;
- else if (!ret) {
- if (flags & SPLICE_F_NONBLOCK)
- ret = -EAGAIN;
- break;
- }
-
- *ppos += ret;
- if (ret > len)
- len = 0;
- else
- len -= ret;
- spliced += nonpad_ret;
- nonpad_ret = 0;
- }
-
- if (spliced)
- return spliced;
-
- return ret;
-}
const struct file_operations relay_file_operations = {
.open = relay_file_open,
@@ -1242,6 +1081,5 @@ const struct file_operations relay_file_operations = {
.read = relay_file_read,
.llseek = no_llseek,
.release = relay_file_release,
- .splice_read = relay_file_splice_read,
};
EXPORT_SYMBOL_GPL(relay_file_operations);
diff --git a/kernel/resource.c b/kernel/resource.c
index 866ef3663a0b..fcbca39dbc45 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -27,6 +27,8 @@
#include <linux/mount.h>
#include <linux/resource_ext.h>
#include <uapi/linux/magic.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
#include <asm/io.h>
@@ -430,6 +432,61 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
}
/*
+ * This function, being a variant of walk_system_ram_res(), calls the @func
+ * callback against all memory ranges of type System RAM which are marked as
+ * IORESOURCE_SYSTEM_RAM and IORESOURCE_BUSY in reverse order, i.e., from
+ * higher to lower.
+ */
+int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *))
+{
+ struct resource res, *rams;
+ int rams_size = 16, i;
+ unsigned long flags;
+ int ret = -1;
+
+ /* create a list */
+ rams = kvcalloc(rams_size, sizeof(struct resource), GFP_KERNEL);
+ if (!rams)
+ return ret;
+
+ flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ i = 0;
+ while ((start < end) &&
+ (!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res))) {
+ if (i >= rams_size) {
+ /* re-alloc */
+ struct resource *rams_new;
+
+ rams_new = kvrealloc(rams, rams_size * sizeof(struct resource),
+ (rams_size + 16) * sizeof(struct resource),
+ GFP_KERNEL);
+ if (!rams_new)
+ goto out;
+
+ rams = rams_new;
+ rams_size += 16;
+ }
+
+ rams[i].start = res.start;
+ rams[i++].end = res.end;
+
+ start = res.end + 1;
+ }
+
+ /* go reverse */
+ for (i--; i >= 0; i--) {
+ ret = (*func)(&rams[i], arg);
+ if (ret)
+ break;
+ }
+
+out:
+ kvfree(rams);
+ return ret;
+}
+
+/*
* This function calls the @func callback against all memory ranges, which
* are ranges marked as IORESOURCE_MEM and IORESOURCE_BUSY.
*/
@@ -1844,8 +1901,8 @@ get_free_mem_region(struct device *dev, struct resource *base,
write_lock(&resource_lock);
for (addr = gfr_start(base, size, align, flags);
- gfr_continue(base, addr, size, flags);
- addr = gfr_next(addr, size, flags)) {
+ gfr_continue(base, addr, align, flags);
+ addr = gfr_next(addr, align, flags)) {
if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) !=
REGION_DISJOINT)
continue;
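
walk_system_ram_res_rev() buffers the matching ranges and then replays them from the highest address down, which is what the kexec_walk_resources() hunk earlier relies on for kbuf->top_down placements. A kernel-style usage sketch with a hypothetical callback (as with walk_system_ram_res(), a non-zero return stops the walk):

#include <linux/ioport.h>

/* Hypothetical callback: stop at the first (i.e. highest) range that
 * can hold `size` bytes. */
static int find_hole(struct resource *res, void *arg)
{
	resource_size_t size = *(resource_size_t *)arg;

	return resource_size(res) >= size;	/* non-zero stops the walk */
}

static int locate_top_down(resource_size_t size)
{
	/* Ranges are delivered from higher to lower addresses. */
	return walk_system_ram_res_rev(0, ULONG_MAX, &size, find_hole);
}
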
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a708d225c28e..db4be4921e7f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1131,6 +1131,28 @@ static void wake_up_idle_cpu(int cpu)
if (cpu == smp_processor_id())
return;
+ /*
+ * Set TIF_NEED_RESCHED and send an IPI if in the non-polling
+ * part of the idle loop. This forces an exit from the idle loop
+ * and a round trip to schedule(). Now this could be optimized
+ * because a simple new idle loop iteration is enough to
+ * re-evaluate the next tick. Provided some re-ordering of tick
+ * nohz functions that would need to follow TIF_NR_POLLING
+ * clearing:
+ *
+ * - On most archs, a simple fetch_or on ti::flags with a
+ * "0" value would be enough to know if an IPI needs to be sent.
+ *
+ * - x86 needs to perform a last need_resched() check between
+ * monitor and mwait which doesn't take timers into account.
+ * There a dedicated TIF_TIMER flag would be required to
+ * fetch_or here and be checked along with TIF_NEED_RESCHED
+ * before mwait().
+ *
+ * However, remote timer enqueue is not such a frequent event
+ * and testing of the above solutions didn't appear to report
+ * much benefit.
+ */
if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
else
@@ -2124,12 +2146,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
enqueue_task(rq, p, flags);
- p->on_rq = TASK_ON_RQ_QUEUED;
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
}
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
+ WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
dequeue_task(rq, p, flags);
}
@@ -3795,6 +3819,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
rq->idle_stamp = 0;
}
#endif
+
+ p->dl_server = NULL;
}
/*
@@ -4509,10 +4535,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
memset(&p->stats, 0, sizeof(p->stats));
#endif
- RB_CLEAR_NODE(&p->dl.rb_node);
- init_dl_task_timer(&p->dl);
- init_dl_inactive_task_timer(&p->dl);
- __dl_clear_params(p);
+ init_dl_entity(&p->dl);
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
@@ -6004,12 +6027,27 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
p = pick_next_task_idle(rq);
}
+ /*
+ * This is the fast path; it cannot be a DL server pick;
+ * therefore even if @p == @prev, ->dl_server must be NULL.
+ */
+ if (p->dl_server)
+ p->dl_server = NULL;
+
return p;
}
restart:
put_prev_task_balance(rq, prev, rf);
+ /*
+ * We've updated @prev and no longer need the server link, clear it.
+ * Must be done before ->pick_next_task() because that can (re)set
+ * ->dl_server.
+ */
+ if (prev->dl_server)
+ prev->dl_server = NULL;
+
for_each_class(class) {
p = class->pick_next_task(rq);
if (p)
@@ -7429,18 +7467,13 @@ int sched_core_idle_cpu(int cpu)
* required to meet deadlines.
*/
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- enum cpu_util_type type,
- struct task_struct *p)
+ unsigned long *min,
+ unsigned long *max)
{
- unsigned long dl_util, util, irq, max;
+ unsigned long util, irq, scale;
struct rq *rq = cpu_rq(cpu);
- max = arch_scale_cpu_capacity(cpu);
-
- if (!uclamp_is_used() &&
- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
- return max;
- }
+ scale = arch_scale_cpu_capacity(cpu);
/*
* Early check to see if IRQ/steal time saturates the CPU, can be
@@ -7448,45 +7481,49 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
* update_irq_load_avg().
*/
irq = cpu_util_irq(rq);
- if (unlikely(irq >= max))
- return max;
+ if (unlikely(irq >= scale)) {
+ if (min)
+ *min = scale;
+ if (max)
+ *max = scale;
+ return scale;
+ }
+
+ if (min) {
+ /*
+ * The minimum utilization returns the highest level between:
+ * - the computed DL bandwidth needed with the IRQ pressure which
+ * steals time to the deadline task.
+ * - The minimum performance requirement for CFS and/or RT.
+ */
+ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+ /*
+ * When an RT task is runnable and uclamp is not used, we must
+ * ensure that the task will run at maximum compute capacity.
+ */
+ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+ *min = max(*min, scale);
+ }
/*
* Because the time spend on RT/DL tasks is visible as 'lost' time to
* CFS tasks and we use the same metric to track the effective
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
- *
- * CFS and RT utilization can be boosted or capped, depending on
- * utilization clamp constraints requested by currently RUNNABLE
- * tasks.
- * When there are no CFS RUNNABLE tasks, clamps are released and
- * frequency will be gracefully reduced with the utilization decay.
*/
util = util_cfs + cpu_util_rt(rq);
- if (type == FREQUENCY_UTIL)
- util = uclamp_rq_util_with(rq, util, p);
-
- dl_util = cpu_util_dl(rq);
+ util += cpu_util_dl(rq);
/*
- * For frequency selection we do not make cpu_util_dl() a permanent part
- * of this sum because we want to use cpu_bw_dl() later on, but we need
- * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
- * that we select f_max when there is no idle time.
- *
- * NOTE: numerical errors or stop class might cause us to not quite hit
- * saturation when we should -- something for later.
+ * The maximum hint is a soft bandwidth requirement, which can be lower
+ * than the actual utilization because of uclamp_max requirements.
*/
- if (util + dl_util >= max)
- return max;
+ if (max)
+ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
- /*
- * OTOH, for energy computation we need the estimated running time, so
- * include util_dl and ignore dl_bw.
- */
- if (type == ENERGY_UTIL)
- util += dl_util;
+ if (util >= scale)
+ return scale;
/*
* There is still idle time; further improve the number by using the
@@ -7497,28 +7534,15 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
* U' = irq + --------- * U
* max
*/
- util = scale_irq_capacity(util, irq, max);
+ util = scale_irq_capacity(util, irq, scale);
util += irq;
- /*
- * Bandwidth required by DEADLINE must always be granted while, for
- * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
- * to gracefully reduce the frequency when no tasks show up for longer
- * periods of time.
- *
- * Ideally we would like to set bw_dl as min/guaranteed freq and util +
- * bw_dl as requested freq. However, cpufreq is not yet ready for such
- * an interface. So, we only do the latter for now.
- */
- if (type == FREQUENCY_UTIL)
- util += cpu_bw_dl(rq);
-
- return min(max, util);
+ return min(scale, util);
}
unsigned long sched_cpu_util(int cpu)
{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
}
#endif /* CONFIG_SMP */
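
After this change the FREQUENCY_UTIL/ENERGY_UTIL selector is gone: callers that care about the DL/IRQ floor and the uclamp ceiling pass min/max out-pointers, while callers that only want the aggregate utilization pass NULL for both, as sched_cpu_util() now does. A hypothetical caller showing both styles:

static unsigned long demo_cpu_util(int cpu)
{
	unsigned long min, max, util;

	/* Frequency-selection style: also fetch the bandwidth hints. */
	util = effective_cpu_util(cpu, cpu_util_cfs(cpu), &min, &max);
	(void)util; (void)min; (void)max;

	/* Energy-estimation style: only the utilization is needed. */
	return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
}
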
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 5888176354e2..95c3c097083e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -47,7 +47,7 @@ struct sugov_cpu {
u64 last_update;
unsigned long util;
- unsigned long bw_dl;
+ unsigned long bw_min;
/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
@@ -115,6 +115,28 @@ static void sugov_deferred_update(struct sugov_policy *sg_policy)
}
/**
+ * get_capacity_ref_freq - get the reference frequency that has been used to
+ * correlate frequency and compute capacity for a given cpufreq policy. We use
+ * the CPU managing it for the arch_scale_freq_ref() call in the function.
+ * @policy: the cpufreq policy of the CPU in question.
+ *
+ * Return: the reference CPU frequency to compute a capacity.
+ */
+static __always_inline
+unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy)
+{
+ unsigned int freq = arch_scale_freq_ref(policy->cpu);
+
+ if (freq)
+ return freq;
+
+ if (arch_scale_freq_invariant())
+ return policy->cpuinfo.max_freq;
+
+ return policy->cur;
+}
+
+/**
* get_next_freq - Compute a new frequency for a given cpufreq policy.
* @sg_policy: schedutil policy object to compute the new frequency for.
* @util: Current CPU utilization.
@@ -140,10 +162,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
unsigned long util, unsigned long max)
{
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned int freq = arch_scale_freq_invariant() ?
- policy->cpuinfo.max_freq : policy->cur;
+ unsigned int freq;
- util = map_util_perf(util);
+ freq = get_capacity_ref_freq(policy);
freq = map_util_freq(util, freq, max);
if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
@@ -153,14 +174,31 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
}
-static void sugov_get_util(struct sugov_cpu *sg_cpu)
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+ unsigned long min,
+ unsigned long max)
+{
+ /* Add dvfs headroom to actual utilization */
+ actual = map_util_perf(actual);
+ /* Actually we don't need to target the max performance */
+ if (actual < max)
+ max = actual;
+
+ /*
+ * Ensure at least minimum performance while providing more compute
+ * capacity when possible.
+ */
+ return max(min, max);
+}
+
+static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
{
- unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu);
- struct rq *rq = cpu_rq(sg_cpu->cpu);
+ unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
- sg_cpu->bw_dl = cpu_bw_dl(rq);
- sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util,
- FREQUENCY_UTIL, NULL);
+ util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
+ util = max(util, boost);
+ sg_cpu->bw_min = min;
+ sg_cpu->util = sugov_effective_cpu_perf(sg_cpu->cpu, util, min, max);
}
/**
@@ -251,18 +289,16 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* This mechanism is designed to boost tasks that frequently wait on IO, while
* being more conservative on tasks which do sporadic IO operations.
*/
-static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
unsigned long max_cap)
{
- unsigned long boost;
-
/* No boost currently required */
if (!sg_cpu->iowait_boost)
- return;
+ return 0;
/* Reset boost if the CPU appears to have been idle enough */
if (sugov_iowait_reset(sg_cpu, time, false))
- return;
+ return 0;
if (!sg_cpu->iowait_boost_pending) {
/*
@@ -271,7 +307,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
sg_cpu->iowait_boost >>= 1;
if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
sg_cpu->iowait_boost = 0;
- return;
+ return 0;
}
}
@@ -281,10 +317,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
* sg_cpu->util is already in capacity scale; convert iowait_boost
* into the same scale so we can compare.
*/
- boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
- boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
- if (sg_cpu->util < boost)
- sg_cpu->util = boost;
+ return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -306,7 +339,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
*/
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
- if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
+ if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min)
sg_cpu->sg_policy->limits_changed = true;
}
@@ -314,6 +347,8 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
u64 time, unsigned long max_cap,
unsigned int flags)
{
+ unsigned long boost;
+
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
@@ -322,8 +357,8 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
return false;
- sugov_get_util(sg_cpu);
- sugov_iowait_apply(sg_cpu, time, max_cap);
+ boost = sugov_iowait_apply(sg_cpu, time, max_cap);
+ sugov_get_util(sg_cpu, boost);
return true;
}
@@ -407,8 +442,8 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
sg_cpu->util = prev_util;
- cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
- map_util_perf(sg_cpu->util), max_cap);
+ cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
+ sg_cpu->util, max_cap);
sg_cpu->sg_policy->last_freq_update_time = time;
}
@@ -424,9 +459,10 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
+ unsigned long boost;
- sugov_get_util(j_sg_cpu);
- sugov_iowait_apply(j_sg_cpu, time, max_cap);
+ boost = sugov_iowait_apply(j_sg_cpu, time, max_cap);
+ sugov_get_util(j_sg_cpu, boost);
util = max(j_sg_cpu->util, util);
}
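
sugov_effective_cpu_perf() reduces to clamp-like arithmetic: apply the DVFS headroom, cap by the uclamp-derived maximum, then lift to the DL/IRQ minimum. A runnable recomputation (map_util_perf() reproduced here as the 1.25x headroom it applies in the scheduler headers):

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

/* Same 1.25x headroom the kernel's map_util_perf() applies. */
static unsigned long map_util_perf(unsigned long util)
{
	return util + (util >> 2);
}

/* Mirrors the logic of sugov_effective_cpu_perf() above. */
static unsigned long effective_perf(unsigned long actual,
				    unsigned long min, unsigned long max)
{
	return max_ul(min, min_ul(map_util_perf(actual), max));
}

int main(void)
{
	/* util 400 -> 500 with headroom, capped by an uclamp max of 448,
	 * then raised to a DL/IRQ floor of 600. */
	printf("%lu\n", effective_perf(400, 600, 448));
	return 0;
}
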
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b28114478b82..a04a436af8cc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -54,8 +54,14 @@ static int __init sched_dl_sysctl_init(void)
late_initcall(sched_dl_sysctl_init);
#endif
+static bool dl_server(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_server;
+}
+
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
{
+ BUG_ON(dl_server(dl_se));
return container_of(dl_se, struct task_struct, dl);
}
@@ -64,12 +70,19 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
return container_of(dl_rq, struct rq, dl);
}
-static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+static inline struct rq *rq_of_dl_se(struct sched_dl_entity *dl_se)
{
- struct task_struct *p = dl_task_of(dl_se);
- struct rq *rq = task_rq(p);
+ struct rq *rq = dl_se->rq;
+
+ if (!dl_server(dl_se))
+ rq = task_rq(dl_task_of(dl_se));
+
+ return rq;
+}
- return &rq->dl;
+static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+{
+ return &rq_of_dl_se(dl_se)->dl;
}
static inline int on_dl_rq(struct sched_dl_entity *dl_se)
@@ -335,6 +348,8 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
__add_rq_bw(new_bw, &rq->dl);
}
+static void __dl_clear_params(struct sched_dl_entity *dl_se);
+
/*
* The utilization of a task cannot be immediately removed from
* the rq active utilization (running_bw) when the task blocks.
@@ -389,12 +404,11 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
* up, and checks if the task is still in the "ACTIVE non contending"
* state or not (in the second case, it updates running_bw).
*/
-static void task_non_contending(struct task_struct *p)
+static void task_non_contending(struct sched_dl_entity *dl_se)
{
- struct sched_dl_entity *dl_se = &p->dl;
struct hrtimer *timer = &dl_se->inactive_timer;
- struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq);
+ struct rq *rq = rq_of_dl_se(dl_se);
+ struct dl_rq *dl_rq = &rq->dl;
s64 zerolag_time;
/*
@@ -424,24 +438,33 @@ static void task_non_contending(struct task_struct *p)
* utilization now, instead of starting a timer
*/
if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
- if (dl_task(p))
+ if (dl_server(dl_se)) {
sub_running_bw(dl_se, dl_rq);
- if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
-
- if (READ_ONCE(p->__state) == TASK_DEAD)
- sub_rq_bw(&p->dl, &rq->dl);
- raw_spin_lock(&dl_b->lock);
- __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
- raw_spin_unlock(&dl_b->lock);
- __dl_clear_params(p);
+ } else {
+ struct task_struct *p = dl_task_of(dl_se);
+
+ if (dl_task(p))
+ sub_running_bw(dl_se, dl_rq);
+
+ if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+ if (READ_ONCE(p->__state) == TASK_DEAD)
+ sub_rq_bw(dl_se, &rq->dl);
+ raw_spin_lock(&dl_b->lock);
+ __dl_sub(dl_b, dl_se->dl_bw, dl_bw_cpus(task_cpu(p)));
+ raw_spin_unlock(&dl_b->lock);
+ __dl_clear_params(dl_se);
+ }
}
return;
}
dl_se->dl_non_contending = 1;
- get_task_struct(p);
+ if (!dl_server(dl_se))
+ get_task_struct(dl_task_of(dl_se));
+
hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
}
@@ -468,8 +491,10 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
* will not touch the rq's active utilization,
* so we are still safe.
*/
- if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
- put_task_struct(dl_task_of(dl_se));
+ if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
+ if (!dl_server(dl_se))
+ put_task_struct(dl_task_of(dl_se));
+ }
} else {
/*
* Since "dl_non_contending" is not set, the
@@ -482,10 +507,8 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
}
}
-static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
+static inline int is_leftmost(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- struct sched_dl_entity *dl_se = &p->dl;
-
return rb_first_cached(&dl_rq->root) == &dl_se->rb_node;
}
@@ -737,8 +760,10 @@ static inline void deadline_queue_pull_task(struct rq *rq)
}
#endif /* CONFIG_SMP */
+static void
+enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags);
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags);
static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags);
static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
@@ -986,8 +1011,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
*/
static void update_dl_entity(struct sched_dl_entity *dl_se)
{
- struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq);
+ struct rq *rq = rq_of_dl_se(dl_se);
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
dl_entity_overflow(dl_se, rq_clock(rq))) {
@@ -1018,11 +1042,11 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
* actually started or not (i.e., the replenishment instant is in
* the future or in the past).
*/
-static int start_dl_timer(struct task_struct *p)
+static int start_dl_timer(struct sched_dl_entity *dl_se)
{
- struct sched_dl_entity *dl_se = &p->dl;
struct hrtimer *timer = &dl_se->dl_timer;
- struct rq *rq = task_rq(p);
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
ktime_t now, act;
s64 delta;
@@ -1056,13 +1080,33 @@ static int start_dl_timer(struct task_struct *p)
* and observe our state.
*/
if (!hrtimer_is_queued(timer)) {
- get_task_struct(p);
+ if (!dl_server(dl_se))
+ get_task_struct(dl_task_of(dl_se));
hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
}
return 1;
}
+static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Queueing this task back might have overloaded rq, check if we need
+ * to kick someone away.
+ */
+ if (has_pushable_dl_tasks(rq)) {
+ /*
+ * Nothing relies on rq->lock after this, so it's safe to drop
+ * rq->lock.
+ */
+ rq_unpin_lock(rq, rf);
+ push_dl_task(rq);
+ rq_repin_lock(rq, rf);
+ }
+#endif
+}
+
/*
* This is the bandwidth enforcement timer callback. If here, we know
* a task is not on its dl_rq, since the fact that the timer was running
@@ -1081,10 +1125,34 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct sched_dl_entity *dl_se = container_of(timer,
struct sched_dl_entity,
dl_timer);
- struct task_struct *p = dl_task_of(dl_se);
+ struct task_struct *p;
struct rq_flags rf;
struct rq *rq;
+ if (dl_server(dl_se)) {
+ struct rq *rq = rq_of_dl_se(dl_se);
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+ if (dl_se->dl_throttled) {
+ sched_clock_tick();
+ update_rq_clock(rq);
+
+ if (dl_se->server_has_tasks(dl_se)) {
+ enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+ resched_curr(rq);
+ __push_dl_task(rq, &rf);
+ } else {
+ replenish_dl_entity(dl_se);
+ }
+
+ }
+ rq_unlock(rq, &rf);
+
+ return HRTIMER_NORESTART;
+ }
+
+ p = dl_task_of(dl_se);
rq = task_rq_lock(p, &rf);
/*
@@ -1155,21 +1223,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
else
resched_curr(rq);
-#ifdef CONFIG_SMP
- /*
- * Queueing this task back might have overloaded rq, check if we need
- * to kick someone away.
- */
- if (has_pushable_dl_tasks(rq)) {
- /*
- * Nothing relies on rq->lock after this, so its safe to drop
- * rq->lock.
- */
- rq_unpin_lock(rq, &rf);
- push_dl_task(rq);
- rq_repin_lock(rq, &rf);
- }
-#endif
+ __push_dl_task(rq, &rf);
unlock:
task_rq_unlock(rq, p, &rf);
@@ -1183,7 +1237,7 @@ unlock:
return HRTIMER_NORESTART;
}
-void init_dl_task_timer(struct sched_dl_entity *dl_se)
+static void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
@@ -1211,12 +1265,11 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
*/
static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
{
- struct task_struct *p = dl_task_of(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+ struct rq *rq = rq_of_dl_se(dl_se);
if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
- if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
+ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se)))
return;
dl_se->dl_throttled = 1;
if (dl_se->runtime > 0)
@@ -1267,44 +1320,19 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
return (delta * u_act) >> BW_SHIFT;
}
-/*
- * Update the current task's runtime statistics (provided it is still
- * a -deadline task and has not been removed from the dl_rq).
- */
-static void update_curr_dl(struct rq *rq)
+static inline void
+update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
+ int flags);
+static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec)
{
- struct task_struct *curr = rq->curr;
- struct sched_dl_entity *dl_se = &curr->dl;
- u64 delta_exec, scaled_delta_exec;
- int cpu = cpu_of(rq);
- u64 now;
-
- if (!dl_task(curr) || !on_dl_rq(dl_se))
- return;
+ s64 scaled_delta_exec;
- /*
- * Consumed budget is computed considering the time as
- * observed by schedulable tasks (excluding time spent
- * in hardirq context, etc.). Deadlines are instead
- * computed using hard walltime. This seems to be the more
- * natural solution, but the full ramifications of this
- * approach need further study.
- */
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0)) {
+ if (unlikely(delta_exec <= 0)) {
if (unlikely(dl_se->dl_yielded))
goto throttle;
return;
}
- schedstat_set(curr->stats.exec_max,
- max(curr->stats.exec_max, delta_exec));
-
- trace_sched_stat_runtime(curr, delta_exec, 0);
-
- update_current_exec_runtime(curr, now, delta_exec);
-
if (dl_entity_is_special(dl_se))
return;
@@ -1316,10 +1344,9 @@ static void update_curr_dl(struct rq *rq)
* according to current frequency and CPU maximum capacity.
*/
if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
- scaled_delta_exec = grub_reclaim(delta_exec,
- rq,
- &curr->dl);
+ scaled_delta_exec = grub_reclaim(delta_exec, rq, dl_se);
} else {
+ int cpu = cpu_of(rq);
unsigned long scale_freq = arch_scale_freq_capacity(cpu);
unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
@@ -1338,11 +1365,20 @@ throttle:
(dl_se->flags & SCHED_FLAG_DL_OVERRUN))
dl_se->dl_overrun = 1;
- __dequeue_task_dl(rq, curr, 0);
- if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
- enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
+ dequeue_dl_entity(dl_se, 0);
+ if (!dl_server(dl_se)) {
+ update_stats_dequeue_dl(&rq->dl, dl_se, 0);
+ dequeue_pushable_dl_task(rq, dl_task_of(dl_se));
+ }
+
+ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) {
+ if (dl_server(dl_se))
+ enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
+ else
+ enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH);
+ }
- if (!is_leftmost(curr, &rq->dl))
+ if (!is_leftmost(dl_se, &rq->dl))
resched_curr(rq);
}
@@ -1372,20 +1408,82 @@ throttle:
}
}
+void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
+{
+ update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
+}
+
+void dl_server_start(struct sched_dl_entity *dl_se)
+{
+ if (!dl_server(dl_se)) {
+ dl_se->dl_server = 1;
+ setup_new_dl_entity(dl_se);
+ }
+ enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
+}
+
+void dl_server_stop(struct sched_dl_entity *dl_se)
+{
+ dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
+}
+
+void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+ dl_server_has_tasks_f has_tasks,
+ dl_server_pick_f pick)
+{
+ dl_se->rq = rq;
+ dl_se->server_has_tasks = has_tasks;
+ dl_se->server_pick = pick;
+}
+
+/*
+ * Update the current task's runtime statistics (provided it is still
+ * a -deadline task and has not been removed from the dl_rq).
+ */
+static void update_curr_dl(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct sched_dl_entity *dl_se = &curr->dl;
+ s64 delta_exec;
+
+ if (!dl_task(curr) || !on_dl_rq(dl_se))
+ return;
+
+ /*
+ * Consumed budget is computed considering the time as
+ * observed by schedulable tasks (excluding time spent
+ * in hardirq context, etc.). Deadlines are instead
+ * computed using hard walltime. This seems to be the more
+ * natural solution, but the full ramifications of this
+ * approach need further study.
+ */
+ delta_exec = update_curr_common(rq);
+ update_curr_dl_se(rq, dl_se, delta_exec);
+}
+
static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
{
struct sched_dl_entity *dl_se = container_of(timer,
struct sched_dl_entity,
inactive_timer);
- struct task_struct *p = dl_task_of(dl_se);
+ struct task_struct *p = NULL;
struct rq_flags rf;
struct rq *rq;
- rq = task_rq_lock(p, &rf);
+ if (!dl_server(dl_se)) {
+ p = dl_task_of(dl_se);
+ rq = task_rq_lock(p, &rf);
+ } else {
+ rq = dl_se->rq;
+ rq_lock(rq, &rf);
+ }
sched_clock_tick();
update_rq_clock(rq);
+ if (dl_server(dl_se))
+ goto no_task;
+
if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) {
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
@@ -1398,23 +1496,30 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
raw_spin_lock(&dl_b->lock);
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&dl_b->lock);
- __dl_clear_params(p);
+ __dl_clear_params(dl_se);
goto unlock;
}
+
+no_task:
if (dl_se->dl_non_contending == 0)
goto unlock;
sub_running_bw(dl_se, &rq->dl);
dl_se->dl_non_contending = 0;
unlock:
- task_rq_unlock(rq, p, &rf);
- put_task_struct(p);
+
+ if (!dl_server(dl_se)) {
+ task_rq_unlock(rq, p, &rf);
+ put_task_struct(p);
+ } else {
+ rq_unlock(rq, &rf);
+ }
return HRTIMER_NORESTART;
}
-void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
+static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->inactive_timer;
@@ -1472,10 +1577,8 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
static inline
void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- int prio = dl_task_of(dl_se)->prio;
u64 deadline = dl_se->deadline;
- WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
add_nr_running(rq_of_dl_rq(dl_rq), 1);
@@ -1485,9 +1588,6 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
static inline
void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- int prio = dl_task_of(dl_se)->prio;
-
- WARN_ON(!dl_prio(prio));
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
@@ -1609,6 +1709,41 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags);
/*
+ * Check if a constrained deadline task was activated
+ * after the deadline but before the next period.
+ * If that is the case, the task will be throttled and
+ * the replenishment timer will be set to the next period.
+ */
+ if (!dl_se->dl_throttled && !dl_is_implicit(dl_se))
+ dl_check_constrained_dl(dl_se);
+
+ if (flags & (ENQUEUE_RESTORE|ENQUEUE_MIGRATING)) {
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ add_rq_bw(dl_se, dl_rq);
+ add_running_bw(dl_se, dl_rq);
+ }
+
+ /*
+ * If p is throttled, we do not enqueue it. In fact, if it exhausted
+ * its budget it needs a replenishment and, since it now is on
+ * its rq, the bandwidth timer callback (which clearly has not
+ * run yet) will take care of this.
+ * However, the active utilization does not depend on the fact
+ * that the task is on the runqueue or not (but depends on the
+ * task's state - in GRUB parlance, "inactive" vs "active contending").
+ * In other words, even if a task is throttled its utilization must
+ * be counted in the active utilization; hence, we need to call
+ * add_running_bw().
+ */
+ if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
+ if (flags & ENQUEUE_WAKEUP)
+ task_contending(dl_se, flags);
+
+ return;
+ }
+
+ /*
* If this is a wakeup or a new instance, the scheduling
* parameters of the task might need updating. Otherwise,
* we want a replenishment of its runtime.
@@ -1619,17 +1754,35 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se);
} else if ((flags & ENQUEUE_RESTORE) &&
- dl_time_before(dl_se->deadline,
- rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
+ dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) {
setup_new_dl_entity(dl_se);
}
__enqueue_dl_entity(dl_se);
}
-static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
+static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags)
{
__dequeue_dl_entity(dl_se);
+
+ if (flags & (DEQUEUE_SAVE|DEQUEUE_MIGRATING)) {
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+ sub_running_bw(dl_se, dl_rq);
+ sub_rq_bw(dl_se, dl_rq);
+ }
+
+ /*
+ * This check allows to start the inactive timer (or to immediately
+ * decrease the active utilization, if needed) in two cases:
+ * when the task blocks and when it is terminating
+ * (p->state == TASK_DEAD). We can handle the two cases in the same
+ * way, because from GRUB's point of view the same thing is happening
+ * (the task moves from "active contending" to "active non contending"
+ * or "inactive")
+ */
+ if (flags & DEQUEUE_SLEEP)
+ task_non_contending(dl_se);
}
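
The enqueue/dequeue pair above keeps GRUB's active-utilization accounting decoupled from rb-tree occupancy: a throttled entity is never put back on the tree, yet its bandwidth must remain in running_bw until the inactive timer or a replenishment removes it. A minimal userspace sketch of that invariant (toy types and fields, not the kernel's):

#include <assert.h>
#include <stdbool.h>

/* Toy stand-ins for the kernel's dl_rq bandwidth accounting (illustrative). */
struct toy_dl_rq { unsigned long long running_bw, rq_bw; };
struct toy_dl_se { unsigned long long dl_bw; bool throttled, on_tree; };

static void toy_enqueue(struct toy_dl_se *se, struct toy_dl_rq *rq)
{
	/* Bandwidth is accounted even for a throttled entity ... */
	rq->rq_bw += se->dl_bw;
	rq->running_bw += se->dl_bw;

	/* ... but only non-throttled entities go on the tree. */
	if (!se->throttled)
		se->on_tree = true;
}

int main(void)
{
	struct toy_dl_rq rq = { 0, 0 };
	struct toy_dl_se se = { .dl_bw = 100, .throttled = true };

	toy_enqueue(&se, &rq);
	assert(rq.running_bw == 100 && !se.on_tree); /* counted, not queued */
	return 0;
}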
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -1674,76 +1827,31 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
return;
}
- /*
- * Check if a constrained deadline task was activated
- * after the deadline but before the next period.
- * If that is the case, the task will be throttled and
- * the replenishment timer will be set to the next period.
- */
- if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
- dl_check_constrained_dl(&p->dl);
-
- if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
- add_rq_bw(&p->dl, &rq->dl);
- add_running_bw(&p->dl, &rq->dl);
- }
-
- /*
- * If p is throttled, we do not enqueue it. In fact, if it exhausted
- * its budget it needs a replenishment and, since it now is on
- * its rq, the bandwidth timer callback (which clearly has not
- * run yet) will take care of this.
- * However, the active utilization does not depend on the fact
- * that the task is on the runqueue or not (but depends on the
- * task's state - in GRUB parlance, "inactive" vs "active contending").
- * In other words, even if a task is throttled its utilization must
- * be counted in the active utilization; hence, we need to call
- * add_running_bw().
- */
- if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
- if (flags & ENQUEUE_WAKEUP)
- task_contending(&p->dl, flags);
-
- return;
- }
-
check_schedstat_required();
update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl);
+ if (p->on_rq == TASK_ON_RQ_MIGRATING)
+ flags |= ENQUEUE_MIGRATING;
+
enqueue_dl_entity(&p->dl, flags);
- if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
- enqueue_pushable_dl_task(rq, p);
-}
+ if (dl_server(&p->dl))
+ return;
-static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
-{
- update_stats_dequeue_dl(&rq->dl, &p->dl, flags);
- dequeue_dl_entity(&p->dl);
- dequeue_pushable_dl_task(rq, p);
+ if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1)
+ enqueue_pushable_dl_task(rq, p);
}
static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
update_curr_dl(rq);
- __dequeue_task_dl(rq, p, flags);
- if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
- sub_running_bw(&p->dl, &rq->dl);
- sub_rq_bw(&p->dl, &rq->dl);
- }
+ if (p->on_rq == TASK_ON_RQ_MIGRATING)
+ flags |= DEQUEUE_MIGRATING;
- /*
- * This check allows to start the inactive timer (or to immediately
- * decrease the active utilization, if needed) in two cases:
- * when the task blocks and when it is terminating
- * (p->state == TASK_DEAD). We can handle the two cases in the same
- * way, because from GRUB's point of view the same thing is happening
- * (the task moves from "active contending" to "active non contending"
- * or "inactive")
- */
- if (flags & DEQUEUE_SLEEP)
- task_non_contending(p);
+ dequeue_dl_entity(&p->dl, flags);
+ if (!p->dl.dl_throttled && !dl_server(&p->dl))
+ dequeue_pushable_dl_task(rq, p);
}
/*
@@ -1933,12 +2041,12 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
}
#ifdef CONFIG_SCHED_HRTICK
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
- hrtick_start(rq, p->dl.runtime);
+ hrtick_start(rq, dl_se->runtime);
}
#else /* !CONFIG_SCHED_HRTICK */
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
}
#endif
@@ -1958,9 +2066,6 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
if (!first)
return;
- if (hrtick_enabled_dl(rq))
- start_hrtick_dl(rq, p);
-
if (rq->curr->sched_class != &dl_sched_class)
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
@@ -1983,12 +2088,25 @@ static struct task_struct *pick_task_dl(struct rq *rq)
struct dl_rq *dl_rq = &rq->dl;
struct task_struct *p;
+again:
if (!sched_dl_runnable(rq))
return NULL;
dl_se = pick_next_dl_entity(dl_rq);
WARN_ON_ONCE(!dl_se);
- p = dl_task_of(dl_se);
+
+ if (dl_server(dl_se)) {
+ p = dl_se->server_pick(dl_se);
+ if (!p) {
+ WARN_ON_ONCE(1);
+ dl_se->dl_yielded = 1;
+ update_curr_dl_se(rq, dl_se, 0);
+ goto again;
+ }
+ p->dl_server = dl_se;
+ } else {
+ p = dl_task_of(dl_se);
+ }
return p;
}
@@ -1998,9 +2116,15 @@ static struct task_struct *pick_next_task_dl(struct rq *rq)
struct task_struct *p;
p = pick_task_dl(rq);
- if (p)
+ if (!p)
+ return p;
+
+ if (!p->dl_server)
set_next_task_dl(rq, p, true);
+ if (hrtick_enabled(rq))
+ start_hrtick_dl(rq, &p->dl);
+
return p;
}
@@ -2038,8 +2162,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
* be set and schedule() will start a new hrtick for the next task.
*/
if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 &&
- is_leftmost(p, &rq->dl))
- start_hrtick_dl(rq, p);
+ is_leftmost(&p->dl, &rq->dl))
+ start_hrtick_dl(rq, &p->dl);
}
static void task_fork_dl(struct task_struct *p)
@@ -2558,7 +2682,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
* will reset the task parameters.
*/
if (task_on_rq_queued(p) && p->dl.dl_runtime)
- task_non_contending(p);
+ task_non_contending(&p->dl);
/*
* In case a task is setscheduled out from SCHED_DEADLINE we need to
@@ -2966,10 +3090,8 @@ bool __checkparam_dl(const struct sched_attr *attr)
/*
* This function clears the sched_dl_entity static params.
*/
-void __dl_clear_params(struct task_struct *p)
+static void __dl_clear_params(struct sched_dl_entity *dl_se)
{
- struct sched_dl_entity *dl_se = &p->dl;
-
dl_se->dl_runtime = 0;
dl_se->dl_deadline = 0;
dl_se->dl_period = 0;
@@ -2981,12 +3103,21 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
dl_se->dl_overrun = 0;
+ dl_se->dl_server = 0;
#ifdef CONFIG_RT_MUTEXES
dl_se->pi_se = dl_se;
#endif
}
+void init_dl_entity(struct sched_dl_entity *dl_se)
+{
+ RB_CLEAR_NODE(&dl_se->rb_node);
+ init_dl_task_timer(dl_se);
+ init_dl_inactive_task_timer(dl_se);
+ __dl_clear_params(dl_se);
+}
+
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
{
struct sched_dl_entity *dl_se = &p->dl;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4580a450700e..8d5d98a5834d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -628,8 +628,8 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread;
- struct sched_entity *last, *first;
+ s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+ struct sched_entity *last, *first, *root;
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -644,15 +644,20 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_rq_lock_irqsave(rq, flags);
+ root = __pick_root_entity(cfs_rq);
+ if (root)
+ left_vruntime = root->min_vruntime;
first = __pick_first_entity(cfs_rq);
if (first)
- left_vruntime = first->vruntime;
+ left_deadline = first->deadline;
last = __pick_last_entity(cfs_rq);
if (last)
right_vruntime = last->vruntime;
min_vruntime = cfs_rq->min_vruntime;
raw_spin_rq_unlock_irqrestore(rq, flags);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
+ SPLIT_NS(left_deadline));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
SPLIT_NS(left_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
@@ -679,8 +684,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->avg.runnable_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
- SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
- cfs_rq->avg.util_est.enqueued);
+ SEQ_printf(m, " .%-30s: %u\n", "util_est",
+ cfs_rq->avg.util_est);
SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
cfs_rq->removed.load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
@@ -1070,8 +1075,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.avg.runnable_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
- P(se.avg.util_est.ewma);
- PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
+ PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
#endif
#ifdef CONFIG_UCLAMP_TASK
__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d7a3c63a2171..533547e3c90a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -551,7 +551,11 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
static inline bool entity_before(const struct sched_entity *a,
const struct sched_entity *b)
{
- return (s64)(a->vruntime - b->vruntime) < 0;
+ /*
+ * A tiebreak on vruntime seems unnecessary, since deadline
+ * ties can hardly happen.
+ */
+ return (s64)(a->deadline - b->deadline) < 0;
}
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -720,7 +724,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
* to the loss in precision caused by the division.
*/
-int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{
struct sched_entity *curr = cfs_rq->curr;
s64 avg = cfs_rq->avg_vruntime;
@@ -733,7 +737,12 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
load += weight;
}
- return avg >= entity_key(cfs_rq, se) * load;
+ return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
+}
+
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return vruntime_eligible(cfs_rq, se->vruntime);
}
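
vruntime_eligible() sidesteps the lossy division in avg_vruntime() by cross-multiplying: instead of comparing a vruntime against sum(key * weight) / load, it compares sum(key * weight) against key * load. A standalone toy version of the same comparison (hypothetical values, and the curr adjustment is left out):

#include <assert.h>

/* Toy model of the division-free eligibility test: an entity is eligible
 * when its key does not exceed the load-weighted average key, checked as
 *     sum(w_i * key_i) >= key * sum(w_i)
 * which avoids the lossy division sum/load. */
static int toy_eligible(const long long *key, const long long *w, int n,
			long long candidate_key)
{
	long long avg = 0, load = 0;

	for (int i = 0; i < n; i++) {
		avg += key[i] * w[i];
		load += w[i];
	}
	return avg >= candidate_key * load;
}

int main(void)
{
	long long key[] = { -10, 0, 30 };	/* vruntime - min_vruntime */
	long long w[]   = {   1, 2,  1 };

	/* weighted average key = (-10 + 0 + 30) / 4 = 5 */
	assert(toy_eligible(key, w, 3, 0));	/* 0 <= 5: eligible */
	assert(!toy_eligible(key, w, 3, 30));	/* 30 > 5: not eligible */
	return 0;
}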
static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
@@ -752,9 +761,8 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
- struct sched_entity *se = __pick_first_entity(cfs_rq);
+ struct sched_entity *se = __pick_root_entity(cfs_rq);
struct sched_entity *curr = cfs_rq->curr;
-
u64 vruntime = cfs_rq->min_vruntime;
if (curr) {
@@ -766,9 +774,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
if (se) {
if (!curr)
- vruntime = se->vruntime;
+ vruntime = se->min_vruntime;
else
- vruntime = min_vruntime(vruntime, se->vruntime);
+ vruntime = min_vruntime(vruntime, se->min_vruntime);
}
/* ensure we never gain time by being placed backwards. */
@@ -781,34 +789,34 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
return entity_before(__node_2_se(a), __node_2_se(b));
}
-#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
-static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
+static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
{
if (node) {
struct sched_entity *rse = __node_2_se(node);
- if (deadline_gt(min_deadline, se, rse))
- se->min_deadline = rse->min_deadline;
+ if (vruntime_gt(min_vruntime, se, rse))
+ se->min_vruntime = rse->min_vruntime;
}
}
/*
- * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
+ * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
*/
-static inline bool min_deadline_update(struct sched_entity *se, bool exit)
+static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
{
- u64 old_min_deadline = se->min_deadline;
+ u64 old_min_vruntime = se->min_vruntime;
struct rb_node *node = &se->run_node;
- se->min_deadline = se->deadline;
- __update_min_deadline(se, node->rb_right);
- __update_min_deadline(se, node->rb_left);
+ se->min_vruntime = se->vruntime;
+ __min_vruntime_update(se, node->rb_right);
+ __min_vruntime_update(se, node->rb_left);
- return se->min_deadline == old_min_deadline;
+ return se->min_vruntime == old_min_vruntime;
}
-RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
- run_node, min_deadline, min_deadline_update);
+RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
+ run_node, min_vruntime, min_vruntime_update);
/*
* Enqueue an entity into the rb-tree:
@@ -816,18 +824,28 @@ RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
avg_vruntime_add(cfs_rq, se);
- se->min_deadline = se->deadline;
+ se->min_vruntime = se->vruntime;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
- __entity_less, &min_deadline_cb);
+ __entity_less, &min_vruntime_cb);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
- &min_deadline_cb);
+ &min_vruntime_cb);
avg_vruntime_sub(cfs_rq, se);
}
+struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
+{
+ struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;
+
+ if (!root)
+ return NULL;
+
+ return __node_2_se(root);
+}
+
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
@@ -850,23 +868,29 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
* with the earliest virtual deadline.
*
* We can do this in O(log n) time due to an augmented RB-tree. The
- * tree keeps the entries sorted on service, but also functions as a
- * heap based on the deadline by keeping:
+ * tree keeps the entries sorted on deadline, but also functions as a
+ * heap based on the vruntime by keeping:
*
- * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
+ * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
*
- * Which allows an EDF like search on (sub)trees.
+ * Which allows tree pruning through eligibility.
*/
-static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *curr = cfs_rq->curr;
struct sched_entity *best = NULL;
- struct sched_entity *best_left = NULL;
+
+ /*
+ * We can safely skip the eligibility check if there is only one
+ * entity in this cfs_rq, saving some cycles.
+ */
+ if (cfs_rq->nr_running == 1)
+ return curr && curr->on_rq ? curr : se;
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
- best = curr;
/*
* Once selected, run a task until it either becomes non-eligible or
@@ -875,95 +899,45 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
return curr;
+ /* Pick the leftmost entity if it's eligible */
+ if (se && entity_eligible(cfs_rq, se)) {
+ best = se;
+ goto found;
+ }
+
+ /* Heap search for the entity with the earliest eligible virtual deadline */
while (node) {
- struct sched_entity *se = __node_2_se(node);
+ struct rb_node *left = node->rb_left;
/*
- * If this entity is not eligible, try the left subtree.
+ * Eligible entities in left subtree are always better
+ * choices, since they have earlier deadlines.
*/
- if (!entity_eligible(cfs_rq, se)) {
- node = node->rb_left;
+ if (left && vruntime_eligible(cfs_rq,
+ __node_2_se(left)->min_vruntime)) {
+ node = left;
continue;
}
- /*
- * Now we heap search eligible trees for the best (min_)deadline
- */
- if (!best || deadline_gt(deadline, best, se))
- best = se;
+ se = __node_2_se(node);
/*
- * Every se in a left branch is eligible, keep track of the
- * branch with the best min_deadline
+ * The left subtree either is empty or has no eligible
+ * entity, so check the current node since it is the one
+ * with the earliest deadline that might be eligible.
*/
- if (node->rb_left) {
- struct sched_entity *left = __node_2_se(node->rb_left);
-
- if (!best_left || deadline_gt(min_deadline, best_left, left))
- best_left = left;
-
- /*
- * min_deadline is in the left branch. rb_left and all
- * descendants are eligible, so immediately switch to the second
- * loop.
- */
- if (left->min_deadline == se->min_deadline)
- break;
- }
-
- /* min_deadline is at this node, no need to look right */
- if (se->deadline == se->min_deadline)
+ if (entity_eligible(cfs_rq, se)) {
+ best = se;
break;
-
- /* else min_deadline is in the right branch. */
- node = node->rb_right;
- }
-
- /*
- * We ran into an eligible node which is itself the best.
- * (Or nr_running == 0 and both are NULL)
- */
- if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0)
- return best;
-
- /*
- * Now best_left and all of its children are eligible, and we are just
- * looking for deadline == min_deadline
- */
- node = &best_left->run_node;
- while (node) {
- struct sched_entity *se = __node_2_se(node);
-
- /* min_deadline is the current node */
- if (se->deadline == se->min_deadline)
- return se;
-
- /* min_deadline is in the left branch */
- if (node->rb_left &&
- __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
- node = node->rb_left;
- continue;
}
- /* else min_deadline is in the right branch */
node = node->rb_right;
}
- return NULL;
-}
-
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
-{
- struct sched_entity *se = __pick_eevdf(cfs_rq);
+found:
+ if (!best || (curr && entity_before(curr, best)))
+ best = curr;
- if (!se) {
- struct sched_entity *left = __pick_first_entity(cfs_rq);
- if (left) {
- pr_err("EEVDF scheduling fail, picking leftmost\n");
- return left;
- }
- }
-
- return se;
+ return best;
}
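
The rewritten pick_eevdf() descends the deadline-sorted tree, taking the left child whenever that subtree's min_vruntime is still eligible, so the first eligible node it lands on is the one with the earliest virtual deadline. Stripped of the rb-tree, the rule it implements can be modeled with a linear scan (illustrative only; real eligibility is load-weighted and the tree makes this O(log n)):

#include <stdio.h>

/* Toy EEVDF pick: among eligible entities (vruntime <= avg), choose the
 * earliest virtual deadline. The kernel does this in O(log n) with the
 * min_vruntime-augmented rb-tree; this linear scan only models the rule. */
struct toy_se { long long vruntime, deadline; };

static const struct toy_se *toy_pick_eevdf(const struct toy_se *v, int n,
					   long long avg_vruntime)
{
	const struct toy_se *best = NULL;

	for (int i = 0; i < n; i++) {
		if (v[i].vruntime > avg_vruntime)	/* not eligible */
			continue;
		if (!best || v[i].deadline < best->deadline)
			best = &v[i];
	}
	return best;
}

int main(void)
{
	struct toy_se rq[] = {
		{ .vruntime = 5, .deadline = 40 },
		{ .vruntime = 0, .deadline = 90 },
		{ .vruntime = 9, .deadline = 10 },	/* earliest, but ineligible */
	};
	const struct toy_se *p = toy_pick_eevdf(rq, 3, 6 /* avg */);

	printf("picked deadline=%lld\n", p->deadline);	/* prints 40 */
	return 0;
}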
#ifdef CONFIG_SCHED_DEBUG
@@ -1129,23 +1103,17 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_SMP */
-/*
- * Update the current task's runtime statistics.
- */
-static void update_curr(struct cfs_rq *cfs_rq)
+static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
{
- struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_clock_task(rq_of(cfs_rq));
- u64 delta_exec;
-
- if (unlikely(!curr))
- return;
+ u64 now = rq_clock_task(rq);
+ s64 delta_exec;
delta_exec = now - curr->exec_start;
- if (unlikely((s64)delta_exec <= 0))
- return;
+ if (unlikely(delta_exec <= 0))
+ return delta_exec;
curr->exec_start = now;
+ curr->sum_exec_runtime += delta_exec;
if (schedstat_enabled()) {
struct sched_statistics *stats;
@@ -1155,20 +1123,54 @@ static void update_curr(struct cfs_rq *cfs_rq)
max(delta_exec, stats->exec_max));
}
- curr->sum_exec_runtime += delta_exec;
- schedstat_add(cfs_rq->exec_clock, delta_exec);
+ return delta_exec;
+}
+
+static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
+{
+ trace_sched_stat_runtime(p, delta_exec);
+ account_group_exec_runtime(p, delta_exec);
+ cgroup_account_cputime(p, delta_exec);
+ if (p->dl_server)
+ dl_server_update(p->dl_server, delta_exec);
+}
+
+/*
+ * Used by other classes to account runtime.
+ */
+s64 update_curr_common(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ s64 delta_exec;
+
+ delta_exec = update_curr_se(rq, &curr->se);
+ if (likely(delta_exec > 0))
+ update_curr_task(curr, delta_exec);
+
+ return delta_exec;
+}
+
+/*
+ * Update the current task's runtime statistics.
+ */
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 delta_exec;
+
+ if (unlikely(!curr))
+ return;
+
+ delta_exec = update_curr_se(rq_of(cfs_rq), curr);
+ if (unlikely(delta_exec <= 0))
+ return;
curr->vruntime += calc_delta_fair(delta_exec, curr);
update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
- if (entity_is_task(curr)) {
- struct task_struct *curtask = task_of(curr);
-
- trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
- cgroup_account_cputime(curtask, delta_exec);
- account_group_exec_runtime(curtask, delta_exec);
- }
+ if (entity_is_task(curr))
+ update_curr_task(task_of(curr), delta_exec);
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
@@ -3164,7 +3166,7 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
* This is also done to avoid any side effect of task scanning
* amplifying the unfairness of disjoint set of VMAs' access.
*/
- if (READ_ONCE(current->mm->numa_scan_seq) < 2)
+ if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
return true;
pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
@@ -3307,6 +3309,8 @@ retry_pids:
if (!vma->numab_state)
continue;
+ vma->numab_state->start_scan_seq = mm->numa_scan_seq;
+
vma->numab_state->next_scan = now +
msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
@@ -3811,17 +3815,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
update_load_add(&cfs_rq->load, se->load.weight);
- if (!curr) {
- /*
- * The entity's vruntime has been adjusted, so let's check
- * whether the rq-wide min_vruntime needs updated too. Since
- * the calculations above require stable min_vruntime rather
- * than up-to-date one, we do the update at the end of the
- * reweight process.
- */
+ if (!curr)
__enqueue_entity(cfs_rq, se);
- update_min_vruntime(cfs_rq);
- }
+
+ /*
+ * The entity's vruntime has been adjusted, so let's check
+ * whether the rq-wide min_vruntime needs updated too. Since
+ * the calculations above require stable min_vruntime rather
+ * than up-to-date one, we do the update at the end of the
+ * reweight process.
+ */
+ update_min_vruntime(cfs_rq);
}
}
@@ -4096,6 +4100,10 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
if (cfs_rq->tg == &root_task_group)
return;
+ /* The rq has gone offline and no longer contributes to the share: */
+ if (!cpu_active(cpu_of(rq_of(cfs_rq))))
+ return;
+
/*
* For migration heavy workloads, access to tg->load_avg can be
* unbound. Limit the update rate to at most once per ms.
@@ -4112,6 +4120,49 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
}
}
+static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
+{
+ long delta;
+ u64 now;
+
+ /*
+ * No need to update load_avg for root_task_group, as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
+ now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ delta = 0 - cfs_rq->tg_load_avg_contrib;
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ cfs_rq->tg_load_avg_contrib = 0;
+ cfs_rq->last_update_tg_load_avg = now;
+}
+
+/* CPU offline callback: */
+static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
+{
+ struct task_group *tg;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * The rq clock has already been updated in
+ * set_rq_offline(), so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_start_loop_update(rq);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ clear_tg_load_avg(cfs_rq);
+ }
+ rcu_read_unlock();
+
+ rq_clock_stop_loop_update(rq);
+}
+
/*
* Called within set_task_rq() right before setting a task's CPU. The
* caller only guarantees p->pi_lock is held; no other assumptions,
@@ -4408,6 +4459,8 @@ static inline bool skip_blocked_update(struct sched_entity *se)
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
+static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}
+
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
return 0;
@@ -4770,11 +4823,14 @@ static inline unsigned long task_util(struct task_struct *p)
return READ_ONCE(p->se.avg.util_avg);
}
-static inline unsigned long _task_util_est(struct task_struct *p)
+static inline unsigned long task_runnable(struct task_struct *p)
{
- struct util_est ue = READ_ONCE(p->se.avg.util_est);
+ return READ_ONCE(p->se.avg.runnable_avg);
+}
- return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+ return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
}
static inline unsigned long task_util_est(struct task_struct *p)
@@ -4791,9 +4847,9 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
return;
/* Update root cfs_rq's estimated utilization */
- enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued = cfs_rq->avg.util_est;
enqueued += _task_util_est(p);
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+ WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
trace_sched_util_est_cfs_tp(cfs_rq);
}
@@ -4807,34 +4863,20 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
return;
/* Update root cfs_rq's estimated utilization */
- enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued = cfs_rq->avg.util_est;
enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+ WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
trace_sched_util_est_cfs_tp(cfs_rq);
}
#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
-/*
- * Check if a (signed) value is within a specified (unsigned) margin,
- * based on the observation that:
- *
- * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
- *
- * NOTE: this only works when value + margin < INT_MAX.
- */
-static inline bool within_margin(int value, int margin)
-{
- return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
-}
-
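
The helper removed above relied on the identity abs(x) < y <=> (unsigned)(x + y - 1) < 2*y - 1, which only holds while value + margin stays below INT_MAX. A quick standalone check of that identity (illustrative):

#include <assert.h>
#include <stdlib.h>

/* The removed helper's branchless range test:
 *     abs(x) < y  <=>  (unsigned)(x + y - 1) < 2*y - 1 */
static int toy_within_margin(int value, int margin)
{
	return (unsigned int)(value + margin - 1) < (2u * margin - 1);
}

int main(void)
{
	for (int x = -50; x <= 50; x++)
		assert(toy_within_margin(x, 25) == (abs(x) < 25));
	return 0;
}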
static inline void util_est_update(struct cfs_rq *cfs_rq,
struct task_struct *p,
bool task_sleep)
{
- long last_ewma_diff, last_enqueued_diff;
- struct util_est ue;
+ unsigned int ewma, dequeued, last_ewma_diff;
if (!sched_feat(UTIL_EST))
return;
@@ -4846,71 +4888,73 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
if (!task_sleep)
return;
+ /* Get current estimate of utilization */
+ ewma = READ_ONCE(p->se.avg.util_est);
+
/*
* If the PELT values haven't changed since enqueue time,
* skip the util_est update.
*/
- ue = p->se.avg.util_est;
- if (ue.enqueued & UTIL_AVG_UNCHANGED)
+ if (ewma & UTIL_AVG_UNCHANGED)
return;
- last_enqueued_diff = ue.enqueued;
+ /* Get utilization at dequeue */
+ dequeued = task_util(p);
/*
* Reset EWMA on utilization increases, the moving average is used only
* to smooth utilization decreases.
*/
- ue.enqueued = task_util(p);
- if (sched_feat(UTIL_EST_FASTUP)) {
- if (ue.ewma < ue.enqueued) {
- ue.ewma = ue.enqueued;
- goto done;
- }
+ if (ewma <= dequeued) {
+ ewma = dequeued;
+ goto done;
}
/*
* Skip update of task's estimated utilization when its members are
* already ~1% close to its last activation value.
*/
- last_ewma_diff = ue.enqueued - ue.ewma;
- last_enqueued_diff -= ue.enqueued;
- if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
- if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
- goto done;
-
- return;
- }
+ last_ewma_diff = ewma - dequeued;
+ if (last_ewma_diff < UTIL_EST_MARGIN)
+ goto done;
/*
* To avoid overestimating the actual task utilization, skip updates if
* we cannot guarantee there is idle time on this CPU.
*/
- if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
+ if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
return;
/*
+ * To avoid underestimating task utilization, skip EWMA updates if
+ * we cannot guarantee that the thread got all the CPU time it wanted.
+ */
+ if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
+ goto done;
+
+ /*
* Update Task's estimated utilization
*
* When *p completes an activation we can consolidate another sample
- * of the task size. This is done by storing the current PELT value
- * as ue.enqueued and by using this value to update the Exponential
- * Weighted Moving Average (EWMA):
+ * of the task size. This is done by using this value to update the
+ * Exponential Weighted Moving Average (EWMA):
*
* ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
* = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
* = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
- * = w * ( last_ewma_diff ) + ewma(t-1)
- * = w * (last_ewma_diff + ewma(t-1) / w)
+ * = w * ( -last_ewma_diff ) + ewma(t-1)
+ * = w * (-last_ewma_diff + ewma(t-1) / w)
*
* Where 'w' is the weight of new samples, which is configured to be
* 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
*/
- ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
- ue.ewma += last_ewma_diff;
- ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+ ewma <<= UTIL_EST_WEIGHT_SHIFT;
+ ewma -= last_ewma_diff;
+ ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
- ue.enqueued |= UTIL_AVG_UNCHANGED;
- WRITE_ONCE(p->se.avg.util_est, ue);
+ ewma |= UTIL_AVG_UNCHANGED;
+ WRITE_ONCE(p->se.avg.util_est, ewma);
trace_sched_util_est_se_tp(&p->se);
}
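
With w = 1/4 (UTIL_EST_WEIGHT_SHIFT = 2), the shift sequence above computes ewma - (ewma - dequeued)/4 without a divide. A worked numeric check of that arithmetic (toy values):

#include <assert.h>

#define TOY_UTIL_EST_WEIGHT_SHIFT 2	/* w = 1/4, as in the kernel */

/* ewma(t) = ewma(t-1) - w * (ewma(t-1) - dequeued), computed with shifts. */
static unsigned int toy_ewma_update(unsigned int ewma, unsigned int dequeued)
{
	unsigned int last_ewma_diff;

	if (ewma <= dequeued)		/* utilization increase: reset */
		return dequeued;

	last_ewma_diff = ewma - dequeued;
	ewma <<= TOY_UTIL_EST_WEIGHT_SHIFT;
	ewma -= last_ewma_diff;
	ewma >>= TOY_UTIL_EST_WEIGHT_SHIFT;
	return ewma;
}

int main(void)
{
	/* 400 -> dequeued at 200: new ewma = 400 - (400 - 200)/4 = 350 */
	assert(toy_ewma_update(400, 200) == 350);
	/* increases snap straight to the new value */
	assert(toy_ewma_update(100, 300) == 300);
	return 0;
}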
@@ -7638,16 +7682,16 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
if (sched_feat(UTIL_EST)) {
unsigned long util_est;
- util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+ util_est = READ_ONCE(cfs_rq->avg.util_est);
/*
* During wake-up @p isn't enqueued yet and doesn't contribute
- * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+ * to any cpu_rq(cpu)->cfs.avg.util_est.
* If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
* has been enqueued.
*
* During exec (@dst_cpu = -1) @p is enqueued and does
- * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+ * contribute to cpu_rq(cpu)->cfs.util_est.
* Remove it to "simulate" cpu_util without @p's contribution.
*
* Despite the task_on_rq_queued(@p) check there is still a
@@ -7776,7 +7820,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
for_each_cpu(cpu, pd_cpus) {
unsigned long util = cpu_util(cpu, p, -1, 0);
- busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
+ busy_time += effective_cpu_util(cpu, util, NULL, NULL);
}
eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
@@ -7799,7 +7843,7 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
for_each_cpu(cpu, pd_cpus) {
struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
- unsigned long eff_util;
+ unsigned long eff_util, min, max;
/*
* Performance domain frequency: utilization clamping
@@ -7808,7 +7852,23 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
- eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
+ eff_util = effective_cpu_util(cpu, util, &min, &max);
+
+ /* Task's uclamp can modify min and max value */
+ if (tsk && uclamp_is_used()) {
+ min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
+
+ /*
+ * If there is no active max uclamp constraint,
+ * directly use task's one, otherwise keep max.
+ */
+ if (uclamp_rq_is_idle(cpu_rq(cpu)))
+ max = uclamp_eff_value(p, UCLAMP_MAX);
+ else
+ max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
+ }
+
+ eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
max_util = max(max_util, eff_util);
}
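
The uclamp handling above raises the floor unconditionally but replaces the ceiling only when the rq carries no active max clamp. A toy model of that aggregation rule (hypothetical inputs):

#include <assert.h>

#define max(a, b) ((a) > (b) ? (a) : (b))

/* Toy model of the min/max merge above: the task's own uclamp can raise
 * the floor, and replaces the ceiling only when the rq is "idle" (no
 * active max clamp); otherwise the larger ceiling wins. */
static void toy_uclamp_merge(unsigned long *min, unsigned long *max_,
			     unsigned long tsk_min, unsigned long tsk_max,
			     int rq_idle)
{
	*min = max(*min, tsk_min);
	if (rq_idle)
		*max_ = tsk_max;
	else
		*max_ = max(*max_, tsk_max);
}

int main(void)
{
	unsigned long mn = 100, mx = 512;

	toy_uclamp_merge(&mn, &mx, 200, 300, 0);
	assert(mn == 200 && mx == 512);	/* floor raised, ceiling kept */
	toy_uclamp_merge(&mn, &mx, 0, 300, 1);
	assert(mx == 300);		/* idle rq: task ceiling applies */
	return 0;
}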
@@ -8210,7 +8270,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- int next_buddy_marked = 0;
int cse_is_idle, pse_is_idle;
if (unlikely(se == pse))
@@ -8227,7 +8286,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) {
set_next_buddy(pse);
- next_buddy_marked = 1;
}
/*
@@ -9060,7 +9118,7 @@ static int detach_tasks(struct lb_env *env)
case migrate_util:
util = task_util_est(p);
- if (util > env->imbalance)
+ if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance)
goto next;
env->imbalance -= util;
@@ -12413,6 +12471,9 @@ static void rq_offline_fair(struct rq *rq)
/* Ensure any throttled groups are reachable by pick_next_task */
unthrottle_offline_cfs_rqs(rq);
+
+ /* Ensure that we remove rq contribution to group share: */
+ clear_tg_offline_cfs_rqs(rq);
}
#endif /* CONFIG_SMP */
@@ -13036,19 +13097,6 @@ next_cpu:
return 0;
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
-
-void free_fair_sched_group(struct task_group *tg) { }
-
-int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
-{
- return 1;
-}
-
-void online_fair_sched_group(struct task_group *tg) { }
-
-void unregister_fair_sched_group(struct task_group *tg) { }
-
#endif /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index a3ddf84de430..143f55df890b 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -83,7 +83,6 @@ SCHED_FEAT(WA_BIAS, true)
* UtilEstimation. Use estimated CPU utilization.
*/
SCHED_FEAT(UTIL_EST, true)
-SCHED_FEAT(UTIL_EST_FASTUP, true)
SCHED_FEAT(LATENCY_WARN, false)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 565f8374ddbb..31231925f1ec 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -258,6 +258,36 @@ static void do_idle(void)
while (!need_resched()) {
rmb();
+ /*
+ * Interrupts shouldn't be re-enabled from that point on until
+ * the CPU sleeping instruction is reached. Otherwise an interrupt
+ * may fire and queue a timer that would be ignored until the CPU
+ * wakes from the sleeping instruction. And testing need_resched()
+ * doesn't tell about pending needed timer reprogram.
+ *
+ * Several cases to consider:
+ *
+ * - SLEEP-UNTIL-PENDING-INTERRUPT based instructions such as
+ * "wfi" or "mwait" are fine because they can be entered with
+ * interrupt disabled.
+ *
+ * - sti;mwait() couple is fine because the interrupts are
+ * re-enabled only upon the execution of mwait, leaving no gap
+ * in-between.
+ *
+ * - ROLLBACK based idle handlers with the sleeping instruction
+ * called with interrupts enabled are NOT fine. In this scheme
+ * when the interrupt detects it has interrupted an idle handler,
+ * it rolls back to its beginning which performs the
+ * need_resched() check before re-executing the sleeping
+ * instruction. This can miss a needed timer reprogram.
+ * If such a scheme is really mandatory due to the lack of an
+ * appropriate CPU sleeping instruction, then a FAST-FORWARD
+ * must instead be applied: when the interrupt detects it has
+ * interrupted an idle handler, it must resume to the end of
+ * this idle handler so that the generic idle loop is iterated
+ * again to reprogram the tick.
+ */
local_irq_disable();
if (cpu_is_offline(cpu)) {
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 3a0e0dc28721..9e1083465fbc 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -52,13 +52,13 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
return;
/* Avoid store if the flag has been already reset */
- enqueued = avg->util_est.enqueued;
+ enqueued = avg->util_est;
if (!(enqueued & UTIL_AVG_UNCHANGED))
return;
/* Reset flag to report util_avg has been updated */
enqueued &= ~UTIL_AVG_UNCHANGED;
- WRITE_ONCE(avg->util_est.enqueued, enqueued);
+ WRITE_ONCE(avg->util_est, enqueued);
}
static inline u64 rq_clock_pelt(struct rq *rq)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 6aaf0a3d6081..3261b067b67e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1002,24 +1002,15 @@ static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
- u64 delta_exec;
- u64 now;
+ s64 delta_exec;
if (curr->sched_class != &rt_sched_class)
return;
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec <= 0))
+ delta_exec = update_curr_common(rq);
+ if (unlikely(delta_exec <= 0))
return;
- schedstat_set(curr->stats.exec_max,
- max(curr->stats.exec_max, delta_exec));
-
- trace_sched_stat_runtime(curr, delta_exec, 0);
-
- update_current_exec_runtime(curr, now, delta_exec);
-
if (!rt_bandwidth_enabled())
return;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2e5a95486a42..001fe047bd5d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -273,8 +273,6 @@ struct rt_bandwidth {
unsigned int rt_period_active;
};
-void __dl_clear_params(struct task_struct *p);
-
static inline int dl_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
@@ -315,6 +313,33 @@ extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *att
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern int dl_bw_check_overflow(int cpu);
+/*
+ * SCHED_DEADLINE supports servers (nested scheduling) with the following
+ * interface:
+ *
+ * dl_se::rq -- runqueue we belong to.
+ *
+ * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
+ * server when it runs out of tasks to run.
+ *
+ * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
+ * returns NULL.
+ *
+ * dl_server_update() -- called from update_curr_common(), propagates runtime
+ * to the server.
+ *
+ * dl_server_start()
+ * dl_server_stop() -- start/stop the server when it has (no) tasks.
+ *
+ * dl_server_init() -- initializes the server.
+ */
+extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
+extern void dl_server_start(struct sched_dl_entity *dl_se);
+extern void dl_server_stop(struct sched_dl_entity *dl_se);
+extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
+ dl_server_has_tasks_f has_tasks,
+ dl_server_pick_f pick);
+
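
The block comment above fixes the contract: the core drives a server through two callbacks, and a NULL return from the pick hook yields the period. A userspace toy of that callback shape (all names here are illustrative, merely mirroring dl_server_has_tasks_f/dl_server_pick_f):

#include <stdio.h>

/* Toy model of the server interface: the core only needs two callbacks. */
struct toy_server {
	int (*has_tasks)(struct toy_server *);
	const char *(*pick)(struct toy_server *);	/* NULL yields the period */
	int nr_queued;
};

static int toy_has_tasks(struct toy_server *s) { return s->nr_queued > 0; }

static const char *toy_pick(struct toy_server *s)
{
	return s->nr_queued-- > 0 ? "task" : NULL;
}

int main(void)
{
	struct toy_server s = { toy_has_tasks, toy_pick, 1 };

	while (s.has_tasks(&s))
		printf("server runs: %s\n", s.pick(&s));
	return 0;
}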
#ifdef CONFIG_CGROUP_SCHED
struct cfs_rq;
@@ -436,10 +461,21 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
extern int tg_nop(struct task_group *tg, void *data);
+#ifdef CONFIG_FAIR_GROUP_SCHED
extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg);
+#else
+static inline void free_fair_sched_group(struct task_group *tg) { }
+static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ return 1;
+}
+static inline void online_fair_sched_group(struct task_group *tg) { }
+static inline void unregister_fair_sched_group(struct task_group *tg) { }
+#endif
+
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
struct sched_entity *parent);
@@ -2179,6 +2215,10 @@ extern const u32 sched_prio_to_wmult[40];
* MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
* in the runqueue.
*
+ * NOCLOCK - skip the update_rq_clock() (avoids double updates)
+ *
+ * MIGRATING - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE)
+ *
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_MIGRATED - the task was migrated during wakeup
@@ -2189,6 +2229,7 @@ extern const u32 sched_prio_to_wmult[40];
#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
+#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
@@ -2203,6 +2244,7 @@ extern const u32 sched_prio_to_wmult[40];
#define ENQUEUE_MIGRATED 0x00
#endif
#define ENQUEUE_INITIAL 0x80
+#define ENQUEUE_MIGRATING 0x100
#define RETRY_TASK ((void *)-1UL)
@@ -2212,6 +2254,8 @@ struct affinity_context {
unsigned int flags;
};
+extern s64 update_curr_common(struct rq *rq);
+
struct sched_class {
#ifdef CONFIG_UCLAMP_TASK
@@ -2425,8 +2469,7 @@ extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
-extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
+extern void init_dl_entity(struct sched_dl_entity *dl_se);
#define BW_SHIFT 20
#define BW_UNIT (1 << BW_SHIFT)
@@ -2822,6 +2865,7 @@ DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
double_rq_lock(_T->lock, _T->lock2),
double_rq_unlock(_T->lock, _T->lock2))
+extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
@@ -2961,24 +3005,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif
#ifdef CONFIG_SMP
-/**
- * enum cpu_util_type - CPU utilization type
- * @FREQUENCY_UTIL: Utilization used to select frequency
- * @ENERGY_UTIL: Utilization used during energy calculation
- *
- * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
- * need to be aggregated differently depending on the usage made of them. This
- * enum is used within effective_cpu_util() to differentiate the types of
- * utilization expected by the callers, and adjust the aggregation accordingly.
- */
-enum cpu_util_type {
- FREQUENCY_UTIL,
- ENERGY_UTIL,
-};
-
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- enum cpu_util_type type,
- struct task_struct *p);
+ unsigned long *min,
+ unsigned long *max);
+
+unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
+ unsigned long min,
+ unsigned long max);
+
/*
* Verify the fitness of task @p to run on @cpu taking into account the
@@ -3035,59 +3069,6 @@ static inline bool uclamp_rq_is_idle(struct rq *rq)
return rq->uclamp_flags & UCLAMP_FLAG_IDLE;
}
-/**
- * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
- * @rq: The rq to clamp against. Must not be NULL.
- * @util: The util value to clamp.
- * @p: The task to clamp against. Can be NULL if you want to clamp
- * against @rq only.
- *
- * Clamps the passed @util to the max(@rq, @p) effective uclamp values.
- *
- * If sched_uclamp_used static key is disabled, then just return the util
- * without any clamping since uclamp aggregation at the rq level in the fast
- * path is disabled, rendering this operation a NOP.
- *
- * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It
- * will return the correct effective uclamp value of the task even if the
- * static key is disabled.
- */
-static __always_inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
- struct task_struct *p)
-{
- unsigned long min_util = 0;
- unsigned long max_util = 0;
-
- if (!static_branch_likely(&sched_uclamp_used))
- return util;
-
- if (p) {
- min_util = uclamp_eff_value(p, UCLAMP_MIN);
- max_util = uclamp_eff_value(p, UCLAMP_MAX);
-
- /*
- * Ignore last runnable task's max clamp, as this task will
- * reset it. Similarly, no need to read the rq's min clamp.
- */
- if (uclamp_rq_is_idle(rq))
- goto out;
- }
-
- min_util = max_t(unsigned long, min_util, uclamp_rq_get(rq, UCLAMP_MIN));
- max_util = max_t(unsigned long, max_util, uclamp_rq_get(rq, UCLAMP_MAX));
-out:
- /*
- * Since CPU's {min,max}_util clamps are MAX aggregated considering
- * RUNNABLE tasks with _different_ clamps, we can end up with an
- * inversion. Fix it now when the clamps are applied.
- */
- if (unlikely(min_util >= max_util))
- return min_util;
-
- return clamp(util, min_util, max_util);
-}
-
/* Is the rq being capped/throttled by uclamp_max? */
static inline bool uclamp_rq_is_capped(struct rq *rq)
{
@@ -3125,13 +3106,6 @@ static inline unsigned long uclamp_eff_value(struct task_struct *p,
return SCHED_CAPACITY_SCALE;
}
-static inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
- struct task_struct *p)
-{
- return util;
-}
-
static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; }
static inline bool uclamp_is_used(void)
@@ -3261,16 +3235,6 @@ extern int sched_dynamic_mode(const char *str);
extern void sched_dynamic_update(int mode);
#endif
-static inline void update_current_exec_runtime(struct task_struct *curr,
- u64 now, u64 delta_exec)
-{
- curr->se.sum_exec_runtime += delta_exec;
- account_group_exec_runtime(curr, delta_exec);
-
- curr->se.exec_start = now;
- cgroup_account_cputime(curr, delta_exec);
-}
-
#ifdef CONFIG_SCHED_MM_CID
#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 6cf7304e6449..b1b8fe61c532 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -70,18 +70,7 @@ static void yield_task_stop(struct rq *rq)
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *curr = rq->curr;
- u64 now, delta_exec;
-
- now = rq_clock_task(rq);
- delta_exec = now - curr->se.exec_start;
- if (unlikely((s64)delta_exec < 0))
- delta_exec = 0;
-
- schedstat_set(curr->stats.exec_max,
- max(curr->stats.exec_max, delta_exec));
-
- update_current_exec_runtime(curr, now, delta_exec);
+ update_curr_common(rq);
}
/*
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 255999ba9190..aca7b437882e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1072,7 +1072,7 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
*/
list_del_init(&addfd->list);
if (!addfd->setfd)
- fd = receive_fd(addfd->file, addfd->flags);
+ fd = receive_fd(addfd->file, NULL, addfd->flags);
else
fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
addfd->ret = fd;
diff --git a/kernel/signal.c b/kernel/signal.c
index 47a7602dfe8d..c9c57d053ce4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -171,16 +171,6 @@ static bool recalc_sigpending_tsk(struct task_struct *t)
return false;
}
-/*
- * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up.
- * This is superfluous when called on current, the wakeup is a harmless no-op.
- */
-void recalc_sigpending_and_wake(struct task_struct *t)
-{
- if (recalc_sigpending_tsk(t))
- signal_wake_up(t, 0);
-}
-
void recalc_sigpending(void)
{
if (!recalc_sigpending_tsk(current) && !freezing(current))
@@ -1348,10 +1338,8 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
action->sa.sa_handler = SIG_DFL;
if (handler == HANDLER_EXIT)
action->sa.sa_flags |= SA_IMMUTABLE;
- if (blocked) {
+ if (blocked)
sigdelset(&t->blocked, sig);
- recalc_sigpending_and_wake(t);
- }
}
/*
* Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
@@ -1361,6 +1349,9 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
(!t->ptrace || (handler == HANDLER_EXIT)))
t->signal->flags &= ~SIGNAL_UNKILLABLE;
ret = send_signal_locked(sig, info, t, PIDTYPE_PID);
+ /* This can happen if the signal was already pending and blocked */
+ if (!task_sigpending(t))
+ signal_wake_up(t, 0);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
return ret;
@@ -1376,12 +1367,12 @@ int force_sig_info(struct kernel_siginfo *info)
*/
int zap_other_threads(struct task_struct *p)
{
- struct task_struct *t = p;
+ struct task_struct *t;
int count = 0;
p->signal->group_stop_count = 0;
- while_each_thread(p, t) {
+ for_other_threads(p, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
/* Don't require de_thread to wait for the vhost_worker */
if ((t->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)
@@ -2465,12 +2456,10 @@ static bool do_signal_stop(int signr)
sig->group_exit_code = signr;
sig->group_stop_count = 0;
-
if (task_set_jobctl_pending(current, signr | gstop))
sig->group_stop_count++;
- t = current;
- while_each_thread(current, t) {
+ for_other_threads(current, t) {
/*
* Setting state to TASK_STOPPED for a group
* stop is always done with the siglock held,
@@ -2966,8 +2955,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
if (sigisemptyset(&retarget))
return;
- t = tsk;
- while_each_thread(tsk, t) {
+ for_other_threads(tsk, t) {
if (t->flags & PF_EXITING)
continue;
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 4f65824879ab..afb3c116da91 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(stack_trace_save);
/**
* stack_trace_save_tsk - Save a task stack trace into a storage array
- * @task: The task to examine
+ * @tsk: The task to examine
* @store: Pointer to storage array
* @size: Size of the storage array
* @skipnr: Number of entries to skip at the start of the stack trace
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e1a6e3c675c0..faad00cce269 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -171,6 +171,9 @@ COND_SYSCALL(landlock_add_rule);
COND_SYSCALL(landlock_restrict_self);
COND_SYSCALL(fadvise64_64);
COND_SYSCALL_COMPAT(fadvise64_64);
+COND_SYSCALL(lsm_get_self_attr);
+COND_SYSCALL(lsm_set_self_attr);
+COND_SYSCALL(lsm_list_modules);
/* CONFIG_MMU only */
COND_SYSCALL(swapon);
@@ -201,6 +204,20 @@ COND_SYSCALL(recvmmsg_time32);
COND_SYSCALL_COMPAT(recvmmsg_time32);
COND_SYSCALL_COMPAT(recvmmsg_time64);
+/* POSIX timer syscalls may be configured out */
+COND_SYSCALL(timer_create);
+COND_SYSCALL(timer_gettime);
+COND_SYSCALL(timer_getoverrun);
+COND_SYSCALL(timer_settime);
+COND_SYSCALL(timer_delete);
+COND_SYSCALL(clock_adjtime);
+COND_SYSCALL(getitimer);
+COND_SYSCALL(setitimer);
+COND_SYSCALL(alarm);
+COND_SYSCALL_COMPAT(timer_create);
+COND_SYSCALL_COMPAT(getitimer);
+COND_SYSCALL_COMPAT(setitimer);
+
/*
* Architecture specific syscalls: see further below
*/
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 828aeecbd1e8..9b6fcb8d85e7 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -17,40 +17,6 @@
#include <linux/time_namespace.h>
#include <linux/compat.h>
-#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
-/* Architectures may override SYS_NI and COMPAT_SYS_NI */
-#include <asm/syscall_wrapper.h>
-#endif
-
-asmlinkage long sys_ni_posix_timers(void)
-{
- pr_err_once("process %d (%s) attempted a POSIX timer syscall "
- "while CONFIG_POSIX_TIMERS is not set\n",
- current->pid, current->comm);
- return -ENOSYS;
-}
-
-#ifndef SYS_NI
-#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers)
-#endif
-
-#ifndef COMPAT_SYS_NI
-#define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers)
-#endif
-
-SYS_NI(timer_create);
-SYS_NI(timer_gettime);
-SYS_NI(timer_getoverrun);
-SYS_NI(timer_settime);
-SYS_NI(timer_delete);
-SYS_NI(clock_adjtime);
-SYS_NI(getitimer);
-SYS_NI(setitimer);
-SYS_NI(clock_adjtime32);
-#ifdef __ARCH_WANT_SYS_ALARM
-SYS_NI(alarm);
-#endif
-
/*
* We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC
* as it is easy to remain compatible with little code. CLOCK_BOOTTIME
@@ -158,18 +124,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
which_clock);
}
-#ifdef CONFIG_COMPAT
-COMPAT_SYS_NI(timer_create);
-#endif
-
-#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA)
-COMPAT_SYS_NI(getitimer);
-COMPAT_SYS_NI(setitimer);
-#endif
-
#ifdef CONFIG_COMPAT_32BIT_TIME
-SYS_NI(timer_settime32);
-SYS_NI(timer_gettime32);
SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock,
struct old_timespec32 __user *, tp)
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 649f2b48e8f0..481b7ab65e2c 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -56,7 +56,6 @@ extern int clockevents_program_event(struct clock_event_device *dev,
ktime_t expires, bool force);
extern void clockevents_handle_noop(struct clock_event_device *dev);
extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
-extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
/* Broadcasting support */
# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
@@ -197,3 +196,5 @@ void hrtimers_resume_local(void);
#else
#define JIFFIES_SHIFT 8
#endif
+
+extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index be77b021e5d6..a17d26002831 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -839,6 +839,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
ts->next_timer = next_tick;
}
+ /* Make sure next_tick is never before basemono! */
+ if (WARN_ON_ONCE(basemono > next_tick))
+ next_tick = basemono;
+
/*
* If the tick is due in the next period, keep it ticking or
* force prod the timer.
@@ -887,7 +891,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
u64 basemono = ts->timer_expires_base;
u64 expires = ts->timer_expires;
- ktime_t tick = expires;
/* Make sure we won't be trying to stop it twice in a row. */
ts->timer_expires_base = 0;
@@ -910,7 +913,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
/* Skip reprogram of event if it's not changed */
if (ts->tick_stopped && (expires == ts->next_tick)) {
/* Sanity check: make sure clockevent is actually programmed */
- if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
+ if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
return;
WARN_ON_ONCE(1);
@@ -920,11 +923,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
}
/*
- * nohz_stop_sched_tick() can be called several times before
- * nohz_restart_sched_tick() is called. This happens when
- * interrupts arrive which do not cause a reschedule. In the
- * first call we save the current tick time, so we can restart
- * the scheduler tick in nohz_restart_sched_tick().
+ * tick_nohz_stop_tick() can be called several times before
+ * tick_nohz_restart_sched_tick() is called. This happens when
+ * interrupts arrive which do not cause a reschedule. In the first
+ * call we save the current tick time, so we can restart the
+ * scheduler tick in tick_nohz_restart_sched_tick().
*/
if (!ts->tick_stopped) {
calc_load_nohz_start();
@@ -935,7 +938,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
trace_tick_stop(1, TICK_DEP_MASK_NONE);
}
- ts->next_tick = tick;
+ ts->next_tick = expires;
/*
* If the expiration time == KTIME_MAX, then we simply stop
@@ -950,11 +953,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
}
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start(&ts->sched_timer, tick,
+ hrtimer_start(&ts->sched_timer, expires,
HRTIMER_MODE_ABS_PINNED_HARD);
} else {
- hrtimer_set_expires(&ts->sched_timer, tick);
- tick_program_event(tick, 1);
+ hrtimer_set_expires(&ts->sched_timer, expires);
+ tick_program_event(expires, 1);
}
}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 63a8ce7177dd..352b161113cd 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -571,18 +571,15 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk,
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
- if (!is_timers_nohz_active())
- return;
-
/*
- * TODO: This wants some optimizing similar to the code below, but we
- * will do that when we switch from push to pull for deferrable timers.
+ * Deferrable timers do not prevent the CPU from entering dynticks and
+ * are not taken into account on the idle/nohz_full path. An IPI when a
+ * new deferrable timer is enqueued will wake up the remote CPU but
+ * nothing will be done with the deferrable timer base. Therefore skip
+ * the remote IPI for deferrable timers completely.
*/
- if (timer->flags & TIMER_DEFERRABLE) {
- if (tick_nohz_full_cpu(base->cpu))
- wake_up_nohz_cpu(base->cpu);
+ if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE)
return;
- }
/*
* We might have to IPI the remote CPU if the base is idle and the
@@ -606,7 +603,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
__set_bit(idx, base->pending_map);
timer_set_idx(timer, idx);
- trace_timer_start(timer, timer->expires, timer->flags);
+ trace_timer_start(timer, bucket_expiry);
/*
* Check whether this is the new first expiring timer. The
@@ -942,31 +939,34 @@ get_target_base(struct timer_base *base, unsigned tflags)
return get_timer_this_cpu_base(tflags);
}
-static inline void forward_timer_base(struct timer_base *base)
+static inline void __forward_timer_base(struct timer_base *base,
+ unsigned long basej)
{
- unsigned long jnow = READ_ONCE(jiffies);
-
/*
- * No need to forward if we are close enough below jiffies.
- * Also while executing timers, base->clk is 1 offset ahead
- * of jiffies to avoid endless requeuing to current jiffies.
+ * Check whether we can forward the base. We can only do that when
+ * @basej is past base->clk otherwise we might rewind base->clk.
*/
- if ((long)(jnow - base->clk) < 1)
+ if (time_before_eq(basej, base->clk))
return;
/*
* If the next expiry value is > jiffies, then we fast forward to
* jiffies otherwise we forward to the next expiry value.
*/
- if (time_after(base->next_expiry, jnow)) {
- base->clk = jnow;
+ if (time_after(base->next_expiry, basej)) {
+ base->clk = basej;
} else {
if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
return;
base->clk = base->next_expiry;
}
}
+static inline void forward_timer_base(struct timer_base *base)
+{
+ __forward_timer_base(base, READ_ONCE(jiffies));
+}
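The rewind guard above leans on the wraparound-safe jiffies comparators. For reference, eliding the typecheck() guards in <linux/jiffies.h>, they reduce to a signed subtraction, which is why the check stays correct across a jiffies wrap:

	#define time_after_eq(a, b)	((long)((a) - (b)) >= 0)
	#define time_before_eq(a, b)	time_after_eq(b, a)

	/* time_before_eq(basej, base->clk) is therefore true exactly when
	 * basej has not yet advanced past base->clk. */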
/*
* We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
@@ -1803,8 +1803,10 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset,
/*
* Search the first expiring timer in the various clock levels. Caller must
* hold base->lock.
+ *
+ * Store next expiry time in base->next_expiry.
*/
-static unsigned long __next_timer_interrupt(struct timer_base *base)
+static void next_expiry_recalc(struct timer_base *base)
{
unsigned long clk, next, adj;
unsigned lvl, offset = 0;
@@ -1870,10 +1872,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
clk += adj;
}
+ base->next_expiry = next;
base->next_expiry_recalc = false;
base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
-
- return next;
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -1921,8 +1922,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ unsigned long nextevt = basej + NEXT_TIMER_MAX_DELTA;
u64 expires = KTIME_MAX;
- unsigned long nextevt;
+ bool was_idle;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -1933,37 +1935,44 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
raw_spin_lock(&base->lock);
if (base->next_expiry_recalc)
- base->next_expiry = __next_timer_interrupt(base);
- nextevt = base->next_expiry;
+ next_expiry_recalc(base);
/*
* We have a fresh next event. Check whether we can forward the
- * base. We can only do that when @basej is past base->clk
- * otherwise we might rewind base->clk.
+ * base.
*/
- if (time_after(basej, base->clk)) {
- if (time_after(nextevt, basej))
- base->clk = basej;
- else if (time_after(nextevt, base->clk))
- base->clk = nextevt;
- }
+ __forward_timer_base(base, basej);
- if (time_before_eq(nextevt, basej)) {
- expires = basem;
- base->is_idle = false;
+ if (base->timers_pending) {
+ nextevt = base->next_expiry;
+
+ /* If we missed a tick already, force 0 delta */
+ if (time_before(nextevt, basej))
+ nextevt = basej;
+ expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
} else {
- if (base->timers_pending)
- expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
- * If we expect to sleep more than a tick, mark the base idle.
- * Also the tick is stopped so any added timer must forward
- * the base clk itself to keep granularity small. This idle
- * logic is only maintained for the BASE_STD base, deferrable
- * timers may still see large granularity skew (by design).
+ * Move next_expiry for the empty base into the future to
+ * prevent an unnecessary raise of the timer softirq when the
+ * next_expiry value will be reached even if there is no timer
+ * pending.
*/
- if ((expires - basem) > TICK_NSEC)
- base->is_idle = true;
+ base->next_expiry = nextevt;
}
+
+ /*
+ * Base is idle if the next event is more than a tick away.
+ *
+ * If the base is marked idle then any timer add operation must forward
+ * the base clk itself to keep granularity small. This idle logic is
+ * only maintained for the BASE_STD base, deferrable timers may still
+ * see large granularity skew (by design).
+ */
+ was_idle = base->is_idle;
+ base->is_idle = time_after(nextevt, basej + 1);
+ if (was_idle != base->is_idle)
+ trace_timer_base_idle(base->is_idle, base->cpu);
+
raw_spin_unlock(&base->lock);
return cmp_next_hrtimer_event(basem, expires);
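A worked example of the new path, with illustrative numbers (assume HZ = 1000, so TICK_NSEC = 1000000):

	/*
	 * basej = 1000, basem = 10000000 ns, base->next_expiry = 1003:
	 *   expires = 10000000 + (1003 - 1000) * 1000000 = 13000000 ns
	 *   is_idle = time_after(1003, 1000 + 1) = true (over a tick away)
	 *
	 * If the first timer already expired (nextevt < basej), nextevt is
	 * clamped to basej and expires degenerates to basem, a zero delta.
	 */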
@@ -1984,7 +1993,10 @@ void timer_clear_idle(void)
* sending the IPI a few instructions smaller for the cost of taking
* the lock in the exit from idle path.
*/
- base->is_idle = false;
+ if (base->is_idle) {
+ base->is_idle = false;
+ trace_timer_base_idle(false, smp_processor_id());
+ }
}
#endif
@@ -2015,8 +2027,12 @@ static inline void __run_timers(struct timer_base *base)
*/
WARN_ON_ONCE(!levels && !base->next_expiry_recalc
&& base->timers_pending);
+ /*
+ * While executing timers, base->clk is set 1 offset ahead of
+ * jiffies to avoid endless requeuing to current jiffies.
+ */
base->clk++;
- base->next_expiry = __next_timer_interrupt(base);
+ next_expiry_recalc(base);
while (levels--)
expire_timers(base, heads + levels);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8de8bec5f366..b01ae7d36021 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1183,18 +1183,19 @@ static void __add_hash_entry(struct ftrace_hash *hash,
hash->count++;
}
-static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+static struct ftrace_func_entry *
+add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
{
struct ftrace_func_entry *entry;
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
- return -ENOMEM;
+ return NULL;
entry->ip = ip;
__add_hash_entry(hash, entry);
- return 0;
+ return entry;
}
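Returning the entry instead of 0/-ENOMEM lets callers detect failure and initialize per-entry state in one step. A minimal sketch of the new calling convention, mirroring the direct-call hunks further below:

	entry = add_hash_entry(hash, ip);
	if (!entry)
		return -ENOMEM;
	entry->direct = addr;	/* set extra state only on success */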
static void
@@ -1349,7 +1350,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
struct ftrace_func_entry *entry;
struct ftrace_hash *new_hash;
int size;
- int ret;
int i;
new_hash = alloc_ftrace_hash(size_bits);
@@ -1366,8 +1366,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
- ret = add_hash_entry(new_hash, entry->ip);
- if (ret < 0)
+ if (add_hash_entry(new_hash, entry->ip) == NULL)
goto free_hash;
}
}
@@ -2536,7 +2535,7 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec)
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
/* Protected by rcu_tasks for reading, and direct_mutex for writing */
-static struct ftrace_hash *direct_functions = EMPTY_HASH;
+static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH;
static DEFINE_MUTEX(direct_mutex);
int ftrace_direct_func_count;
@@ -2555,39 +2554,6 @@ unsigned long ftrace_find_rec_direct(unsigned long ip)
return entry->direct;
}
-static struct ftrace_func_entry*
-ftrace_add_rec_direct(unsigned long ip, unsigned long addr,
- struct ftrace_hash **free_hash)
-{
- struct ftrace_func_entry *entry;
-
- if (ftrace_hash_empty(direct_functions) ||
- direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
- struct ftrace_hash *new_hash;
- int size = ftrace_hash_empty(direct_functions) ? 0 :
- direct_functions->count + 1;
-
- if (size < 32)
- size = 32;
-
- new_hash = dup_hash(direct_functions, size);
- if (!new_hash)
- return NULL;
-
- *free_hash = direct_functions;
- direct_functions = new_hash;
- }
-
- entry = kmalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
- return NULL;
-
- entry->ip = ip;
- entry->direct = addr;
- __add_hash_entry(direct_functions, entry);
- return entry;
-}
-
static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
@@ -4223,8 +4189,8 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter)
/* Do nothing if it exists */
if (entry)
return 0;
-
- ret = add_hash_entry(hash, rec->ip);
+ if (add_hash_entry(hash, rec->ip) == NULL)
+ ret = -ENOMEM;
}
return ret;
}
@@ -5266,7 +5232,8 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
return 0;
}
- return add_hash_entry(hash, ip);
+ entry = add_hash_entry(hash, ip);
+ return entry ? 0 : -ENOMEM;
}
static int
@@ -5410,7 +5377,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
*/
int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
- struct ftrace_hash *hash, *free_hash = NULL;
+ struct ftrace_hash *hash, *new_hash = NULL, *free_hash = NULL;
struct ftrace_func_entry *entry, *new;
int err = -EBUSY, size, i;
@@ -5436,17 +5403,44 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
}
}
- /* ... and insert them to direct_functions hash. */
err = -ENOMEM;
+
+ /* Make a copy of the hash to place the new and the old entries in */
+ size = hash->count + direct_functions->count;
+ if (size > 32)
+ size = 32;
+ new_hash = alloc_ftrace_hash(fls(size));
+ if (!new_hash)
+ goto out_unlock;
+
+ /* Now copy over the existing direct entries */
+ size = 1 << direct_functions->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &direct_functions->buckets[i], hlist) {
+ new = add_hash_entry(new_hash, entry->ip);
+ if (!new)
+ goto out_unlock;
+ new->direct = entry->direct;
+ }
+ }
+
+ /* ... and add the new entries */
+ size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
- new = ftrace_add_rec_direct(entry->ip, addr, &free_hash);
+ new = add_hash_entry(new_hash, entry->ip);
if (!new)
- goto out_remove;
+ goto out_unlock;
+ /* Update both the copy and the hash entry */
+ new->direct = addr;
entry->direct = addr;
}
}
+ free_hash = direct_functions;
+ rcu_assign_pointer(direct_functions, new_hash);
+ new_hash = NULL;
+
ops->func = call_direct_funcs;
ops->flags = MULTI_FLAGS;
ops->trampoline = FTRACE_REGS_ADDR;
@@ -5454,17 +5448,17 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
err = register_ftrace_function_nolock(ops);
- out_remove:
- if (err)
- remove_direct_functions_hash(hash, addr);
-
out_unlock:
mutex_unlock(&direct_mutex);
- if (free_hash) {
+ if (free_hash && free_hash != EMPTY_HASH) {
synchronize_rcu_tasks();
free_ftrace_hash(free_hash);
}
+
+ if (new_hash)
+ free_ftrace_hash(new_hash);
+
return err;
}
EXPORT_SYMBOL_GPL(register_ftrace_direct);
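The rewrite follows a copy-and-swap discipline for the RCU-protected hash: build a private copy, publish it atomically, wait out readers, then free the old one. A sketch of that shape with illustrative names (not the patch's code):

	static void swap_hash(struct ftrace_hash __rcu **shared,
			      struct ftrace_hash *new_hash)
	{
		struct ftrace_hash *old = rcu_dereference_protected(*shared, 1);

		rcu_assign_pointer(*shared, new_hash);	/* publish the copy     */
		synchronize_rcu_tasks();		/* wait out old readers */
		if (old != EMPTY_HASH)
			free_ftrace_hash(old);		/* now safe to reclaim  */
	}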
@@ -6309,7 +6303,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
if (entry)
continue;
- if (add_hash_entry(hash, rec->ip) < 0)
+ if (add_hash_entry(hash, rec->ip) == NULL)
goto out;
} else {
if (entry) {
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index 6fd7d4ecbbc6..fa03094e9e69 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -48,7 +48,7 @@ static void rethook_free_rcu(struct rcu_head *head)
*/
void rethook_stop(struct rethook *rh)
{
- WRITE_ONCE(rh->handler, NULL);
+ rcu_assign_pointer(rh->handler, NULL);
}
/**
@@ -63,7 +63,7 @@ void rethook_stop(struct rethook *rh)
*/
void rethook_free(struct rethook *rh)
{
- WRITE_ONCE(rh->handler, NULL);
+ rethook_stop(rh);
call_rcu(&rh->rcu, rethook_free_rcu);
}
@@ -82,6 +82,12 @@ static int rethook_fini_pool(struct objpool_head *head, void *context)
return 0;
}
+static inline rethook_handler_t rethook_get_handler(struct rethook *rh)
+{
+ return (rethook_handler_t)rcu_dereference_check(rh->handler,
+ rcu_read_lock_any_held());
+}
+
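With the handler published via rcu_assign_pointer() and fetched via rcu_dereference_check(), the read side pairs with it roughly as below (a sketch; the patched callers already run in RCU-protected context):

	rcu_read_lock();
	handler = rethook_get_handler(rh);
	if (handler)
		handler(node, rh->data, correct_ret_addr, regs);
	rcu_read_unlock();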
/**
* rethook_alloc() - Allocate struct rethook.
* @data: a data to pass the @handler when hooking the return.
@@ -107,7 +113,7 @@ struct rethook *rethook_alloc(void *data, rethook_handler_t handler,
return ERR_PTR(-ENOMEM);
rh->data = data;
- rh->handler = handler;
+ rcu_assign_pointer(rh->handler, handler);
/* initialize the objpool for rethook nodes */
if (objpool_init(&rh->pool, num, size, GFP_KERNEL, rh,
@@ -135,9 +141,10 @@ static void free_rethook_node_rcu(struct rcu_head *head)
*/
void rethook_recycle(struct rethook_node *node)
{
- lockdep_assert_preemption_disabled();
+ rethook_handler_t handler;
- if (likely(READ_ONCE(node->rethook->handler)))
+ handler = rethook_get_handler(node->rethook);
+ if (likely(handler))
objpool_push(node, &node->rethook->pool);
else
call_rcu(&node->rcu, free_rethook_node_rcu);
@@ -153,9 +160,7 @@ NOKPROBE_SYMBOL(rethook_recycle);
*/
struct rethook_node *rethook_try_get(struct rethook *rh)
{
- rethook_handler_t handler = READ_ONCE(rh->handler);
-
- lockdep_assert_preemption_disabled();
+ rethook_handler_t handler = rethook_get_handler(rh);
/* Check whether @rh is going to be freed. */
if (unlikely(!handler))
@@ -300,7 +305,7 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs,
rhn = container_of(first, struct rethook_node, llist);
if (WARN_ON_ONCE(rhn->frame != frame))
break;
- handler = READ_ONCE(rhn->rethook->handler);
+ handler = rethook_get_handler(rhn->rethook);
if (handler)
handler(rhn, rhn->rethook->data,
correct_ret_addr, regs);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 43cc47d7faaf..9286f88fcd32 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -644,8 +644,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
*cnt = rb_time_cnt(top);
- /* If top and bottom counts don't match, this interrupted a write */
- if (*cnt != rb_time_cnt(bottom))
+ /* If top, msb or bottom counts don't match, this interrupted a write */
+ if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
return false;
/* The shift to msb will lose its cnt bits */
@@ -700,44 +700,6 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
return local_try_cmpxchg(l, &expect, set);
}
-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
-{
- unsigned long cnt, top, bottom, msb;
- unsigned long cnt2, top2, bottom2, msb2;
- u64 val;
-
- /* The cmpxchg always fails if it interrupted an update */
- if (!__rb_time_read(t, &val, &cnt2))
- return false;
-
- if (val != expect)
- return false;
-
- cnt = local_read(&t->cnt);
- if ((cnt & 3) != cnt2)
- return false;
-
- cnt2 = cnt + 1;
-
- rb_time_split(val, &top, &bottom, &msb);
- top = rb_time_val_cnt(top, cnt);
- bottom = rb_time_val_cnt(bottom, cnt);
-
- rb_time_split(set, &top2, &bottom2, &msb2);
- top2 = rb_time_val_cnt(top2, cnt2);
- bottom2 = rb_time_val_cnt(bottom2, cnt2);
-
- if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
- return false;
- if (!rb_time_read_cmpxchg(&t->msb, msb, msb2))
- return false;
- if (!rb_time_read_cmpxchg(&t->top, top, top2))
- return false;
- if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
- return false;
- return true;
-}
-
#else /* 64 bits */
/* local64_t always succeeds */
@@ -751,11 +713,6 @@ static void rb_time_set(rb_time_t *t, u64 val)
{
local64_set(&t->time, val);
}
-
-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
-{
- return local64_try_cmpxchg(&t->time, &expect, set);
-}
#endif
/*
@@ -924,9 +881,14 @@ static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int f
if (!nr_pages || !full)
return true;
- dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+ /*
+ * Add one as dirty will never equal nr_pages, as the sub-buffer
+ * that the writer is on is not counted as dirty.
+ * This is needed if "buffer_percent" is set to 100.
+ */
+ dirty = ring_buffer_nr_dirty_pages(buffer, cpu) + 1;
- return (dirty * 100) > (full * nr_pages);
+ return (dirty * 100) >= (full * nr_pages);
}
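The off-by-one matters precisely when buffer_percent is 100. With illustrative numbers:

	/*
	 * nr_pages = 8, full = 100; the writer's sub-buffer never counts as
	 * dirty, so dirty <= 7:
	 *   old:  7 * 100      >  100 * 8   -> never true, waiters never wake
	 *   new: (7 + 1) * 100 >= 100 * 8   -> true once the rest are dirty
	 */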
/*
@@ -987,7 +949,8 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
/* make sure the waiters see the new index */
smp_wmb();
- rb_wake_up_waiters(&rbwork->work);
+ /* This can be called in any context */
+ irq_work_queue(&rbwork->work);
}
/**
@@ -1787,6 +1750,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
free_buffer_page(bpage);
}
+ free_page((unsigned long)cpu_buffer->free_page);
+
kfree(cpu_buffer);
}
@@ -2407,7 +2372,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
*/
barrier();
- if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE)
+ if ((iter->head + length) > commit || length > BUF_PAGE_SIZE)
/* Writer corrupted the read? */
goto reset;
@@ -2981,25 +2946,6 @@ static unsigned rb_calculate_event_length(unsigned length)
return length;
}
-static u64 rb_time_delta(struct ring_buffer_event *event)
-{
- switch (event->type_len) {
- case RINGBUF_TYPE_PADDING:
- return 0;
-
- case RINGBUF_TYPE_TIME_EXTEND:
- return rb_event_time_stamp(event);
-
- case RINGBUF_TYPE_TIME_STAMP:
- return 0;
-
- case RINGBUF_TYPE_DATA:
- return event->time_delta;
- default:
- return 0;
- }
-}
-
static inline bool
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
@@ -3007,8 +2953,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long new_index, old_index;
struct buffer_page *bpage;
unsigned long addr;
- u64 write_stamp;
- u64 delta;
new_index = rb_event_index(event);
old_index = new_index + rb_event_ts_length(event);
@@ -3017,41 +2961,34 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
bpage = READ_ONCE(cpu_buffer->tail_page);
- delta = rb_time_delta(event);
-
- if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
- return false;
-
- /* Make sure the write stamp is read before testing the location */
- barrier();
-
+ /*
+ * Make sure the tail_page is still the same and
+ * the next write location is the end of this event
+ */
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
unsigned long write_mask =
local_read(&bpage->write) & ~RB_WRITE_MASK;
unsigned long event_length = rb_event_length(event);
- /* Something came in, can't discard */
- if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
- write_stamp, write_stamp - delta))
- return false;
-
/*
- * It's possible that the event time delta is zero
- * (has the same time stamp as the previous event)
- * in which case write_stamp and before_stamp could
- * be the same. In such a case, force before_stamp
- * to be different than write_stamp. It doesn't
- * matter what it is, as long as its different.
+ * Make the before_stamp different than the write_stamp so
+ * that the next event adds an absolute value and does not
+ * rely on the saved write stamp, which is now going to be
+ * bogus.
+ *
+ * By setting the before_stamp to zero, the next event
+ * is not going to use the write_stamp and will instead
+ * create an absolute timestamp. This means there's no
+ * reason to update the write_stamp!
*/
- if (!delta)
- rb_time_set(&cpu_buffer->before_stamp, 0);
+ rb_time_set(&cpu_buffer->before_stamp, 0);
/*
* If an event were to come in now, it would see that the
* write_stamp and the before_stamp are different, and assume
* that this event just added itself before updating
* the write stamp. The interrupting event will fix the
- * write stamp for us, and use the before stamp as its delta.
+ * write stamp for us, and use an absolute timestamp.
*/
/*
@@ -3488,7 +3425,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
return;
/*
- * If this interrupted another event,
+ * If this interrupted another event,
*/
if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
goto out;
@@ -3582,7 +3519,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
* absolute timestamp.
* Don't bother if this is the start of a new page (w == 0).
*/
- if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) {
+ if (!w) {
+ /* Use the sub-buffer timestamp */
+ info->delta = 0;
+ } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) {
info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
info->length += RB_LEN_TIME_EXTEND;
} else {
@@ -3605,26 +3545,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
/* See if we shot pass the end of this buffer page */
if (unlikely(write > BUF_PAGE_SIZE)) {
- /* before and after may now different, fix it up*/
- b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
- if (a_ok && b_ok && info->before != info->after)
- (void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
- info->before, info->after);
- if (a_ok && b_ok)
- check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
+ check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
return rb_move_tail(cpu_buffer, tail, info);
}
if (likely(tail == w)) {
- u64 save_before;
- bool s_ok;
-
/* Nothing interrupted us between A and C */
/*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts);
- barrier();
- /*E*/ s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before);
- RB_WARN_ON(cpu_buffer, !s_ok);
+ /*
+ * If something came in between C and D, the write stamp
+ * may now not be in sync. But that's fine as the before_stamp
+ * will be different and then the next event will just be forced
+ * to use an absolute timestamp.
+ */
if (likely(!(info->add_timestamp &
(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
/* This did not interrupt any time update */
@@ -3632,41 +3565,40 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
else
/* Just use full timestamp for interrupting event */
info->delta = info->ts;
- barrier();
check_buffer(cpu_buffer, info, tail);
- if (unlikely(info->ts != save_before)) {
- /* SLOW PATH - Interrupted between C and E */
-
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
- RB_WARN_ON(cpu_buffer, !a_ok);
-
- /* Write stamp must only go forward */
- if (save_before > info->after) {
- /*
- * We do not care about the result, only that
- * it gets updated atomically.
- */
- (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
- info->after, save_before);
- }
- }
} else {
u64 ts;
/* SLOW PATH - Interrupted between A and C */
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
- /* Was interrupted before here, write_stamp must be valid */
+
+ /* Save the old before_stamp */
+ a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
RB_WARN_ON(cpu_buffer, !a_ok);
+
+ /*
+ * Read a new timestamp and update the before_stamp to force
+ * the next event after this one to use an absolute timestamp.
+ * This is in case an interrupt were to come in between E and F.
+ */
ts = rb_time_stamp(cpu_buffer->buffer);
+ rb_time_set(&cpu_buffer->before_stamp, ts);
+
barrier();
- /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
- info->after < ts &&
- rb_time_cmpxchg(&cpu_buffer->write_stamp,
- info->after, ts)) {
- /* Nothing came after this event between C and E */
+ /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ /* Was interrupted before here, write_stamp must be valid */
+ RB_WARN_ON(cpu_buffer, !a_ok);
+ barrier();
+ /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+ info->after == info->before && info->after < ts) {
+ /*
+ * Nothing came after this event between C and F; it is
+ * safe to use info->after for the delta, as it
+ * matched info->before and is still valid.
+ */
info->delta = ts - info->after;
} else {
/*
- * Interrupted between C and E:
+ * Interrupted between C and F:
* Lost the previous events time stamp. Just set the
* delta to zero, and this will be the same time as
* the event this event interrupted. And the events that
@@ -3717,6 +3649,12 @@ rb_reserve_next_event(struct trace_buffer *buffer,
int nr_loops = 0;
int add_ts_default;
+ /* ring buffer does cmpxchg, make sure it is safe in NMI context */
+ if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
+ (unlikely(in_nmi()))) {
+ return NULL;
+ }
+
rb_start_commit(cpu_buffer);
/* The commit page can not change after this */
@@ -3740,6 +3678,8 @@ rb_reserve_next_event(struct trace_buffer *buffer,
if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
add_ts_default = RB_ADD_STAMP_ABSOLUTE;
info.length += RB_LEN_TIME_EXTEND;
+ if (info.length > BUF_MAX_DATA_SIZE)
+ goto out_fail;
} else {
add_ts_default = RB_ADD_STAMP_NONE;
}
@@ -5121,7 +5061,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
if (!iter)
return NULL;
- iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags);
+ /* Holds the entire event: data and meta data */
+ iter->event = kmalloc(BUF_PAGE_SIZE, flags);
if (!iter->event) {
kfree(iter);
return NULL;
diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index 8dfe85499d4a..354c2117be43 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -477,6 +477,17 @@ static int __init synth_event_gen_test_init(void)
ret = test_trace_synth_event();
WARN_ON(ret);
+
+ /* Disable when done */
+ trace_array_set_clr_event(gen_synth_test->tr,
+ "synthetic",
+ "gen_synth_test", false);
+ trace_array_set_clr_event(empty_synth_test->tr,
+ "synthetic",
+ "empty_synth_test", false);
+ trace_array_set_clr_event(create_synth_test->tr,
+ "synthetic",
+ "create_synth_test", false);
out:
return ret;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9aebf904ff97..a0defe156b57 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1894,6 +1894,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
__update_max_tr(tr, tsk, cpu);
arch_spin_unlock(&tr->max_lock);
+
+ /* Any waiters on the old snapshot buffer need to wake up */
+ ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS);
}
/**
@@ -1945,12 +1948,23 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
static int wait_on_pipe(struct trace_iterator *iter, int full)
{
+ int ret;
+
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
return 0;
- return ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file,
- full);
+ ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ /*
+ * Make sure this is still the snapshot buffer, as if a snapshot were
+ * to happen, this would now be the main buffer.
+ */
+ if (iter->snapshot)
+ iter->array_buffer = &iter->tr->max_buffer;
+#endif
+ return ret;
}
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -2360,13 +2374,7 @@ int is_tracing_stopped(void)
return global_trace.stop_count;
}
-/**
- * tracing_start - quick start of the tracer
- *
- * If tracing is enabled but was stopped by tracing_stop,
- * this will start the tracer back up.
- */
-void tracing_start(void)
+static void tracing_start_tr(struct trace_array *tr)
{
struct trace_buffer *buffer;
unsigned long flags;
@@ -2374,119 +2382,83 @@ void tracing_start(void)
if (tracing_disabled)
return;
- raw_spin_lock_irqsave(&global_trace.start_lock, flags);
- if (--global_trace.stop_count) {
- if (global_trace.stop_count < 0) {
+ raw_spin_lock_irqsave(&tr->start_lock, flags);
+ if (--tr->stop_count) {
+ if (WARN_ON_ONCE(tr->stop_count < 0)) {
/* Someone screwed up their debugging */
- WARN_ON_ONCE(1);
- global_trace.stop_count = 0;
+ tr->stop_count = 0;
}
goto out;
}
/* Prevent the buffers from switching */
- arch_spin_lock(&global_trace.max_lock);
+ arch_spin_lock(&tr->max_lock);
- buffer = global_trace.array_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
- buffer = global_trace.max_buffer.buffer;
+ buffer = tr->max_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
#endif
- arch_spin_unlock(&global_trace.max_lock);
-
- out:
- raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
-}
-
-static void tracing_start_tr(struct trace_array *tr)
-{
- struct trace_buffer *buffer;
- unsigned long flags;
-
- if (tracing_disabled)
- return;
-
- /* If global, we need to also start the max tracer */
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
- return tracing_start();
-
- raw_spin_lock_irqsave(&tr->start_lock, flags);
-
- if (--tr->stop_count) {
- if (tr->stop_count < 0) {
- /* Someone screwed up their debugging */
- WARN_ON_ONCE(1);
- tr->stop_count = 0;
- }
- goto out;
- }
-
- buffer = tr->array_buffer.buffer;
- if (buffer)
- ring_buffer_record_enable(buffer);
+ arch_spin_unlock(&tr->max_lock);
out:
raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
/**
- * tracing_stop - quick stop of the tracer
+ * tracing_start - quick start of the tracer
*
- * Light weight way to stop tracing. Use in conjunction with
- * tracing_start.
+ * If tracing is enabled but was stopped by tracing_stop,
+ * this will start the tracer back up.
*/
-void tracing_stop(void)
+void tracing_start(void)
+{
+ return tracing_start_tr(&global_trace);
+}
+
+static void tracing_stop_tr(struct trace_array *tr)
{
struct trace_buffer *buffer;
unsigned long flags;
- raw_spin_lock_irqsave(&global_trace.start_lock, flags);
- if (global_trace.stop_count++)
+ raw_spin_lock_irqsave(&tr->start_lock, flags);
+ if (tr->stop_count++)
goto out;
/* Prevent the buffers from switching */
- arch_spin_lock(&global_trace.max_lock);
+ arch_spin_lock(&tr->max_lock);
- buffer = global_trace.array_buffer.buffer;
+ buffer = tr->array_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
- buffer = global_trace.max_buffer.buffer;
+ buffer = tr->max_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
#endif
- arch_spin_unlock(&global_trace.max_lock);
+ arch_spin_unlock(&tr->max_lock);
out:
- raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
+ raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
-static void tracing_stop_tr(struct trace_array *tr)
+/**
+ * tracing_stop - quick stop of the tracer
+ *
+ * Light weight way to stop tracing. Use in conjunction with
+ * tracing_start.
+ */
+void tracing_stop(void)
{
- struct trace_buffer *buffer;
- unsigned long flags;
-
- /* If global, we need to also stop the max tracer */
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
- return tracing_stop();
-
- raw_spin_lock_irqsave(&tr->start_lock, flags);
- if (tr->stop_count++)
- goto out;
-
- buffer = tr->array_buffer.buffer;
- if (buffer)
- ring_buffer_record_disable(buffer);
-
- out:
- raw_spin_unlock_irqrestore(&tr->start_lock, flags);
+ return tracing_stop_tr(&global_trace);
}
static int trace_save_cmdline(struct task_struct *tsk)
@@ -2770,8 +2742,11 @@ void trace_buffered_event_enable(void)
for_each_tracing_cpu(cpu) {
page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
- if (!page)
- goto failed;
+ /* This is just an optimization and can handle failures */
+ if (!page) {
+ pr_err("Failed to allocate event buffer\n");
+ break;
+ }
event = page_address(page);
memset(event, 0, sizeof(*event));
@@ -2785,10 +2760,6 @@ void trace_buffered_event_enable(void)
WARN_ON_ONCE(1);
preempt_enable();
}
-
- return;
- failed:
- trace_buffered_event_disable();
}
static void enable_trace_buffered_event(void *data)
@@ -2823,11 +2794,9 @@ void trace_buffered_event_disable(void)
if (--trace_buffered_event_ref)
return;
- preempt_disable();
/* For each CPU, set the buffer as used. */
- smp_call_function_many(tracing_buffer_mask,
- disable_trace_buffered_event, NULL, 1);
- preempt_enable();
+ on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event,
+ NULL, true);
/* Wait for all current users to finish */
synchronize_rcu();
@@ -2836,17 +2805,19 @@ void trace_buffered_event_disable(void)
free_page((unsigned long)per_cpu(trace_buffered_event, cpu));
per_cpu(trace_buffered_event, cpu) = NULL;
}
+
/*
- * Make sure trace_buffered_event is NULL before clearing
- * trace_buffered_event_cnt.
+ * Wait for all CPUs that may have started checking whether they can
+ * use their event buffer only after the previous synchronize_rcu()
+ * call and still read a valid pointer from trace_buffered_event. They
+ * must not see the cleared trace_buffered_event_cnt, else they could
+ * wrongly decide to use the pointed-to buffer which is now freed.
*/
- smp_wmb();
+ synchronize_rcu();
- preempt_disable();
- /* Do the work on each cpu */
- smp_call_function_many(tracing_buffer_mask,
- enable_trace_buffered_event, NULL, 1);
- preempt_enable();
+ /* For each CPU, relinquish the buffer */
+ on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL,
+ true);
}
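The switch to on_each_cpu_mask() also closes a gap: smp_call_function_many() only targets remote CPUs. A rough sketch of the semantics now relied upon (not the SMP core's actual code):

	preempt_disable();
	smp_call_function_many(mask, fn, info, 1);	/* remote CPUs only  */
	if (cpumask_test_cpu(smp_processor_id(), mask))
		fn(info);				/* the local CPU too */
	preempt_enable();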
static struct trace_buffer *temp_buffer;
@@ -4765,7 +4736,11 @@ static int s_show(struct seq_file *m, void *v)
iter->leftover = ret;
} else {
- print_trace_line(iter);
+ ret = print_trace_line(iter);
+ if (ret == TRACE_TYPE_PARTIAL_LINE) {
+ iter->seq.full = 0;
+ trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
+ }
ret = trace_print_seq(m, &iter->seq);
/*
* If we overflow the seq_file buffer, then it will
@@ -5007,6 +4982,12 @@ int tracing_release_file_tr(struct inode *inode, struct file *filp)
return 0;
}
+int tracing_single_release_file_tr(struct inode *inode, struct file *filp)
+{
+ tracing_release_file_tr(inode, filp);
+ return single_release(inode, filp);
+}
+
static int tracing_mark_open(struct inode *inode, struct file *filp)
{
stream_open(inode, filp);
@@ -6387,13 +6368,15 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
if (!tr->array_buffer.buffer)
return 0;
+ /* Do not allow tracing while resizing ring buffer */
+ tracing_stop_tr(tr);
+
ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu);
if (ret < 0)
- return ret;
+ goto out_start;
#ifdef CONFIG_TRACER_MAX_TRACE
- if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) ||
- !tr->current_trace->use_max_tr)
+ if (!tr->allocated_snapshot)
goto out;
ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
@@ -6418,7 +6401,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
WARN_ON(1);
tracing_disabled = 1;
}
- return ret;
+ goto out_start;
}
update_buffer_entries(&tr->max_buffer, cpu);
@@ -6427,7 +6410,8 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
#endif /* CONFIG_TRACER_MAX_TRACE */
update_buffer_entries(&tr->array_buffer, cpu);
-
+ out_start:
+ tracing_start_tr(tr);
return ret;
}
@@ -8547,7 +8531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
wait_index = READ_ONCE(iter->wait_index);
- ret = wait_on_pipe(iter, iter->tr->buffer_percent);
+ ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent);
if (ret)
goto out;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b7f4ea25a194..0489e72c8169 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -617,6 +617,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
int tracing_open_file_tr(struct inode *inode, struct file *filp);
int tracing_release_file_tr(struct inode *inode, struct file *filp);
+int tracing_single_release_file_tr(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
bool tracer_tracing_is_on(struct trace_array *tr);
void tracer_tracing_on(struct trace_array *tr);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1abc07fba1b9..5ecf3c8bde20 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5623,10 +5623,12 @@ static int event_hist_open(struct inode *inode, struct file *file)
{
int ret;
- ret = security_locked_down(LOCKDOWN_TRACEFS);
+ ret = tracing_open_file_tr(inode, file);
if (ret)
return ret;
+ /* Clear private_data to avoid warning in single_open() */
+ file->private_data = NULL;
return single_open(file, hist_show, file);
}
@@ -5634,7 +5636,7 @@ const struct file_operations event_hist_fops = {
.open = event_hist_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = tracing_single_release_file_tr,
};
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
@@ -5900,10 +5902,12 @@ static int event_hist_debug_open(struct inode *inode, struct file *file)
{
int ret;
- ret = security_locked_down(LOCKDOWN_TRACEFS);
+ ret = tracing_open_file_tr(inode, file);
if (ret)
return ret;
+ /* Clear private_data to avoid warning in single_open() */
+ file->private_data = NULL;
return single_open(file, hist_debug_show, file);
}
@@ -5911,7 +5915,7 @@ const struct file_operations event_hist_debug_fops = {
.open = event_hist_debug_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = tracing_single_release_file_tr,
};
#endif
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 846e02c0fb59..e7af286af4f1 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -1137,7 +1137,7 @@ EXPORT_SYMBOL_GPL(synth_event_add_fields);
* @cmd: A pointer to the dynevent_cmd struct representing the new event
* @name: The name of the synthetic event
* @mod: The module creating the event, NULL if not created from a module
- * @args: Variable number of arg (pairs), one pair for each field
+ * @...: Variable number of arg (pairs), one pair for each field
*
* NOTE: Users normally won't want to call this function directly, but
* rather use the synth_event_gen_cmd_start() wrapper, which
@@ -1695,7 +1695,7 @@ __synth_event_trace_end(struct synth_event_trace_state *trace_state)
* synth_event_trace - Trace a synthetic event
* @file: The trace_event_file representing the synthetic event
* @n_vals: The number of values in vals
- * @args: Variable number of args containing the event values
+ * @...: Variable number of args containing the event values
*
* Trace a synthetic event using the values passed in the variable
* argument list.
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 9365ce407426..e76f5e1efdf2 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -2177,14 +2177,12 @@ static int user_events_open(struct inode *node, struct file *file)
static ssize_t user_events_write(struct file *file, const char __user *ubuf,
size_t count, loff_t *ppos)
{
- struct iovec iov;
struct iov_iter i;
if (unlikely(*ppos != 0))
return -EFAULT;
- if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf,
- count, &iov, &i)))
+ if (unlikely(import_ubuf(ITER_SOURCE, (char __user *)ubuf, count, &i)))
return -EFAULT;
return user_events_write_core(file, &i);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d8b302d01083..3e7fa44dc2b2 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1587,11 +1587,12 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
+ int max = iter->ent_size - offsetof(struct print_entry, buf);
trace_assign_type(field, iter->ent);
seq_print_ip_sym(s, field->ip, flags);
- trace_seq_printf(s, ": %s", field->buf);
+ trace_seq_printf(s, ": %.*s", max, field->buf);
return trace_handle_return(s);
}
@@ -1600,10 +1601,11 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
struct print_entry *field;
+ int max = iter->ent_size - offsetof(struct print_entry, buf);
trace_assign_type(field, iter->ent);
- trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
+ trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf);
return trace_handle_return(&iter->seq);
}
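Both hunks bound the read with a printf precision, which caps how many bytes %s may consume; the array then need not be NUL-terminated (standard C behavior). A userspace illustration:

	#include <stdio.h>

	int main(void)
	{
		char buf[4] = { 't', 'e', 's', 't' };	/* no trailing NUL     */
		printf("%.*s\n", 4, buf);		/* safe, prints "test" */
		return 0;
	}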
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index eabe8bcc7042..ce4d99df5f0e 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -231,7 +231,7 @@ void __put_user_ns(struct user_namespace *ns)
}
EXPORT_SYMBOL(__put_user_ns);
-/**
+/*
* struct idmap_key - holds the information necessary to find an idmapping in a
* sorted idmap array. It is passed to cmp_map_id() as first argument.
*/
@@ -241,7 +241,7 @@ struct idmap_key {
u32 count; /* == 0 unless used with map_id_range_down() */
};
-/**
+/*
* cmp_map_id - Function to be passed to bsearch() to find the requested
* idmapping. Expects struct idmap_key to be passed via @k.
*/
@@ -271,7 +271,7 @@ static int cmp_map_id(const void *k, const void *e)
return 1;
}
-/**
+/*
* map_id_range_down_max - Find idmap via binary search in ordered idmap array.
* Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
*/
@@ -288,7 +288,7 @@ map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 cou
sizeof(struct uid_gid_extent), cmp_map_id);
}
-/**
+/*
* map_id_range_down_base - Find idmap via binary search in static extent array.
* Can only be called if number of mappings is equal or less than
* UID_GID_MAP_MAX_BASE_EXTENTS.
@@ -332,12 +332,12 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
return id;
}
-static u32 map_id_down(struct uid_gid_map *map, u32 id)
+u32 map_id_down(struct uid_gid_map *map, u32 id)
{
return map_id_range_down(map, id, 1);
}
-/**
+/*
* map_id_up_base - Find idmap via binary search in static extent array.
* Can only be called if number of mappings is equal or less than
* UID_GID_MAP_MAX_BASE_EXTENTS.
@@ -358,7 +358,7 @@ map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
return NULL;
}
-/**
+/*
* map_id_up_max - Find idmap via binary search in ordered idmap array.
* Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
*/
@@ -375,7 +375,7 @@ map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
sizeof(struct uid_gid_extent), cmp_map_id);
}
-static u32 map_id_up(struct uid_gid_map *map, u32 id)
+u32 map_id_up(struct uid_gid_map *map, u32 id)
{
struct uid_gid_extent *extent;
unsigned extents = map->nr_extents;
@@ -770,7 +770,7 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
return false;
}
-/**
+/*
* insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
* Takes care to allocate a 4K block of memory if the number of mappings exceeds
* UID_GID_MAP_MAX_BASE_EXTENTS.
@@ -839,7 +839,7 @@ static int cmp_extents_reverse(const void *a, const void *b)
return 0;
}
-/**
+/*
* sort_idmaps - Sorts an array of idmap entries.
* Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
*/
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 778b4056700f..03b90d7d2175 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -270,7 +270,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
goto error;
ret = -ENOMEM;
- pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL);
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
goto error;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5cd6d4e26915..81a8862295d6 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -91,7 +91,7 @@ static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned);
static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched);
-static unsigned long watchdog_hardlockup_all_cpu_dumped;
+static unsigned long hard_lockup_nmi_warn;
notrace void arch_touch_nmi_watchdog(void)
{
@@ -151,12 +151,32 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
*/
if (is_hardlockup(cpu)) {
unsigned int this_cpu = smp_processor_id();
+ unsigned long flags;
/* Only print hardlockups once. */
if (per_cpu(watchdog_hardlockup_warned, cpu))
return;
+ /*
+ * Prevent multiple hard-lockup reports if one CPU is already
+ * engaged in dumping all-CPU backtraces.
+ */
+ if (sysctl_hardlockup_all_cpu_backtrace) {
+ if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn))
+ return;
+ }
+
+ /*
+ * NOTE: we call printk_cpu_sync_get_irqsave() after printing
+ * the lockup message. While it would be nice to serialize
+ * that printout, we really want to make sure that if some
+ * other CPU somehow locked up while holding the lock associated
+ * with printk_cpu_sync_get_irqsave() that we can still at least
+ * get the message about the lockup out.
+ */
pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu);
+ printk_cpu_sync_get_irqsave(flags);
+
print_modules();
print_irqtrace_events(current);
if (cpu == this_cpu) {
@@ -164,17 +184,17 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
show_regs(regs);
else
dump_stack();
+ printk_cpu_sync_put_irqrestore(flags);
} else {
+ printk_cpu_sync_put_irqrestore(flags);
trigger_single_cpu_backtrace(cpu);
}
- /*
- * Perform multi-CPU dump only once to avoid multiple
- * hardlockups generating interleaving traces
- */
- if (sysctl_hardlockup_all_cpu_backtrace &&
- !test_and_set_bit(0, &watchdog_hardlockup_all_cpu_dumped))
+ if (sysctl_hardlockup_all_cpu_backtrace) {
trigger_allbutcpu_cpu_backtrace(cpu);
+ if (!hardlockup_panic)
+ clear_bit_unlock(0, &hard_lockup_nmi_warn);
+ }
if (hardlockup_panic)
nmi_panic(regs, "Hard LOCKUP");
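The hard_lockup_nmi_warn bit acts as a one-reporter-at-a-time latch. Reduced to its essentials (a sketch of the control flow above):

	if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn))
		return;			/* another CPU is already dumping */
	/* ... print the lockup report, dump backtraces ... */
	if (!hardlockup_panic)
		clear_bit_unlock(0, &hard_lockup_nmi_warn);	/* reopen */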
@@ -448,6 +468,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
struct pt_regs *regs = get_irq_regs();
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+ unsigned long flags;
if (!watchdog_enabled)
return HRTIMER_NORESTART;
@@ -514,6 +535,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* Start period for the next softlockup warning. */
update_report_ts();
+ printk_cpu_sync_get_irqsave(flags);
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
@@ -523,10 +545,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
show_regs(regs);
else
dump_stack();
+ printk_cpu_sync_put_irqrestore(flags);
if (softlockup_all_cpu_backtrace) {
trigger_allbutcpu_cpu_backtrace(smp_processor_id());
- clear_bit_unlock(0, &soft_lockup_nmi_warn);
+ if (!softlockup_panic)
+ clear_bit_unlock(0, &soft_lockup_nmi_warn);
}
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6e578f576a6f..76e60faed892 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -381,6 +381,12 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */
/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;
+/* PL: user requested unbound cpumask via sysfs */
+static cpumask_var_t wq_requested_unbound_cpumask;
+
+/* PL: isolated cpumask to be excluded from unbound cpumask */
+static cpumask_var_t wq_isolated_cpumask;
+
/* for further constrain wq_unbound_cpumask by cmdline parameter*/
static struct cpumask wq_cmdline_cpumask __initdata;
@@ -1684,9 +1690,6 @@ static int wq_select_unbound_cpu(int cpu)
pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
}
- if (cpumask_empty(wq_unbound_cpumask))
- return cpu;
-
new_cpu = __this_cpu_read(wq_rr_cpu_last);
new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
if (unlikely(new_cpu >= nr_cpu_ids)) {
@@ -4411,19 +4414,6 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
mutex_unlock(&ctx->wq->mutex);
}
-static void apply_wqattrs_lock(void)
-{
- /* CPUs should stay stable across pwq creations and installations */
- cpus_read_lock();
- mutex_lock(&wq_pool_mutex);
-}
-
-static void apply_wqattrs_unlock(void)
-{
- mutex_unlock(&wq_pool_mutex);
- cpus_read_unlock();
-}
-
static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
@@ -5828,39 +5818,40 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
}
/**
- * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
- * @cpumask: the cpumask to set
- *
- * The low-level workqueues cpumask is a global cpumask that limits
- * the affinity of all unbound workqueues. This function check the @cpumask
- * and apply it to all unbound workqueues and updates all pwqs of them.
+ * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
+ * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
*
- * Return: 0 - Success
- * -EINVAL - Invalid @cpumask
- * -ENOMEM - Failed to allocate memory for attrs or pwqs.
+ * This function can be called from cpuset code to provide a set of isolated
+ * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold
+ * either cpus_read_lock or cpus_write_lock.
*/
-int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
{
- int ret = -EINVAL;
+ cpumask_var_t cpumask;
+ int ret = 0;
+
+ if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ lockdep_assert_cpus_held();
+ mutex_lock(&wq_pool_mutex);
+
+ /* Save the current isolated cpumask & export it via sysfs */
+ cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
/*
- * Not excluding isolated cpus on purpose.
- * If the user wishes to include them, we allow that.
+ * If the operation fails, it will fall back to
+ * wq_requested_unbound_cpumask which is initially set to
+ * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) housekeeping mask and rewritten
+ * by any subsequent write to workqueue/cpumask sysfs file.
*/
- cpumask_and(cpumask, cpumask, cpu_possible_mask);
- if (!cpumask_empty(cpumask)) {
- apply_wqattrs_lock();
- if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
- ret = 0;
- goto out_unlock;
- }
-
+ if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask))
+ cpumask_copy(cpumask, wq_requested_unbound_cpumask);
+ if (!cpumask_equal(cpumask, wq_unbound_cpumask))
ret = workqueue_apply_unbound_cpumask(cpumask);
-out_unlock:
- apply_wqattrs_unlock();
- }
-
+ mutex_unlock(&wq_pool_mutex);
+ free_cpumask_var(cpumask);
return ret;
}
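cpumask_andnot() returns false when the result is empty, which is what drives the fallback above. Illustratively:

	/*
	 * requested = 0-7, exclude = 0-7 -> empty result, returns false:
	 *   keep the full requested mask rather than an empty cpumask.
	 * requested = 0-7, exclude = 4-7 -> result 0-3, returns true:
	 *   0-3 is applied if it differs from the current unbound mask.
	 */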
@@ -5982,6 +5973,19 @@ static struct attribute *wq_sysfs_attrs[] = {
};
ATTRIBUTE_GROUPS(wq_sysfs);
+static void apply_wqattrs_lock(void)
+{
+ /* CPUs should stay stable across pwq creations and installations */
+ cpus_read_lock();
+ mutex_lock(&wq_pool_mutex);
+}
+
+static void apply_wqattrs_unlock(void)
+{
+ mutex_unlock(&wq_pool_mutex);
+ cpus_read_unlock();
+}
+
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
@@ -6158,19 +6162,74 @@ static struct bus_type wq_subsys = {
.dev_groups = wq_sysfs_groups,
};
-static ssize_t wq_unbound_cpumask_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+/**
+ * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
+ * @cpumask: the cpumask to set
+ *
+ * The low-level workqueues cpumask is a global cpumask that limits
+ * the affinity of all unbound workqueues. This function checks the @cpumask
+ * and applies it to all unbound workqueues and updates all of their pwqs.
+ *
+ * Return: 0 - Success
+ * -EINVAL - Invalid @cpumask
+ * -ENOMEM - Failed to allocate memory for attrs or pwqs.
+ */
+static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+{
+ int ret = -EINVAL;
+
+ /*
+ * Not excluding isolated cpus on purpose.
+ * If the user wishes to include them, we allow that.
+ */
+ cpumask_and(cpumask, cpumask, cpu_possible_mask);
+ if (!cpumask_empty(cpumask)) {
+ apply_wqattrs_lock();
+ cpumask_copy(wq_requested_unbound_cpumask, cpumask);
+ if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ ret = workqueue_apply_unbound_cpumask(cpumask);
+
+out_unlock:
+ apply_wqattrs_unlock();
+ }
+
+ return ret;
+}
+
+static ssize_t __wq_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf, cpumask_var_t mask)
{
int written;
mutex_lock(&wq_pool_mutex);
- written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
- cpumask_pr_args(wq_unbound_cpumask));
+ written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
mutex_unlock(&wq_pool_mutex);
return written;
}
+static ssize_t wq_unbound_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask);
+}
+
+static ssize_t wq_requested_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask);
+}
+
+static ssize_t wq_isolated_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask);
+}
+
static ssize_t wq_unbound_cpumask_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
@@ -6188,9 +6247,13 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev,
return ret ? ret : count;
}
-static struct device_attribute wq_sysfs_cpumask_attr =
+static struct device_attribute wq_sysfs_cpumask_attrs[] = {
__ATTR(cpumask, 0644, wq_unbound_cpumask_show,
- wq_unbound_cpumask_store);
+ wq_unbound_cpumask_store),
+ __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL),
+ __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL),
+ __ATTR_NULL,
+};
static int __init wq_sysfs_init(void)
{
@@ -6203,7 +6266,13 @@ static int __init wq_sysfs_init(void)
dev_root = bus_get_dev_root(&wq_subsys);
if (dev_root) {
- err = device_create_file(dev_root, &wq_sysfs_cpumask_attr);
+ struct device_attribute *attr;
+
+ for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) {
+ err = device_create_file(dev_root, attr);
+ if (err)
+ break;
+ }
put_device(dev_root);
}
return err;
@@ -6515,6 +6584,17 @@ static inline void wq_watchdog_init(void) { }
#endif /* CONFIG_WQ_WATCHDOG */
+static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)
+{
+ if (!cpumask_intersects(wq_unbound_cpumask, mask)) {
+ pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n",
+ cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));
+ return;
+ }
+
+ cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);
+}
+
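A quick illustration of the guard:

	/*
	 * wq_unbound_cpumask = 0-3, mask = 4-7: no intersection, so the
	 * restriction is warned about and ignored (better than no CPUs).
	 * wq_unbound_cpumask = 0-3, mask = 2-7: the result is 2-3.
	 */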
/**
* workqueue_init_early - early init for workqueue subsystem
*
@@ -6534,11 +6614,16 @@ void __init workqueue_init_early(void)
BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
- cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
- cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
+ cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+ restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
+ restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
if (!cpumask_empty(&wq_cmdline_cpumask))
- cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask);
+ restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
+
+ cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);