summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/core.c27
-rw-r--r--kernel/bpf/devmap.c6
-rw-r--r--kernel/bpf/disasm.c16
-rw-r--r--kernel/bpf/verifier.c210
-rw-r--r--kernel/cgroup/cgroup-v1.c16
-rw-r--r--kernel/debug/gdbstub.c2
-rw-r--r--kernel/dma/ops_helpers.c12
-rw-r--r--kernel/rcu/refscale.c6
-rw-r--r--kernel/rcu/tasks.h6
-rw-r--r--kernel/rcu/tree_stall.h4
-rw-r--r--kernel/scftorture.c6
-rw-r--r--kernel/smpboot.c2
-rw-r--r--kernel/time/posix-cpu-timers.c10
-rw-r--r--kernel/time/timer.c8
-rw-r--r--kernel/trace/ftrace.c5
-rw-r--r--kernel/trace/ring_buffer.c28
-rw-r--r--kernel/trace/trace.c4
-rw-r--r--kernel/trace/trace_events_hist.c28
-rw-r--r--kernel/trace/trace_events_synth.c8
-rw-r--r--kernel/trace/trace_synth.h2
-rw-r--r--kernel/tracepoint.c2
-rw-r--r--kernel/workqueue.c20
22 files changed, 216 insertions, 212 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 034ad93a1ad7..b1a5fc04492b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -32,6 +32,8 @@
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
+
+#include <asm/barrier.h>
#include <asm/unaligned.h>
/* Registers */
@@ -1377,6 +1379,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
/* Non-UAPI available opcodes. */
[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
+ [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC,
[BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
@@ -1621,7 +1624,21 @@ out:
COND_JMP(s, JSGE, >=)
COND_JMP(s, JSLE, <=)
#undef COND_JMP
- /* STX and ST and LDX*/
+ /* ST, STX and LDX*/
+ ST_NOSPEC:
+ /* Speculation barrier for mitigating Speculative Store Bypass.
+ * In case of arm64, we rely on the firmware mitigation as
+ * controlled via the ssbd kernel parameter. Whenever the
+ * mitigation is enabled, it works for all of the kernel code
+ * with no need to provide any additional instructions here.
+ * In case of x86, we use 'lfence' insn for mitigation. We
+ * reuse preexisting logic from Spectre v1 mitigation that
+ * happens to produce the required code on x86 for v4 as well.
+ */
+#ifdef CONFIG_X86
+ barrier_nospec();
+#endif
+ CONT;
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
@@ -2236,8 +2253,14 @@ static void bpf_prog_free_deferred(struct work_struct *work)
#endif
if (aux->dst_trampoline)
bpf_trampoline_put(aux->dst_trampoline);
- for (i = 0; i < aux->func_cnt; i++)
+ for (i = 0; i < aux->func_cnt; i++) {
+ /* We can just unlink the subprog poke descriptor table as
+ * it was originally linked to the main program and is also
+ * released along with it.
+ */
+ aux->func[i]->aux->poke_tab = NULL;
bpf_jit_free(aux->func[i]);
+ }
if (aux->func_cnt) {
kfree(aux->func);
bpf_prog_unlock_free(aux->prog);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 2546dafd6672..fdc20892837c 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -558,7 +558,8 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
for (i = 0; i < map->max_entries; i++) {
- dst = READ_ONCE(dtab->netdev_map[i]);
+ dst = rcu_dereference_check(dtab->netdev_map[i],
+ rcu_read_lock_bh_held());
if (!is_valid_dst(dst, xdp, exclude_ifindex))
continue;
@@ -654,7 +655,8 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
for (i = 0; i < map->max_entries; i++) {
- dst = READ_ONCE(dtab->netdev_map[i]);
+ dst = rcu_dereference_check(dtab->netdev_map[i],
+ rcu_read_lock_bh_held());
if (!dst || dst->dev->ifindex == exclude_ifindex)
continue;
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index bbfc6bb79240..ca3cd9aaa6ce 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -206,15 +206,17 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
verbose(cbs->private_data, "BUG_%02x\n", insn->code);
}
} else if (class == BPF_ST) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
+ if (BPF_MODE(insn->code) == BPF_MEM) {
+ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) {
+ verbose(cbs->private_data, "(%02x) nospec\n", insn->code);
+ } else {
verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
- return;
}
- verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->imm);
} else if (class == BPF_LDX) {
if (BPF_MODE(insn->code) != BPF_MEM) {
verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index be38bb930bf1..f9bda5476ea5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2610,6 +2610,19 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
cur = env->cur_state->frame[env->cur_state->curframe];
if (value_regno >= 0)
reg = &cur->regs[value_regno];
+ if (!env->bypass_spec_v4) {
+ bool sanitize = reg && is_spillable_regtype(reg->type);
+
+ for (i = 0; i < size; i++) {
+ if (state->stack[spi].slot_type[i] == STACK_INVALID) {
+ sanitize = true;
+ break;
+ }
+ }
+
+ if (sanitize)
+ env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
+ }
if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) &&
!register_is_null(reg) && env->bpf_capable) {
@@ -2632,47 +2645,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
verbose(env, "invalid size of register spill\n");
return -EACCES;
}
-
if (state != cur && reg->type == PTR_TO_STACK) {
verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
return -EINVAL;
}
-
- if (!env->bypass_spec_v4) {
- bool sanitize = false;
-
- if (state->stack[spi].slot_type[0] == STACK_SPILL &&
- register_is_const(&state->stack[spi].spilled_ptr))
- sanitize = true;
- for (i = 0; i < BPF_REG_SIZE; i++)
- if (state->stack[spi].slot_type[i] == STACK_MISC) {
- sanitize = true;
- break;
- }
- if (sanitize) {
- int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
- int soff = (-spi - 1) * BPF_REG_SIZE;
-
- /* detected reuse of integer stack slot with a pointer
- * which means either llvm is reusing stack slot or
- * an attacker is trying to exploit CVE-2018-3639
- * (speculative store bypass)
- * Have to sanitize that slot with preemptive
- * store of zero.
- */
- if (*poff && *poff != soff) {
- /* disallow programs where single insn stores
- * into two different stack slots, since verifier
- * cannot sanitize them
- */
- verbose(env,
- "insn %d cannot access two stack slots fp%d and fp%d",
- insn_idx, *poff, soff);
- return -EINVAL;
- }
- *poff = soff;
- }
- }
save_register_state(state, spi, reg);
} else {
u8 type = STACK_MISC;
@@ -3677,6 +3653,8 @@ continue_func:
if (tail_call_reachable)
for (j = 0; j < frame; j++)
subprog[ret_prog[j]].tail_call_reachable = true;
+ if (subprog[0].tail_call_reachable)
+ env->prog->aux->tail_call_reachable = true;
/* end of for() loop means the last insn of the 'subprog'
* was reached. Doesn't matter whether it was JA or EXIT
@@ -6559,6 +6537,12 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
alu_state |= ptr_is_dst_reg ?
BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+
+ /* Limit pruning on unknown scalars to enable deep search for
+ * potential masking differences from other program paths.
+ */
+ if (!off_is_imm)
+ env->explore_alu_limits = true;
}
err = update_alu_sanitation_state(aux, alu_state, alu_limit);
@@ -9934,8 +9918,8 @@ next:
}
/* Returns true if (rold safe implies rcur safe) */
-static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
- struct bpf_id_pair *idmap)
+static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+ struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)
{
bool equal;
@@ -9961,6 +9945,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
switch (rold->type) {
case SCALAR_VALUE:
+ if (env->explore_alu_limits)
+ return false;
if (rcur->type == SCALAR_VALUE) {
if (!rold->precise && !rcur->precise)
return true;
@@ -10051,9 +10037,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
}
-static bool stacksafe(struct bpf_func_state *old,
- struct bpf_func_state *cur,
- struct bpf_id_pair *idmap)
+static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
+ struct bpf_func_state *cur, struct bpf_id_pair *idmap)
{
int i, spi;
@@ -10098,9 +10083,8 @@ static bool stacksafe(struct bpf_func_state *old,
continue;
if (old->stack[spi].slot_type[0] != STACK_SPILL)
continue;
- if (!regsafe(&old->stack[spi].spilled_ptr,
- &cur->stack[spi].spilled_ptr,
- idmap))
+ if (!regsafe(env, &old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr, idmap))
/* when explored and current stack slot are both storing
* spilled registers, check that stored pointers types
* are the same as well.
@@ -10157,10 +10141,11 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
for (i = 0; i < MAX_BPF_REG; i++)
- if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch))
+ if (!regsafe(env, &old->regs[i], &cur->regs[i],
+ env->idmap_scratch))
return false;
- if (!stacksafe(old, cur, env->idmap_scratch))
+ if (!stacksafe(env, old, cur, env->idmap_scratch))
return false;
if (!refsafe(old, cur))
@@ -11904,35 +11889,33 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
bpf_convert_ctx_access_t convert_ctx_access;
+ bool ctx_access;
if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
+ insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) {
type = BPF_READ;
- else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_DW))
+ ctx_access = true;
+ } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
type = BPF_WRITE;
- else
+ ctx_access = BPF_CLASS(insn->code) == BPF_STX;
+ } else {
continue;
+ }
if (type == BPF_WRITE &&
- env->insn_aux_data[i + delta].sanitize_stack_off) {
+ env->insn_aux_data[i + delta].sanitize_stack_spill) {
struct bpf_insn patch[] = {
- /* Sanitize suspicious stack slot with zero.
- * There are no memory dependencies for this store,
- * since it's only using frame pointer and immediate
- * constant of zero
- */
- BPF_ST_MEM(BPF_DW, BPF_REG_FP,
- env->insn_aux_data[i + delta].sanitize_stack_off,
- 0),
- /* the original STX instruction will immediately
- * overwrite the same stack slot with appropriate value
- */
*insn,
+ BPF_ST_NOSPEC(),
};
cnt = ARRAY_SIZE(patch);
@@ -11946,6 +11929,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
continue;
}
+ if (!ctx_access)
+ continue;
+
switch (env->insn_aux_data[i + delta].ptr_type) {
case PTR_TO_CTX:
if (!ops->convert_ctx_access)
@@ -12121,33 +12107,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
goto out_free;
func[i]->is_func = 1;
func[i]->aux->func_idx = i;
- /* the btf and func_info will be freed only at prog->aux */
+ /* Below members will be freed only at prog->aux */
func[i]->aux->btf = prog->aux->btf;
func[i]->aux->func_info = prog->aux->func_info;
+ func[i]->aux->poke_tab = prog->aux->poke_tab;
+ func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
for (j = 0; j < prog->aux->size_poke_tab; j++) {
- u32 insn_idx = prog->aux->poke_tab[j].insn_idx;
- int ret;
+ struct bpf_jit_poke_descriptor *poke;
- if (!(insn_idx >= subprog_start &&
- insn_idx <= subprog_end))
- continue;
-
- ret = bpf_jit_add_poke_descriptor(func[i],
- &prog->aux->poke_tab[j]);
- if (ret < 0) {
- verbose(env, "adding tail call poke descriptor failed\n");
- goto out_free;
- }
-
- func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1;
-
- map_ptr = func[i]->aux->poke_tab[ret].tail_call.map;
- ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux);
- if (ret < 0) {
- verbose(env, "tracking tail call prog failed\n");
- goto out_free;
- }
+ poke = &prog->aux->poke_tab[j];
+ if (poke->insn_idx < subprog_end &&
+ poke->insn_idx >= subprog_start)
+ poke->aux = func[i]->aux;
}
/* Use bpf_prog_F_tag to indicate functions in stack traces.
@@ -12178,18 +12150,6 @@ static int jit_subprogs(struct bpf_verifier_env *env)
cond_resched();
}
- /* Untrack main program's aux structs so that during map_poke_run()
- * we will not stumble upon the unfilled poke descriptors; each
- * of the main program's poke descs got distributed across subprogs
- * and got tracked onto map, so we are sure that none of them will
- * be missed after the operation below
- */
- for (i = 0; i < prog->aux->size_poke_tab; i++) {
- map_ptr = prog->aux->poke_tab[i].tail_call.map;
-
- map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
- }
-
/* at this point all bpf functions were successfully JITed
* now populate all bpf_calls with correct addresses and
* run last pass of JIT
@@ -12267,14 +12227,22 @@ static int jit_subprogs(struct bpf_verifier_env *env)
bpf_prog_jit_attempt_done(prog);
return 0;
out_free:
+ /* We failed JIT'ing, so at this point we need to unregister poke
+ * descriptors from subprogs, so that kernel is not attempting to
+ * patch it anymore as we're freeing the subprog JIT memory.
+ */
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
+ map_ptr = prog->aux->poke_tab[i].tail_call.map;
+ map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
+ }
+ /* At this point we're guaranteed that poke descriptors are not
+ * live anymore. We can just unlink its descriptor table as it's
+ * released with the main prog.
+ */
for (i = 0; i < env->subprog_cnt; i++) {
if (!func[i])
continue;
-
- for (j = 0; j < func[i]->aux->size_poke_tab; j++) {
- map_ptr = func[i]->aux->poke_tab[j].tail_call.map;
- map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux);
- }
+ func[i]->aux->poke_tab = NULL;
bpf_jit_free(func[i]);
}
kfree(func);
@@ -12768,37 +12736,6 @@ static void free_states(struct bpf_verifier_env *env)
}
}
-/* The verifier is using insn_aux_data[] to store temporary data during
- * verification and to store information for passes that run after the
- * verification like dead code sanitization. do_check_common() for subprogram N
- * may analyze many other subprograms. sanitize_insn_aux_data() clears all
- * temporary data after do_check_common() finds that subprogram N cannot be
- * verified independently. pass_cnt counts the number of times
- * do_check_common() was run and insn->aux->seen tells the pass number
- * insn_aux_data was touched. These variables are compared to clear temporary
- * data from failed pass. For testing and experiments do_check_common() can be
- * run multiple times even when prior attempt to verify is unsuccessful.
- *
- * Note that special handling is needed on !env->bypass_spec_v1 if this is
- * ever called outside of error path with subsequent program rejection.
- */
-static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
-{
- struct bpf_insn *insn = env->prog->insnsi;
- struct bpf_insn_aux_data *aux;
- int i, class;
-
- for (i = 0; i < env->prog->len; i++) {
- class = BPF_CLASS(insn[i].code);
- if (class != BPF_LDX && class != BPF_STX)
- continue;
- aux = &env->insn_aux_data[i];
- if (aux->seen != env->pass_cnt)
- continue;
- memset(aux, 0, offsetof(typeof(*aux), orig_idx));
- }
-}
-
static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
@@ -12875,9 +12812,6 @@ out:
if (!ret && pop_log)
bpf_vlog_reset(&env->log, 0);
free_states(env);
- if (ret)
- /* clean aux data in case subprog was rejected */
- sanitize_insn_aux_data(env);
return ret;
}
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index ee93b6e89587..de2c432dee20 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -911,13 +911,11 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
if (opt == -ENOPARAM) {
- if (strcmp(param->key, "source") == 0) {
- if (fc->source)
- return invalf(fc, "Multiple sources not supported");
- fc->source = param->string;
- param->string = NULL;
- return 0;
- }
+ int ret;
+
+ ret = vfs_parse_fs_param_source(fc, param);
+ if (ret != -ENOPARAM)
+ return ret;
for_each_subsys(ss, i) {
if (strcmp(param->key, ss->legacy_name))
continue;
@@ -1223,9 +1221,7 @@ int cgroup1_get_tree(struct fs_context *fc)
ret = cgroup_do_get_tree(fc);
if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
- struct super_block *sb = fc->root->d_sb;
- dput(fc->root);
- deactivate_locked_super(sb);
+ fc_drop_locked(fc);
ret = 1;
}
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 8372897402f4..b6f28fad4307 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1045,8 +1045,8 @@ int gdb_serial_stub(struct kgdb_state *ks)
gdb_cmd_detachkill(ks);
return DBG_PASS_EVENT;
}
-#endif
fallthrough;
+#endif
case 'C': /* Exception passing */
tmp = gdb_cmd_exception_pass(ks);
if (tmp > 0)
diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c
index 910ae69cae77..af4a6ef48ce0 100644
--- a/kernel/dma/ops_helpers.c
+++ b/kernel/dma/ops_helpers.c
@@ -5,6 +5,13 @@
*/
#include <linux/dma-map-ops.h>
+static struct page *dma_common_vaddr_to_page(void *cpu_addr)
+{
+ if (is_vmalloc_addr(cpu_addr))
+ return vmalloc_to_page(cpu_addr);
+ return virt_to_page(cpu_addr);
+}
+
/*
* Create scatter-list for the already allocated DMA buffer.
*/
@@ -12,7 +19,7 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
- struct page *page = virt_to_page(cpu_addr);
+ struct page *page = dma_common_vaddr_to_page(cpu_addr);
int ret;
ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
@@ -32,6 +39,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
unsigned long user_count = vma_pages(vma);
unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long off = vma->vm_pgoff;
+ struct page *page = dma_common_vaddr_to_page(cpu_addr);
int ret = -ENXIO;
vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
@@ -43,7 +51,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
return -ENXIO;
return remap_pfn_range(vma, vma->vm_start,
- page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff,
+ page_to_pfn(page) + vma->vm_pgoff,
user_count << PAGE_SHIFT, vma->vm_page_prot);
#else
return -ENXIO;
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 313d4547cbc7..d998a76fb542 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -487,13 +487,13 @@ ref_scale_reader(void *arg)
s64 duration;
VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: task started", me);
- set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)));
set_user_nice(current, MAX_NICE);
atomic_inc(&n_init);
if (holdoff)
schedule_timeout_interruptible(holdoff * HZ);
repeat:
- VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, raw_smp_processor_id());
// Wait for signal that this reader can start.
wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
@@ -503,7 +503,7 @@ repeat:
goto end;
// Make sure that the CPU is affinitized appropriately during testing.
- WARN_ON_ONCE(smp_processor_id() != me);
+ WARN_ON_ONCE(raw_smp_processor_id() != me);
WRITE_ONCE(rt->start_reader, 0);
if (!atomic_dec_return(&n_started))
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 03a118d1c003..8536c55df514 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -953,10 +953,9 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
in_qs = likely(!t->trc_reader_nesting);
}
- // Mark as checked. Because this is called from the grace-period
- // kthread, also remove the task from the holdout list.
+ // Mark as checked so that the grace-period kthread will
+ // remove it from the holdout list.
t->trc_reader_checked = true;
- trc_del_holdout(t);
if (in_qs)
return true; // Already in quiescent state, done!!!
@@ -983,7 +982,6 @@ static void trc_wait_for_one_reader(struct task_struct *t,
// The current task had better be in a quiescent state.
if (t == current) {
t->trc_reader_checked = true;
- trc_del_holdout(t);
WARN_ON_ONCE(t->trc_reader_nesting);
return;
}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 3f937b20814f..6c76988cc019 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -795,9 +795,9 @@ void show_rcu_gp_kthreads(void)
jr = j - data_race(rcu_state.gp_req_activity);
js = j - data_race(rcu_state.gp_start);
jw = j - data_race(rcu_state.gp_wake_time);
- pr_info("%s: wait state: %s(%d) ->state: %#lx ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
+ pr_info("%s: wait state: %s(%d) ->state: %#x ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
rcu_state.name, gp_state_getname(rcu_state.gp_state),
- rcu_state.gp_state, t ? t->__state : 0x1ffffL, t ? t->rt_priority : 0xffU,
+ rcu_state.gp_state, t ? t->__state : 0x1ffff, t ? t->rt_priority : 0xffU,
js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
(long)data_race(rcu_state.gp_seq),
(long)data_race(rcu_get_root()->gp_seq_needed),
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 2377cbb32474..29e8fc5d91a7 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -405,15 +405,15 @@ static int scftorture_invoker(void *arg)
VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu);
cpu = scfp->cpu % nr_cpu_ids;
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
+ WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(cpu)));
set_user_nice(current, MAX_NICE);
if (holdoff)
schedule_timeout_interruptible(holdoff * HZ);
- VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id());
+ VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, raw_smp_processor_id());
// Make sure that the CPU is affinitized appropriately during testing.
- curcpu = smp_processor_id();
+ curcpu = raw_smp_processor_id();
WARN_ONCE(curcpu != scfp->cpu % nr_cpu_ids,
"%s: Wanted CPU %d, running on %d, nr_cpu_ids = %d\n",
__func__, scfp->cpu, curcpu, nr_cpu_ids);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index e4163042c4d6..cf6acab78538 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -47,7 +47,7 @@ void __init idle_thread_set_boot_cpu(void)
*
* Creates the thread if it does not exist.
*/
-static inline void idle_init(unsigned int cpu)
+static __always_inline void idle_init(unsigned int cpu)
{
struct task_struct *tsk = per_cpu(idle_threads, cpu);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 29a5e54e6e10..517be7fd175e 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -991,6 +991,11 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
if (!p)
goto out;
+ /* Protect timer list r/w in arm_timer() */
+ sighand = lock_task_sighand(p, &flags);
+ if (unlikely(sighand == NULL))
+ goto out;
+
/*
* Fetch the current sample and update the timer's expiry time.
*/
@@ -1001,11 +1006,6 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
bump_cpu_timer(timer, now);
- /* Protect timer list r/w in arm_timer() */
- sighand = lock_task_sighand(p, &flags);
- if (unlikely(sighand == NULL))
- goto out;
-
/*
* Now re-arm for the new expiry time.
*/
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3fadb58fc9d7..9eb11c2209e5 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -207,6 +207,7 @@ struct timer_base {
unsigned int cpu;
bool next_expiry_recalc;
bool is_idle;
+ bool timers_pending;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
@@ -595,6 +596,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
* can reevaluate the wheel:
*/
base->next_expiry = bucket_expiry;
+ base->timers_pending = true;
base->next_expiry_recalc = false;
trigger_dyntick_cpu(base, timer);
}
@@ -1582,6 +1584,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
}
base->next_expiry_recalc = false;
+ base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
return next;
}
@@ -1633,7 +1636,6 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
u64 expires = KTIME_MAX;
unsigned long nextevt;
- bool is_max_delta;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -1646,7 +1648,6 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
if (base->next_expiry_recalc)
base->next_expiry = __next_timer_interrupt(base);
nextevt = base->next_expiry;
- is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
/*
* We have a fresh next event. Check whether we can forward the
@@ -1664,7 +1665,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
expires = basem;
base->is_idle = false;
} else {
- if (!is_max_delta)
+ if (base->timers_pending)
expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
* If we expect to sleep more than a tick, mark the base idle.
@@ -1947,6 +1948,7 @@ int timers_prepare_cpu(unsigned int cpu)
base = per_cpu_ptr(&timer_bases[b], cpu);
base->clk = jiffies;
base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->timers_pending = false;
base->is_idle = false;
}
return 0;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index e6fb3e6e1ffc..7b180f61e6d3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5985,7 +5985,8 @@ ftrace_graph_release(struct inode *inode, struct file *file)
* infrastructure to do the synchronization, thus we must do it
* ourselves.
*/
- synchronize_rcu_tasks_rude();
+ if (old_hash != EMPTY_HASH)
+ synchronize_rcu_tasks_rude();
free_ftrace_hash(old_hash);
}
@@ -7544,7 +7545,7 @@ int ftrace_is_dead(void)
*/
int register_ftrace_function(struct ftrace_ops *ops)
{
- int ret = -1;
+ int ret;
ftrace_ops_init(ops);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d1463eac11a3..e592d1df6f88 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3880,10 +3880,30 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
if (unlikely(!head))
return true;
- return reader->read == rb_page_commit(reader) &&
- (commit == reader ||
- (commit == head &&
- head->read == rb_page_commit(commit)));
+ /* Reader should exhaust content in reader page */
+ if (reader->read != rb_page_commit(reader))
+ return false;
+
+ /*
+ * If writers are committing on the reader page, knowing all
+ * committed content has been read, the ring buffer is empty.
+ */
+ if (commit == reader)
+ return true;
+
+ /*
+ * If writers are committing on a page other than reader page
+ * and head page, there should always be content to read.
+ */
+ if (commit != head)
+ return false;
+
+ /*
+ * Writers are committing on the head page, we just need
+ * to care about there're committed data, and the reader will
+ * swap reader page with head page when it is to read data.
+ */
+ return rb_page_commit(commit) == 0;
}
/**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f8b80b5bab71..c59dd35a6da5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5609,6 +5609,10 @@ static const char readme_msg[] =
"\t [:name=histname1]\n"
"\t [:<handler>.<action>]\n"
"\t [if <filter>]\n\n"
+ "\t Note, special fields can be used as well:\n"
+ "\t common_timestamp - to record current timestamp\n"
+ "\t common_cpu - to record the CPU the event happened on\n"
+ "\n"
"\t When a matching event is hit, an entry is added to a hash\n"
"\t table using the key(s) and value(s) named, and the value of a\n"
"\t sum called 'hitcount' is incremented. Keys and values\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0207aeed31e6..34325f41ebc0 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1111,7 +1111,7 @@ static const char *hist_field_name(struct hist_field *field,
field->flags & HIST_FIELD_FL_ALIAS)
field_name = hist_field_name(field->operands[0], ++level);
else if (field->flags & HIST_FIELD_FL_CPU)
- field_name = "cpu";
+ field_name = "common_cpu";
else if (field->flags & HIST_FIELD_FL_EXPR ||
field->flags & HIST_FIELD_FL_VAR_REF) {
if (field->system) {
@@ -1689,7 +1689,9 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (WARN_ON_ONCE(!field))
goto out;
- if (is_string_field(field)) {
+ /* Pointers to strings are just pointers and dangerous to dereference */
+ if (is_string_field(field) &&
+ (field->filter_type != FILTER_PTR_STRING)) {
flags |= HIST_FIELD_FL_STRING;
hist_field->size = MAX_FILTER_STR_VAL;
@@ -1989,14 +1991,24 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
hist_data->enable_timestamps = true;
if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
hist_data->attrs->ts_in_usecs = true;
- } else if (strcmp(field_name, "cpu") == 0)
+ } else if (strcmp(field_name, "common_cpu") == 0)
*flags |= HIST_FIELD_FL_CPU;
else {
field = trace_find_event_field(file->event_call, field_name);
if (!field || !field->size) {
- hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name));
- field = ERR_PTR(-EINVAL);
- goto out;
+ /*
+ * For backward compatibility, if field_name
+ * was "cpu", then we treat this the same as
+ * common_cpu.
+ */
+ if (strcmp(field_name, "cpu") == 0) {
+ *flags |= HIST_FIELD_FL_CPU;
+ } else {
+ hist_err(tr, HIST_ERR_FIELD_NOT_FOUND,
+ errpos(field_name));
+ field = ERR_PTR(-EINVAL);
+ goto out;
+ }
}
}
out:
@@ -4495,8 +4507,6 @@ static inline void add_to_key(char *compound_key, void *key,
field = key_field->field;
if (field->filter_type == FILTER_DYN_STRING)
size = *(u32 *)(rec + field->offset) >> 16;
- else if (field->filter_type == FILTER_PTR_STRING)
- size = strlen(key);
else if (field->filter_type == FILTER_STATIC_STRING)
size = field->size;
@@ -5085,7 +5095,7 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
seq_printf(m, "%s=", hist_field->var.name);
if (hist_field->flags & HIST_FIELD_FL_CPU)
- seq_puts(m, "cpu");
+ seq_puts(m, "common_cpu");
else if (field_name) {
if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
hist_field->flags & HIST_FIELD_FL_ALIAS)
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 2ac75eb6aa86..9315fc03e303 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -893,15 +893,13 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields,
dyn_event_init(&event->devent, &synth_event_ops);
for (i = 0, j = 0; i < n_fields; i++) {
+ fields[i]->field_pos = i;
event->fields[i] = fields[i];
- if (fields[i]->is_dynamic) {
- event->dynamic_fields[j] = fields[i];
- event->dynamic_fields[j]->field_pos = i;
+ if (fields[i]->is_dynamic)
event->dynamic_fields[j++] = fields[i];
- event->n_dynamic_fields++;
- }
}
+ event->n_dynamic_fields = j;
event->n_fields = n_fields;
out:
return event;
diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h
index 6e146b959dcd..4007fe95cf42 100644
--- a/kernel/trace/trace_synth.h
+++ b/kernel/trace/trace_synth.h
@@ -14,10 +14,10 @@ struct synth_field {
char *name;
size_t size;
unsigned int offset;
+ unsigned int field_pos;
bool is_signed;
bool is_string;
bool is_dynamic;
- bool field_pos;
};
struct synth_event {
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 976bf8ce8039..fc32821f8240 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -299,8 +299,8 @@ static int tracepoint_add_func(struct tracepoint *tp,
* a pointer to it. This array is referenced by __DO_TRACE from
* include/linux/tracepoint.h using rcu_dereference_sched().
*/
- rcu_assign_pointer(tp->funcs, tp_funcs);
tracepoint_update_call(tp, tp_funcs, false);
+ rcu_assign_pointer(tp->funcs, tp_funcs);
static_key_enable(&tp->key);
release_probes(old);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 50142fc08902..f148eacda55a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3676,15 +3676,21 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
unbound_release_work);
struct workqueue_struct *wq = pwq->wq;
struct worker_pool *pool = pwq->pool;
- bool is_last;
+ bool is_last = false;
- if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
- return;
+ /*
+ * when @pwq is not linked, it doesn't hold any reference to the
+ * @wq, and @wq is invalid to access.
+ */
+ if (!list_empty(&pwq->pwqs_node)) {
+ if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
+ return;
- mutex_lock(&wq->mutex);
- list_del_rcu(&pwq->pwqs_node);
- is_last = list_empty(&wq->pwqs);
- mutex_unlock(&wq->mutex);
+ mutex_lock(&wq->mutex);
+ list_del_rcu(&pwq->pwqs_node);
+ is_last = list_empty(&wq->pwqs);
+ mutex_unlock(&wq->mutex);
+ }
mutex_lock(&wq_pool_mutex);
put_unbound_pool(pool);