summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/core.c26
-rw-r--r--kernel/bpf/disasm.c16
-rw-r--r--kernel/bpf/hashtab.c4
-rw-r--r--kernel/bpf/helpers.c8
-rw-r--r--kernel/bpf/verifier.c150
-rw-r--r--kernel/cgroup/cgroup-v1.c4
-rw-r--r--kernel/cgroup/rstat.c19
-rw-r--r--kernel/dma/ops_helpers.c12
-rw-r--r--kernel/events/core.c35
-rw-r--r--kernel/irq/chip.c5
-rw-r--r--kernel/irq/msi.c13
-rw-r--r--kernel/irq/timings.c5
-rw-r--r--kernel/locking/rtmutex.c2
-rw-r--r--kernel/sched/core.c90
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/smpboot.c2
-rw-r--r--kernel/time/posix-cpu-timers.c10
-rw-r--r--kernel/time/timer.c14
-rw-r--r--kernel/trace/bpf_trace.c13
-rw-r--r--kernel/ucount.c29
-rw-r--r--kernel/workqueue.c20
21 files changed, 251 insertions, 228 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9b1577498373..0a28a8095d3e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -32,6 +32,8 @@
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
+
+#include <asm/barrier.h>
#include <asm/unaligned.h>
/* Registers */
@@ -1360,11 +1362,13 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
}
/**
- * __bpf_prog_run - run eBPF program on a given context
+ * ___bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
* @insn: is the array of eBPF instructions
*
* Decode and execute eBPF instructions.
+ *
+ * Return: whatever value is in %BPF_R0 at program exit
*/
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
{
@@ -1377,6 +1381,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
/* Non-UAPI available opcodes. */
[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
+ [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC,
[BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
@@ -1621,7 +1626,21 @@ out:
COND_JMP(s, JSGE, >=)
COND_JMP(s, JSLE, <=)
#undef COND_JMP
- /* STX and ST and LDX*/
+ /* ST, STX and LDX*/
+ ST_NOSPEC:
+ /* Speculation barrier for mitigating Speculative Store Bypass.
+ * In case of arm64, we rely on the firmware mitigation as
+ * controlled via the ssbd kernel parameter. Whenever the
+ * mitigation is enabled, it works for all of the kernel code
+ * with no need to provide any additional instructions here.
+ * In case of x86, we use 'lfence' insn for mitigation. We
+ * reuse preexisting logic from Spectre v1 mitigation that
+ * happens to produce the required code on x86 for v4 as well.
+ */
+#ifdef CONFIG_X86
+ barrier_nospec();
+#endif
+ CONT;
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
@@ -1861,6 +1880,9 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
*
* Try to JIT eBPF program, if JIT is not available, use interpreter.
* The BPF program will be executed via BPF_PROG_RUN() macro.
+ *
+ * Return: the &fp argument along with &err set to 0 for success or
+ * a negative errno code on failure
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index bbfc6bb79240..ca3cd9aaa6ce 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -206,15 +206,17 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
verbose(cbs->private_data, "BUG_%02x\n", insn->code);
}
} else if (class == BPF_ST) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
+ if (BPF_MODE(insn->code) == BPF_MEM) {
+ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) {
+ verbose(cbs->private_data, "(%02x) nospec\n", insn->code);
+ } else {
verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
- return;
}
- verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->imm);
} else if (class == BPF_LDX) {
if (BPF_MODE(insn->code) != BPF_MEM) {
verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 72c58cc516a3..9c011f3a2687 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1565,8 +1565,8 @@ alloc:
/* We cannot do copy_from_user or copy_to_user inside
* the rcu_read_lock. Allocate enough space here.
*/
- keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN);
- values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN);
+ keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
+ values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
if (!keys || !values) {
ret = -ENOMEM;
goto after_loop;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 62cf00383910..7a97b2f4747d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -397,8 +397,8 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
void *ptr;
int i;
- for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
- if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+ for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) {
+ if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
continue;
storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
@@ -1070,12 +1070,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_str_proto;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 42a4063de7cd..f9bda5476ea5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2610,6 +2610,19 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
cur = env->cur_state->frame[env->cur_state->curframe];
if (value_regno >= 0)
reg = &cur->regs[value_regno];
+ if (!env->bypass_spec_v4) {
+ bool sanitize = reg && is_spillable_regtype(reg->type);
+
+ for (i = 0; i < size; i++) {
+ if (state->stack[spi].slot_type[i] == STACK_INVALID) {
+ sanitize = true;
+ break;
+ }
+ }
+
+ if (sanitize)
+ env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
+ }
if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) &&
!register_is_null(reg) && env->bpf_capable) {
@@ -2632,47 +2645,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
verbose(env, "invalid size of register spill\n");
return -EACCES;
}
-
if (state != cur && reg->type == PTR_TO_STACK) {
verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
return -EINVAL;
}
-
- if (!env->bypass_spec_v4) {
- bool sanitize = false;
-
- if (state->stack[spi].slot_type[0] == STACK_SPILL &&
- register_is_const(&state->stack[spi].spilled_ptr))
- sanitize = true;
- for (i = 0; i < BPF_REG_SIZE; i++)
- if (state->stack[spi].slot_type[i] == STACK_MISC) {
- sanitize = true;
- break;
- }
- if (sanitize) {
- int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
- int soff = (-spi - 1) * BPF_REG_SIZE;
-
- /* detected reuse of integer stack slot with a pointer
- * which means either llvm is reusing stack slot or
- * an attacker is trying to exploit CVE-2018-3639
- * (speculative store bypass)
- * Have to sanitize that slot with preemptive
- * store of zero.
- */
- if (*poff && *poff != soff) {
- /* disallow programs where single insn stores
- * into two different stack slots, since verifier
- * cannot sanitize them
- */
- verbose(env,
- "insn %d cannot access two stack slots fp%d and fp%d",
- insn_idx, *poff, soff);
- return -EINVAL;
- }
- *poff = soff;
- }
- }
save_register_state(state, spi, reg);
} else {
u8 type = STACK_MISC;
@@ -3677,6 +3653,8 @@ continue_func:
if (tail_call_reachable)
for (j = 0; j < frame; j++)
subprog[ret_prog[j]].tail_call_reachable = true;
+ if (subprog[0].tail_call_reachable)
+ env->prog->aux->tail_call_reachable = true;
/* end of for() loop means the last insn of the 'subprog'
* was reached. Doesn't matter whether it was JA or EXIT
@@ -6559,6 +6537,12 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
alu_state |= ptr_is_dst_reg ?
BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+
+ /* Limit pruning on unknown scalars to enable deep search for
+ * potential masking differences from other program paths.
+ */
+ if (!off_is_imm)
+ env->explore_alu_limits = true;
}
err = update_alu_sanitation_state(aux, alu_state, alu_limit);
@@ -9934,8 +9918,8 @@ next:
}
/* Returns true if (rold safe implies rcur safe) */
-static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
- struct bpf_id_pair *idmap)
+static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+ struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)
{
bool equal;
@@ -9961,6 +9945,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
switch (rold->type) {
case SCALAR_VALUE:
+ if (env->explore_alu_limits)
+ return false;
if (rcur->type == SCALAR_VALUE) {
if (!rold->precise && !rcur->precise)
return true;
@@ -10051,9 +10037,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
}
-static bool stacksafe(struct bpf_func_state *old,
- struct bpf_func_state *cur,
- struct bpf_id_pair *idmap)
+static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
+ struct bpf_func_state *cur, struct bpf_id_pair *idmap)
{
int i, spi;
@@ -10098,9 +10083,8 @@ static bool stacksafe(struct bpf_func_state *old,
continue;
if (old->stack[spi].slot_type[0] != STACK_SPILL)
continue;
- if (!regsafe(&old->stack[spi].spilled_ptr,
- &cur->stack[spi].spilled_ptr,
- idmap))
+ if (!regsafe(env, &old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr, idmap))
/* when explored and current stack slot are both storing
* spilled registers, check that stored pointers types
* are the same as well.
@@ -10157,10 +10141,11 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
for (i = 0; i < MAX_BPF_REG; i++)
- if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch))
+ if (!regsafe(env, &old->regs[i], &cur->regs[i],
+ env->idmap_scratch))
return false;
- if (!stacksafe(old, cur, env->idmap_scratch))
+ if (!stacksafe(env, old, cur, env->idmap_scratch))
return false;
if (!refsafe(old, cur))
@@ -11904,35 +11889,33 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
bpf_convert_ctx_access_t convert_ctx_access;
+ bool ctx_access;
if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
+ insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) {
type = BPF_READ;
- else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
- insn->code == (BPF_STX | BPF_MEM | BPF_DW))
+ ctx_access = true;
+ } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
type = BPF_WRITE;
- else
+ ctx_access = BPF_CLASS(insn->code) == BPF_STX;
+ } else {
continue;
+ }
if (type == BPF_WRITE &&
- env->insn_aux_data[i + delta].sanitize_stack_off) {
+ env->insn_aux_data[i + delta].sanitize_stack_spill) {
struct bpf_insn patch[] = {
- /* Sanitize suspicious stack slot with zero.
- * There are no memory dependencies for this store,
- * since it's only using frame pointer and immediate
- * constant of zero
- */
- BPF_ST_MEM(BPF_DW, BPF_REG_FP,
- env->insn_aux_data[i + delta].sanitize_stack_off,
- 0),
- /* the original STX instruction will immediately
- * overwrite the same stack slot with appropriate value
- */
*insn,
+ BPF_ST_NOSPEC(),
};
cnt = ARRAY_SIZE(patch);
@@ -11946,6 +11929,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
continue;
}
+ if (!ctx_access)
+ continue;
+
switch (env->insn_aux_data[i + delta].ptr_type) {
case PTR_TO_CTX:
if (!ops->convert_ctx_access)
@@ -12750,37 +12736,6 @@ static void free_states(struct bpf_verifier_env *env)
}
}
-/* The verifier is using insn_aux_data[] to store temporary data during
- * verification and to store information for passes that run after the
- * verification like dead code sanitization. do_check_common() for subprogram N
- * may analyze many other subprograms. sanitize_insn_aux_data() clears all
- * temporary data after do_check_common() finds that subprogram N cannot be
- * verified independently. pass_cnt counts the number of times
- * do_check_common() was run and insn->aux->seen tells the pass number
- * insn_aux_data was touched. These variables are compared to clear temporary
- * data from failed pass. For testing and experiments do_check_common() can be
- * run multiple times even when prior attempt to verify is unsuccessful.
- *
- * Note that special handling is needed on !env->bypass_spec_v1 if this is
- * ever called outside of error path with subsequent program rejection.
- */
-static void sanitize_insn_aux_data(struct bpf_verifier_env *env)
-{
- struct bpf_insn *insn = env->prog->insnsi;
- struct bpf_insn_aux_data *aux;
- int i, class;
-
- for (i = 0; i < env->prog->len; i++) {
- class = BPF_CLASS(insn[i].code);
- if (class != BPF_LDX && class != BPF_STX)
- continue;
- aux = &env->insn_aux_data[i];
- if (aux->seen != env->pass_cnt)
- continue;
- memset(aux, 0, offsetof(typeof(*aux), orig_idx));
- }
-}
-
static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
@@ -12857,9 +12812,6 @@ out:
if (!ret && pop_log)
bpf_vlog_reset(&env->log, 0);
free_states(env);
- if (ret)
- /* clean aux data in case subprog was rejected */
- sanitize_insn_aux_data(env);
return ret;
}
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 8d6bf56ed77a..de2c432dee20 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1221,9 +1221,7 @@ int cgroup1_get_tree(struct fs_context *fc)
ret = cgroup_do_get_tree(fc);
if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
- struct super_block *sb = fc->root->d_sb;
- dput(fc->root);
- deactivate_locked_super(sb);
+ fc_drop_locked(fc);
ret = 1;
}
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 7f0e58917432..b264ab5652ba 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -347,19 +347,20 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
}
static struct cgroup_rstat_cpu *
-cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
+cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
struct cgroup_rstat_cpu *rstatc;
rstatc = get_cpu_ptr(cgrp->rstat_cpu);
- u64_stats_update_begin(&rstatc->bsync);
+ *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
return rstatc;
}
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
- struct cgroup_rstat_cpu *rstatc)
+ struct cgroup_rstat_cpu *rstatc,
+ unsigned long flags)
{
- u64_stats_update_end(&rstatc->bsync);
+ u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
cgroup_rstat_updated(cgrp, smp_processor_id());
put_cpu_ptr(rstatc);
}
@@ -367,18 +368,20 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
+ unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
- cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
void __cgroup_account_cputime_field(struct cgroup *cgrp,
enum cpu_usage_stat index, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
+ unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
switch (index) {
case CPUTIME_USER:
@@ -394,7 +397,7 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
break;
}
- cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
/*
diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c
index 910ae69cae77..af4a6ef48ce0 100644
--- a/kernel/dma/ops_helpers.c
+++ b/kernel/dma/ops_helpers.c
@@ -5,6 +5,13 @@
*/
#include <linux/dma-map-ops.h>
+static struct page *dma_common_vaddr_to_page(void *cpu_addr)
+{
+ if (is_vmalloc_addr(cpu_addr))
+ return vmalloc_to_page(cpu_addr);
+ return virt_to_page(cpu_addr);
+}
+
/*
* Create scatter-list for the already allocated DMA buffer.
*/
@@ -12,7 +19,7 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
- struct page *page = virt_to_page(cpu_addr);
+ struct page *page = dma_common_vaddr_to_page(cpu_addr);
int ret;
ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
@@ -32,6 +39,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
unsigned long user_count = vma_pages(vma);
unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long off = vma->vm_pgoff;
+ struct page *page = dma_common_vaddr_to_page(cpu_addr);
int ret = -ENXIO;
vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
@@ -43,7 +51,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
return -ENXIO;
return remap_pfn_range(vma, vma->vm_start,
- page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff,
+ page_to_pfn(page) + vma->vm_pgoff,
user_count << PAGE_SHIFT, vma->vm_page_prot);
#else
return -ENXIO;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 464917096e73..1cb1f9b8392e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11917,6 +11917,37 @@ again:
return gctx;
}
+static bool
+perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
+{
+ unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
+ bool is_capable = perfmon_capable();
+
+ if (attr->sigtrap) {
+ /*
+ * perf_event_attr::sigtrap sends signals to the other task.
+ * Require the current task to also have CAP_KILL.
+ */
+ rcu_read_lock();
+ is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
+ rcu_read_unlock();
+
+ /*
+ * If the required capabilities aren't available, checks for
+ * ptrace permissions: upgrade to ATTACH, since sending signals
+ * can effectively change the target task.
+ */
+ ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
+ }
+
+ /*
+ * Preserve ptrace permission check for backwards compatibility. The
+ * ptrace check also includes checks that the current task and other
+ * task have matching uids, and is therefore not done here explicitly.
+ */
+ return is_capable || ptrace_may_access(task, ptrace_mode);
+}
+
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -12163,15 +12194,13 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_file;
/*
- * Preserve ptrace permission check for backwards compatibility.
- *
* We must hold exec_update_lock across this and any potential
* perf_install_in_context() call for this new event to
* serialize against exec() altering our credentials (and the
* perf_event_exit_task() that could imply).
*/
err = -EACCES;
- if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ if (!perf_check_permission(&attr, task))
goto err_cred;
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 7f04c7d8296e..a98bcfc4be7b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -265,8 +265,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
} else {
switch (__irq_startup_managed(desc, aff, force)) {
case IRQ_STARTUP_NORMAL:
+ if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)
+ irq_setup_affinity(desc);
ret = __irq_startup(desc);
- irq_setup_affinity(desc);
+ if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP))
+ irq_setup_affinity(desc);
break;
case IRQ_STARTUP_MANAGED:
irq_do_set_affinity(d, aff, false);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index c41965e348b5..85df3ca03efe 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -476,11 +476,6 @@ skip_activate:
return 0;
cleanup:
- for_each_msi_vector(desc, i, dev) {
- irq_data = irq_domain_get_irq_data(domain, i);
- if (irqd_is_activated(irq_data))
- irq_domain_deactivate_irq(irq_data);
- }
msi_domain_free_irqs(domain, dev);
return ret;
}
@@ -505,7 +500,15 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
{
+ struct irq_data *irq_data;
struct msi_desc *desc;
+ int i;
+
+ for_each_msi_vector(desc, i, dev) {
+ irq_data = irq_domain_get_irq_data(domain, i);
+ if (irqd_is_activated(irq_data))
+ irq_domain_deactivate_irq(irq_data);
+ }
for_each_msi_entry(desc, dev) {
/*
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index d309d6fbf5bd..4d2a702d7aa9 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -453,6 +453,11 @@ static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs,
*/
index = irq_timings_interval_index(interval);
+ if (index > PREDICTION_BUFFER_SIZE - 1) {
+ irqs->count = 0;
+ return;
+ }
+
/*
* Store the index as an element of the pattern in another
* circular array.
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b5d9bb5202c6..ad0db322ed3b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -343,7 +343,7 @@ static __always_inline bool
rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
enum rtmutex_chainwalk chwalk)
{
- if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEX))
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
return waiter != NULL;
return chwalk == RT_MUTEX_FULL_CHAINWALK;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d9ff40f4661..20ffcc044134 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1981,12 +1981,18 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-/*
- * __normal_prio - return the priority that is based on the static prio
- */
-static inline int __normal_prio(struct task_struct *p)
+static inline int __normal_prio(int policy, int rt_prio, int nice)
{
- return p->static_prio;
+ int prio;
+
+ if (dl_policy(policy))
+ prio = MAX_DL_PRIO - 1;
+ else if (rt_policy(policy))
+ prio = MAX_RT_PRIO - 1 - rt_prio;
+ else
+ prio = NICE_TO_PRIO(nice);
+
+ return prio;
}
/*
@@ -1998,15 +2004,7 @@ static inline int __normal_prio(struct task_struct *p)
*/
static inline int normal_prio(struct task_struct *p)
{
- int prio;
-
- if (task_has_dl_policy(p))
- prio = MAX_DL_PRIO-1;
- else if (task_has_rt_policy(p))
- prio = MAX_RT_PRIO-1 - p->rt_priority;
- else
- prio = __normal_prio(p);
- return prio;
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}
/*
@@ -4099,7 +4097,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);
- p->prio = p->normal_prio = __normal_prio(p);
+ p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
/*
@@ -6341,6 +6339,18 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
}
EXPORT_SYMBOL(default_wake_function);
+static void __setscheduler_prio(struct task_struct *p, int prio)
+{
+ if (dl_prio(prio))
+ p->sched_class = &dl_sched_class;
+ else if (rt_prio(prio))
+ p->sched_class = &rt_sched_class;
+ else
+ p->sched_class = &fair_sched_class;
+
+ p->prio = prio;
+}
+
#ifdef CONFIG_RT_MUTEXES
static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
@@ -6456,22 +6466,19 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
} else {
p->dl.pi_se = &p->dl;
}
- p->sched_class = &dl_sched_class;
} else if (rt_prio(prio)) {
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
if (oldprio < prio)
queue_flag |= ENQUEUE_HEAD;
- p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
p->dl.pi_se = &p->dl;
if (rt_prio(oldprio))
p->rt.timeout = 0;
- p->sched_class = &fair_sched_class;
}
- p->prio = prio;
+ __setscheduler_prio(p, prio);
if (queued)
enqueue_task(rq, p, queue_flag);
@@ -6824,35 +6831,6 @@ static void __setscheduler_params(struct task_struct *p,
set_load_weight(p, true);
}
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr, bool keep_boost)
-{
- /*
- * If params can't change scheduling class changes aren't allowed
- * either.
- */
- if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
- return;
-
- __setscheduler_params(p, attr);
-
- /*
- * Keep a potential priority boosting if called from
- * sched_setscheduler().
- */
- p->prio = normal_prio(p);
- if (keep_boost)
- p->prio = rt_effective_prio(p, p->prio);
-
- if (dl_prio(p->prio))
- p->sched_class = &dl_sched_class;
- else if (rt_prio(p->prio))
- p->sched_class = &rt_sched_class;
- else
- p->sched_class = &fair_sched_class;
-}
-
/*
* Check the target process has a UID that matches the current process's:
*/
@@ -6873,10 +6851,8 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
{
- int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
- MAX_RT_PRIO - 1 - attr->sched_priority;
- int retval, oldprio, oldpolicy = -1, queued, running;
- int new_effective_prio, policy = attr->sched_policy;
+ int oldpolicy = -1, policy = attr->sched_policy;
+ int retval, oldprio, newprio, queued, running;
const struct sched_class *prev_class;
struct callback_head *head;
struct rq_flags rf;
@@ -7074,6 +7050,7 @@ change:
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
if (pi) {
/*
* Take priority boosted tasks into account. If the new
@@ -7082,8 +7059,8 @@ change:
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_effective_prio(p, newprio);
- if (new_effective_prio == oldprio)
+ newprio = rt_effective_prio(p, newprio);
+ if (newprio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
@@ -7096,7 +7073,10 @@ change:
prev_class = p->sched_class;
- __setscheduler(rq, p, attr, pi);
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+ __setscheduler_params(p, attr);
+ __setscheduler_prio(p, newprio);
+ }
__setscheduler_uclamp(p, attr);
if (queued) {
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 057e17f3215d..6469eca8078c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -602,7 +602,7 @@ static inline void seccomp_sync_threads(unsigned long flags)
smp_store_release(&thread->seccomp.filter,
caller->seccomp.filter);
atomic_set(&thread->seccomp.filter_count,
- atomic_read(&thread->seccomp.filter_count));
+ atomic_read(&caller->seccomp.filter_count));
/*
* Don't let an unprivileged task work around
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index e4163042c4d6..cf6acab78538 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -47,7 +47,7 @@ void __init idle_thread_set_boot_cpu(void)
*
* Creates the thread if it does not exist.
*/
-static inline void idle_init(unsigned int cpu)
+static __always_inline void idle_init(unsigned int cpu)
{
struct task_struct *tsk = per_cpu(idle_threads, cpu);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 29a5e54e6e10..517be7fd175e 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -991,6 +991,11 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
if (!p)
goto out;
+ /* Protect timer list r/w in arm_timer() */
+ sighand = lock_task_sighand(p, &flags);
+ if (unlikely(sighand == NULL))
+ goto out;
+
/*
* Fetch the current sample and update the timer's expiry time.
*/
@@ -1001,11 +1006,6 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
bump_cpu_timer(timer, now);
- /* Protect timer list r/w in arm_timer() */
- sighand = lock_task_sighand(p, &flags);
- if (unlikely(sighand == NULL))
- goto out;
-
/*
* Now re-arm for the new expiry time.
*/
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3fadb58fc9d7..e3d2c23c413d 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -207,6 +207,7 @@ struct timer_base {
unsigned int cpu;
bool next_expiry_recalc;
bool is_idle;
+ bool timers_pending;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
@@ -595,6 +596,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
* can reevaluate the wheel:
*/
base->next_expiry = bucket_expiry;
+ base->timers_pending = true;
base->next_expiry_recalc = false;
trigger_dyntick_cpu(base, timer);
}
@@ -1263,8 +1265,10 @@ static inline void timer_base_unlock_expiry(struct timer_base *base)
static void timer_sync_wait_running(struct timer_base *base)
{
if (atomic_read(&base->timer_waiters)) {
+ raw_spin_unlock_irq(&base->lock);
spin_unlock(&base->expiry_lock);
spin_lock(&base->expiry_lock);
+ raw_spin_lock_irq(&base->lock);
}
}
@@ -1455,14 +1459,14 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
call_timer_fn(timer, fn, baseclk);
- base->running_timer = NULL;
raw_spin_lock(&base->lock);
+ base->running_timer = NULL;
} else {
raw_spin_unlock_irq(&base->lock);
call_timer_fn(timer, fn, baseclk);
+ raw_spin_lock_irq(&base->lock);
base->running_timer = NULL;
timer_sync_wait_running(base);
- raw_spin_lock_irq(&base->lock);
}
}
}
@@ -1582,6 +1586,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
}
base->next_expiry_recalc = false;
+ base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
return next;
}
@@ -1633,7 +1638,6 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
u64 expires = KTIME_MAX;
unsigned long nextevt;
- bool is_max_delta;
/*
* Pretend that there is no timer pending if the cpu is offline.
@@ -1646,7 +1650,6 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
if (base->next_expiry_recalc)
base->next_expiry = __next_timer_interrupt(base);
nextevt = base->next_expiry;
- is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
/*
* We have a fresh next event. Check whether we can forward the
@@ -1664,7 +1667,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
expires = basem;
base->is_idle = false;
} else {
- if (!is_max_delta)
+ if (base->timers_pending)
expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
* If we expect to sleep more than a tick, mark the base idle.
@@ -1947,6 +1950,7 @@ int timers_prepare_cpu(unsigned int cpu)
base = per_cpu_ptr(&timer_bases[b], cpu);
base->clk = jiffies;
base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->timers_pending = false;
base->is_idle = false;
}
return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b4916ef388ad..fdd14072fc3b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -990,28 +990,29 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_numa_node_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
- case BPF_FUNC_probe_write_user:
- return bpf_get_probe_write_proto();
case BPF_FUNC_current_task_under_cgroup:
return &bpf_current_task_under_cgroup_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_probe_write_user:
+ return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
+ NULL : bpf_get_probe_write_proto();
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_proto;
case BPF_FUNC_probe_read_user_str:
return &bpf_probe_read_user_str_proto;
case BPF_FUNC_probe_read_kernel_str:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_str_proto;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
case BPF_FUNC_probe_read:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_compat_proto;
case BPF_FUNC_probe_read_str:
- return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+ return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_compat_str_proto;
#endif
#ifdef CONFIG_CGROUPS
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 87799e2379bd..bb51849e6375 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -58,14 +58,17 @@ static struct ctl_table_root set_root = {
.permissions = set_permissions,
};
-#define UCOUNT_ENTRY(name) \
- { \
- .procname = name, \
- .maxlen = sizeof(int), \
- .mode = 0644, \
- .proc_handler = proc_dointvec_minmax, \
- .extra1 = SYSCTL_ZERO, \
- .extra2 = SYSCTL_INT_MAX, \
+static long ue_zero = 0;
+static long ue_int_max = INT_MAX;
+
+#define UCOUNT_ENTRY(name) \
+ { \
+ .procname = name, \
+ .maxlen = sizeof(long), \
+ .mode = 0644, \
+ .proc_handler = proc_doulongvec_minmax, \
+ .extra1 = &ue_zero, \
+ .extra2 = &ue_int_max, \
}
static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_user_namespaces"),
@@ -160,6 +163,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
{
struct hlist_head *hashent = ucounts_hashentry(ns, uid);
struct ucounts *ucounts, *new;
+ long overflow;
spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
@@ -184,8 +188,12 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
return new;
}
}
+ overflow = atomic_add_negative(1, &ucounts->count);
spin_unlock_irq(&ucounts_lock);
- ucounts = get_ucounts(ucounts);
+ if (overflow) {
+ put_ucounts(ucounts);
+ return NULL;
+ }
return ucounts;
}
@@ -193,8 +201,7 @@ void put_ucounts(struct ucounts *ucounts)
{
unsigned long flags;
- if (atomic_dec_and_test(&ucounts->count)) {
- spin_lock_irqsave(&ucounts_lock, flags);
+ if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) {
hlist_del_init(&ucounts->node);
spin_unlock_irqrestore(&ucounts_lock, flags);
kfree(ucounts);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 50142fc08902..f148eacda55a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3676,15 +3676,21 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
unbound_release_work);
struct workqueue_struct *wq = pwq->wq;
struct worker_pool *pool = pwq->pool;
- bool is_last;
+ bool is_last = false;
- if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
- return;
+ /*
+ * when @pwq is not linked, it doesn't hold any reference to the
+ * @wq, and @wq is invalid to access.
+ */
+ if (!list_empty(&pwq->pwqs_node)) {
+ if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
+ return;
- mutex_lock(&wq->mutex);
- list_del_rcu(&pwq->pwqs_node);
- is_last = list_empty(&wq->pwqs);
- mutex_unlock(&wq->mutex);
+ mutex_lock(&wq->mutex);
+ list_del_rcu(&pwq->pwqs_node);
+ is_last = list_empty(&wq->pwqs);
+ mutex_unlock(&wq->mutex);
+ }
mutex_lock(&wq_pool_mutex);
put_unbound_pool(pool);