summary | refs | log | tree | commit | diff
path: root/kernel
diff options
context:
space:
mode:
author: Paolo Abeni <pabeni@redhat.com> 2023-09-21 21:49:45 +0200
committer: Paolo Abeni <pabeni@redhat.com> 2023-09-21 21:49:45 +0200
commit: e9cbc89067cce78211c8629c78e931c0fe64e29d (patch)
tree: 95662373d0a7b4adc589fec61e120586d476de71 /kernel
parent: f30e5323a188cfc2d74b04f222cea0dbe9ffd6e6 (diff)
parent: 27bbf45eae9ca98877a2d52a92a188147cd61b07 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR.

No conflicts.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/bpf/btf.c | 2
-rw-r--r-- kernel/bpf/cgroup.c | 13
-rw-r--r-- kernel/bpf/memalloc.c | 94
-rw-r--r-- kernel/bpf/offload.c | 12
-rw-r--r-- kernel/bpf/queue_stack_maps.c | 21
-rw-r--r-- kernel/panic.c | 1
-rw-r--r-- kernel/power/hibernate.c | 12
-rw-r--r-- kernel/power/power.h | 4
-rw-r--r-- kernel/power/swap.c | 14
-rw-r--r-- kernel/sched/fair.c | 27
-rw-r--r-- kernel/trace/bpf_trace.c | 20
11 files changed, 184 insertions, 36 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index f93e835d90af..69101200c124 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -8519,7 +8519,7 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
tname = btf_name_by_offset(btf, walk_type->name_off);
ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix);
- if (ret < 0)
+ if (ret >= sizeof(safe_tname))
return false;
safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info));
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 5b2741aa0d9b..03b3d4492980 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -785,7 +785,8 @@ found:
* to descendants
* @cgrp: The cgroup which descendants to traverse
* @link: A link for which to replace BPF program
- * @type: Type of attach operation
+ * @new_prog: &struct bpf_prog for the target BPF program with its refcnt
+ * incremented
*
* Must be called with cgroup_mutex held.
*/
@@ -1334,7 +1335,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
* @sk: The socket sending or receiving traffic
* @skb: The skb that is being sent or received
- * @type: The type of program to be executed
+ * @atype: The type of program to be executed
*
* If no socket is passed, or the socket is not of type INET or INET6,
* this function does nothing and returns 0.
@@ -1424,7 +1425,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
/**
* __cgroup_bpf_run_filter_sk() - Run a program on a sock
* @sk: sock structure to manipulate
- * @type: The type of program to be executed
+ * @atype: The type of program to be executed
*
* socket is passed is expected to be of type INET or INET6.
*
@@ -1449,7 +1450,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* provided by user sockaddr
* @sk: sock struct that will use sockaddr
* @uaddr: sockaddr struct provided by user
- * @type: The type of program to be executed
+ * @atype: The type of program to be executed
* @t_ctx: Pointer to attach type specific context
* @flags: Pointer to u32 which contains higher bits of BPF program
* return value (OR'ed together).
@@ -1496,7 +1497,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
* sk with connection information (IP addresses, etc.) May not contain
* cgroup info if it is a req sock.
- * @type: The type of program to be executed
+ * @atype: The type of program to be executed
*
* socket passed is expected to be of type INET or INET6.
*
@@ -1670,7 +1671,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
* @ppos: value-result argument: value is position at which read from or write
* to sysctl is happening, result is new position if program overrode it,
* initial value otherwise
- * @type: type of program to be executed
+ * @atype: type of program to be executed
*
* Program is run when sysctl is being accessed, either read or written, and
* can allow or deny such access.
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 961df89d45f1..0ad175277f89 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -459,8 +459,7 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c)
* Typical case will be between 11K and 116K closer to 11K.
* bpf progs can and should share bpf_mem_cache when possible.
*/
-
-static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+static void init_refill_work(struct bpf_mem_cache *c)
{
init_irq_work(&c->refill_work, bpf_mem_refill);
if (c->unit_size <= 256) {
@@ -476,7 +475,10 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
c->high_watermark = max(96 * 256 / c->unit_size, 3);
}
c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
+}
+static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+{
/* To avoid consuming memory assume that 1st run of bpf
* prog won't be doing more than 4 map_update_elem from
* irq disabled region
@@ -484,6 +486,31 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
}
+static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx)
+{
+ struct llist_node *first;
+ unsigned int obj_size;
+
+ /* For per-cpu allocator, the size of free objects in free list doesn't
+ * match with unit_size and now there is no way to get the size of
+ * per-cpu pointer saved in free object, so just skip the checking.
+ */
+ if (c->percpu_size)
+ return 0;
+
+ first = c->free_llist.first;
+ if (!first)
+ return 0;
+
+ obj_size = ksize(first);
+ if (obj_size != c->unit_size) {
+ WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n",
+ idx, obj_size, c->unit_size);
+ return -EINVAL;
+ }
+ return 0;
+}
+
/* When size != 0 bpf_mem_cache for each cpu.
* This is typical bpf hash map use case when all elements have equal size.
*
@@ -494,10 +521,10 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+ int cpu, i, err, unit_size, percpu_size = 0;
struct bpf_mem_caches *cc, __percpu *pcc;
struct bpf_mem_cache *c, __percpu *pc;
struct obj_cgroup *objcg = NULL;
- int cpu, i, unit_size, percpu_size = 0;
/* room for llist_node and per-cpu pointer */
if (percpu)
@@ -522,6 +549,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c->objcg = objcg;
c->percpu_size = percpu_size;
c->tgt = c;
+ init_refill_work(c);
prefill_mem_cache(c, cpu);
}
ma->cache = pc;
@@ -531,6 +559,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
if (!pcc)
return -ENOMEM;
+ err = 0;
#ifdef CONFIG_MEMCG_KMEM
objcg = get_obj_cgroup_from_current();
#endif
@@ -542,11 +571,30 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c->objcg = objcg;
c->percpu_size = percpu_size;
c->tgt = c;
+
+ init_refill_work(c);
+ /* Another bpf_mem_cache will be used when allocating
+ * c->unit_size in bpf_mem_alloc(), so doesn't prefill
+ * for the bpf_mem_cache because these free objects will
+ * never be used.
+ */
+ if (i != bpf_mem_cache_idx(c->unit_size))
+ continue;
prefill_mem_cache(c, cpu);
+ err = check_obj_size(c, i);
+ if (err)
+ goto out;
}
}
+
+out:
ma->caches = pcc;
- return 0;
+ /* refill_work is either zeroed or initialized, so it is safe to
+ * call irq_work_sync().
+ */
+ if (err)
+ bpf_mem_alloc_destroy(ma);
+ return err;
}
static void drain_mem_cache(struct bpf_mem_cache *c)
@@ -924,3 +972,41 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
+
+/* Most of the logic is taken from setup_kmalloc_cache_index_table() */
+static __init int bpf_mem_cache_adjust_size(void)
+{
+ unsigned int size, index;
+
+ /* Normally KMALLOC_MIN_SIZE is 8-bytes, but it can be
+ * up-to 256-bytes.
+ */
+ size = KMALLOC_MIN_SIZE;
+ if (size <= 192)
+ index = size_index[(size - 1) / 8];
+ else
+ index = fls(size - 1) - 1;
+ for (size = 8; size < KMALLOC_MIN_SIZE && size <= 192; size += 8)
+ size_index[(size - 1) / 8] = index;
+
+ /* The minimal alignment is 64-bytes, so disable 96-bytes cache and
+ * use 128-bytes cache instead.
+ */
+ if (KMALLOC_MIN_SIZE >= 64) {
+ index = size_index[(128 - 1) / 8];
+ for (size = 64 + 8; size <= 96; size += 8)
+ size_index[(size - 1) / 8] = index;
+ }
+
+ /* The minimal alignment is 128-bytes, so disable 192-bytes cache and
+ * use 256-bytes cache instead.
+ */
+ if (KMALLOC_MIN_SIZE >= 128) {
+ index = fls(256 - 1) - 1;
+ for (size = 128 + 8; size <= 192; size += 8)
+ size_index[(size - 1) / 8] = index;
+ }
+
+ return 0;
+}
+subsys_initcall(bpf_mem_cache_adjust_size);
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 92c9df46134a..1a4fec330eaa 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -199,12 +199,14 @@ static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *n
offload->netdev = netdev;
ondev = bpf_offload_find_netdev(offload->netdev);
+ /* When program is offloaded require presence of "true"
+ * bpf_offload_netdev, avoid the one created for !ondev case below.
+ */
+ if (bpf_prog_is_offloaded(prog->aux) && (!ondev || !ondev->offdev)) {
+ err = -EINVAL;
+ goto err_free;
+ }
if (!ondev) {
- if (bpf_prog_is_offloaded(prog->aux)) {
- err = -EINVAL;
- goto err_free;
- }
-
/* When only binding to the device, explicitly
* create an entry in the hashtable.
*/
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 8d2ddcb7566b..d869f51ea93a 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -98,7 +98,12 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
int err = 0;
void *ptr;
- raw_spin_lock_irqsave(&qs->lock, flags);
+ if (in_nmi()) {
+ if (!raw_spin_trylock_irqsave(&qs->lock, flags))
+ return -EBUSY;
+ } else {
+ raw_spin_lock_irqsave(&qs->lock, flags);
+ }
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -128,7 +133,12 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
void *ptr;
u32 index;
- raw_spin_lock_irqsave(&qs->lock, flags);
+ if (in_nmi()) {
+ if (!raw_spin_trylock_irqsave(&qs->lock, flags))
+ return -EBUSY;
+ } else {
+ raw_spin_lock_irqsave(&qs->lock, flags);
+ }
if (queue_stack_map_is_empty(qs)) {
memset(value, 0, qs->map.value_size);
@@ -193,7 +203,12 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
if (flags & BPF_NOEXIST || flags > BPF_EXIST)
return -EINVAL;
- raw_spin_lock_irqsave(&qs->lock, irq_flags);
+ if (in_nmi()) {
+ if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags))
+ return -EBUSY;
+ } else {
+ raw_spin_lock_irqsave(&qs->lock, irq_flags);
+ }
if (queue_stack_map_is_full(qs)) {
if (!replace) {
diff --git a/kernel/panic.c b/kernel/panic.c
index 07239d4ad81e..ffa037fa777d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -697,6 +697,7 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint,
if (!fmt) {
__warn(file, line, __builtin_return_address(0), taint,
NULL, NULL);
+ warn_rcu_exit(rcu);
return;
}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 2b4a946a6ff5..8d35b9f9aaa3 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -786,9 +786,9 @@ int hibernate(void)
unlock_device_hotplug();
if (snapshot_test) {
pm_pr_dbg("Checking hibernation image\n");
- error = swsusp_check(snapshot_test);
+ error = swsusp_check(false);
if (!error)
- error = load_image_and_restore(snapshot_test);
+ error = load_image_and_restore(false);
}
thaw_processes();
@@ -945,14 +945,14 @@ static int software_resume(void)
pm_pr_dbg("Looking for hibernation image.\n");
mutex_lock(&system_transition_mutex);
- error = swsusp_check(false);
+ error = swsusp_check(true);
if (error)
goto Unlock;
/* The snapshot device should not be opened while we're running */
if (!hibernate_acquire()) {
error = -EBUSY;
- swsusp_close(false);
+ swsusp_close(true);
goto Unlock;
}
@@ -973,7 +973,7 @@ static int software_resume(void)
goto Close_Finish;
}
- error = load_image_and_restore(false);
+ error = load_image_and_restore(true);
thaw_processes();
Finish:
pm_notifier_call_chain(PM_POST_RESTORE);
@@ -987,7 +987,7 @@ static int software_resume(void)
pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
- swsusp_close(false);
+ swsusp_close(true);
goto Finish;
}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46eb14dc50c3..a98f95e309a3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -168,11 +168,11 @@ extern int swsusp_swap_in_use(void);
#define SF_HW_SIG 8
/* kernel/power/hibernate.c */
-int swsusp_check(bool snapshot_test);
+int swsusp_check(bool exclusive);
extern void swsusp_free(void);
extern int swsusp_read(unsigned int *flags_p);
extern int swsusp_write(unsigned int flags);
-void swsusp_close(bool snapshot_test);
+void swsusp_close(bool exclusive);
#ifdef CONFIG_SUSPEND
extern int swsusp_unmark(void);
#endif
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index f6ebcd00c410..74edbce2320b 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -1513,12 +1513,13 @@ end:
static void *swsusp_holder;
/**
- * swsusp_check - Check for swsusp signature in the resume device
+ * swsusp_check - Check for swsusp signature in the resume device
+ * @exclusive: Open the resume device exclusively.
*/
-int swsusp_check(bool snapshot_test)
+int swsusp_check(bool exclusive)
{
- void *holder = snapshot_test ? &swsusp_holder : NULL;
+ void *holder = exclusive ? &swsusp_holder : NULL;
int error;
hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ,
@@ -1563,17 +1564,18 @@ put:
}
/**
- * swsusp_close - close swap device.
+ * swsusp_close - close swap device.
+ * @exclusive: Close the resume device which is exclusively opened.
*/
-void swsusp_close(bool snapshot_test)
+void swsusp_close(bool exclusive)
{
if (IS_ERR(hib_resume_bdev)) {
pr_debug("Image device not initialised\n");
return;
}
- blkdev_put(hib_resume_bdev, snapshot_test ? &swsusp_holder : NULL);
+ blkdev_put(hib_resume_bdev, exclusive ? &swsusp_holder : NULL);
}
/**
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8dbff6e7ad4f..cb225921bbca 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6619,6 +6619,7 @@ dequeue_throttle:
/* Working cpumask for: load_balance, load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
+static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
#ifdef CONFIG_NO_HZ_COMMON
@@ -9579,7 +9580,7 @@ static inline long sibling_imbalance(struct lb_env *env,
imbalance /= ncores_local + ncores_busiest;
/* Take advantage of resource in an empty sched group */
- if (imbalance == 0 && local->sum_nr_running == 0 &&
+ if (imbalance <= 1 && local->sum_nr_running == 0 &&
busiest->sum_nr_running > 1)
imbalance = 2;
@@ -9767,6 +9768,15 @@ static bool update_sd_pick_busiest(struct lb_env *env,
break;
case group_smt_balance:
+ /*
+ * Check if we have spare CPUs on either SMT group to
+ * choose has spare or fully busy handling.
+ */
+ if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
+ goto has_spare;
+
+ fallthrough;
+
case group_fully_busy:
/*
* Select the fully busy group with highest avg_load. In
@@ -9806,6 +9816,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
else
return true;
}
+has_spare:
/*
* Select not overloaded group with lowest number of idle cpus
@@ -10917,6 +10928,7 @@ static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
+ struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
struct sched_group *sg = env->sd->groups;
int cpu, idle_smt = -1;
@@ -10940,8 +10952,9 @@ static int should_we_balance(struct lb_env *env)
return 1;
}
+ cpumask_copy(swb_cpus, group_balance_mask(sg));
/* Try to find first idle CPU */
- for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
+ for_each_cpu_and(cpu, swb_cpus, env->cpus) {
if (!idle_cpu(cpu))
continue;
@@ -10953,6 +10966,14 @@ static int should_we_balance(struct lb_env *env)
if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
if (idle_smt == -1)
idle_smt = cpu;
+ /*
+ * If the core is not idle, and first SMT sibling which is
+ * idle has been found, then its not needed to check other
+ * SMT siblings for idleness:
+ */
+#ifdef CONFIG_SCHED_SMT
+ cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
+#endif
continue;
}
@@ -12918,6 +12939,8 @@ __init void init_sched_fair_class(void)
for_each_possible_cpu(i) {
zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
+ zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
+ GFP_KERNEL, cpu_to_node(i));
#ifdef CONFIG_CFS_BANDWIDTH
INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a7264b2c17ad..868008f56fec 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2853,6 +2853,17 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3
return arr.mods_cnt;
}
+static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt)
+{
+ u32 i;
+
+ for (i = 0; i < cnt; i++) {
+ if (!within_error_injection_list(addrs[i]))
+ return -EINVAL;
+ }
+ return 0;
+}
+
int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct bpf_kprobe_multi_link *link = NULL;
@@ -2930,6 +2941,11 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
goto error;
}
+ if (prog->kprobe_override && addrs_check_error_injection_list(addrs, cnt)) {
+ err = -EINVAL;
+ goto error;
+ }
+
link = kzalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
err = -ENOMEM;
@@ -3207,8 +3223,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
rcu_read_unlock();
- if (!task)
+ if (!task) {
+ err = -ESRCH;
goto error_path_put;
+ }
}
err = -ENOMEM;