diff options
Diffstat (limited to 'kernel/bpf/memalloc.c')
| -rw-r--r-- | kernel/bpf/memalloc.c | 192 |
1 files changed, 145 insertions, 47 deletions
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 9c49ae53deaf..bd45dda9dc35 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -35,6 +35,8 @@ */ #define LLIST_NODE_SZ sizeof(struct llist_node) +#define BPF_MEM_ALLOC_SIZE_MAX 4096 + /* similar to kmalloc, but sizeof == 8 bucket is gone */ static u8 size_index[24] __ro_after_init = { 3, /* 8 */ @@ -65,7 +67,7 @@ static u8 size_index[24] __ro_after_init = { static int bpf_mem_cache_idx(size_t size) { - if (!size || size > 4096) + if (!size || size > BPF_MEM_ALLOC_SIZE_MAX) return -1; if (size <= 192) @@ -121,6 +123,8 @@ struct bpf_mem_caches { struct bpf_mem_cache cache[NUM_CACHES]; }; +static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; + static struct llist_node notrace *__llist_del_first(struct llist_head *head) { struct llist_node *entry, *next; @@ -136,8 +140,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head) static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags) { if (c->percpu_size) { - void **obj = kmalloc_node(c->percpu_size, flags, node); - void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); + void __percpu **obj = kmalloc_node(c->percpu_size, flags, node); + void __percpu *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); if (!obj || !pptr) { free_percpu(pptr); @@ -153,12 +157,9 @@ static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags) static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (c->objcg) return get_mem_cgroup_from_objcg(c->objcg); -#endif - -#ifdef CONFIG_MEMCG return root_mem_cgroup; #else return NULL; @@ -253,11 +254,8 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic) static void free_one(void *obj, bool percpu) { - if (percpu) { - free_percpu(((void **)obj)[1]); - kfree(obj); - return; - } + if (percpu) + free_percpu(((void __percpu **)obj)[1]); kfree(obj); } @@ -340,6 +338,7 @@ static void free_bulk(struct bpf_mem_cache *c) int cnt; WARN_ON_ONCE(tgt->unit_size != c->unit_size); + WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); do { inc_active(c, &flags); @@ -365,6 +364,9 @@ static void __free_by_rcu(struct rcu_head *head) struct bpf_mem_cache *tgt = c->tgt; struct llist_node *llnode; + WARN_ON_ONCE(tgt->unit_size != c->unit_size); + WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); + llnode = llist_del_all(&c->waiting_for_gp); if (!llnode) goto out; @@ -458,12 +460,17 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c) * consume ~ 11 Kbyte per cpu. * Typical case will be between 11K and 116K closer to 11K. * bpf progs can and should share bpf_mem_cache when possible. + * + * Percpu allocation is typically rare. To avoid potential unnecessary large + * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1. */ - -static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) +static void init_refill_work(struct bpf_mem_cache *c) { init_irq_work(&c->refill_work, bpf_mem_refill); - if (c->unit_size <= 256) { + if (c->percpu_size) { + c->low_watermark = 1; + c->high_watermark = 3; + } else if (c->unit_size <= 256) { c->low_watermark = 32; c->high_watermark = 96; } else { @@ -476,12 +483,20 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) c->high_watermark = max(96 * 256 / c->unit_size, 3); } c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1); +} - /* To avoid consuming memory assume that 1st run of bpf - * prog won't be doing more than 4 map_update_elem from - * irq disabled region +static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) +{ + int cnt = 1; + + /* To avoid consuming memory, for non-percpu allocation, assume that + * 1st run of bpf prog won't be doing more than 4 map_update_elem from + * irq disabled region if unit size is less than or equal to 256. + * For all other cases, let us just do one allocation. */ - alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false); + if (!c->percpu_size && c->unit_size <= 256) + cnt = 4; + alloc_bulk(c, cnt, cpu_to_node(cpu), false); } /* When size != 0 bpf_mem_cache for each cpu. @@ -493,61 +508,119 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { - static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; - struct bpf_mem_caches *cc, __percpu *pcc; - struct bpf_mem_cache *c, __percpu *pc; + struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc; + struct bpf_mem_cache *c; struct bpf_mem_cache __percpu *pc; struct obj_cgroup *objcg = NULL; int cpu, i, unit_size, percpu_size = 0; + if (percpu && size == 0) + return -EINVAL; + + /* room for llist_node and per-cpu pointer */ + if (percpu) + percpu_size = LLIST_NODE_SZ + sizeof(void *); + ma->percpu = percpu; + if (size) { pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); if (!pc) return -ENOMEM; - if (percpu) - /* room for llist_node and per-cpu pointer */ - percpu_size = LLIST_NODE_SZ + sizeof(void *); - else + if (!percpu) size += LLIST_NODE_SZ; /* room for llist_node */ unit_size = size; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (memcg_bpf_enabled()) objcg = get_obj_cgroup_from_current(); #endif + ma->objcg = objcg; + for_each_possible_cpu(cpu) { c = per_cpu_ptr(pc, cpu); c->unit_size = unit_size; c->objcg = objcg; c->percpu_size = percpu_size; c->tgt = c; + init_refill_work(c); prefill_mem_cache(c, cpu); } ma->cache = pc; return 0; } - /* size == 0 && percpu is an invalid combination */ - if (WARN_ON_ONCE(percpu)) - return -EINVAL; - pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); if (!pcc) return -ENOMEM; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG objcg = get_obj_cgroup_from_current(); #endif + ma->objcg = objcg; for_each_possible_cpu(cpu) { cc = per_cpu_ptr(pcc, cpu); for (i = 0; i < NUM_CACHES; i++) { c = &cc->cache[i]; c->unit_size = sizes[i]; c->objcg = objcg; + c->percpu_size = percpu_size; c->tgt = c; + + init_refill_work(c); prefill_mem_cache(c, cpu); } } + + ma->caches = pcc; + return 0; +} + +int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg) +{ + struct bpf_mem_caches __percpu *pcc; + + pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL); + if (!pcc) + return -ENOMEM; + ma->caches = pcc; + ma->objcg = objcg; + ma->percpu = true; + return 0; +} + +int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) +{ + struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc; + int cpu, i, unit_size, percpu_size; + struct obj_cgroup *objcg; + struct bpf_mem_cache *c; + + i = bpf_mem_cache_idx(size); + if (i < 0) + return -EINVAL; + + /* room for llist_node and per-cpu pointer */ + percpu_size = LLIST_NODE_SZ + sizeof(void *); + + unit_size = sizes[i]; + objcg = ma->objcg; + pcc = ma->caches; + + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(pcc, cpu); + c = &cc->cache[i]; + if (c->unit_size) + break; + + c->unit_size = unit_size; + c->objcg = objcg; + c->percpu_size = percpu_size; + c->tgt = c; + + init_refill_work(c); + prefill_mem_cache(c, cpu); + } + return 0; } @@ -663,7 +736,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) /* Defer barriers into worker to let the rest of map memory to be freed */ memset(ma, 0, sizeof(*ma)); INIT_WORK(©->work, free_mem_alloc_deferred); - queue_work(system_unbound_wq, ©->work); + queue_work(system_dfl_wq, ©->work); } void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) @@ -682,9 +755,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } - /* objcg is the same across cpus */ - if (c->objcg) - obj_cgroup_put(c->objcg); + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } if (ma->caches) { @@ -700,8 +771,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } - if (c->objcg) - obj_cgroup_put(c->objcg); + obj_cgroup_put(ma->objcg); destroy_mem_alloc(ma, rcu_in_progress); } } @@ -734,12 +804,17 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c) } } local_dec(&c->active); - local_irq_restore(flags); WARN_ON(cnt < 0); if (cnt < c->low_watermark) irq_work_raise(c); + /* Enable IRQ after the enqueue of irq work completes, so irq work + * will run after IRQ is enabled and free_llist may be refilled by + * irq work before other task preempts current task. + */ + local_irq_restore(flags); + return llnode; } @@ -775,11 +850,16 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) llist_add(llnode, &c->free_llist_extra); } local_dec(&c->active); - local_irq_restore(flags); if (cnt > c->high_watermark) /* free few objects from current cpu into global kmalloc pool */ irq_work_raise(c); + /* Enable IRQ after irq_work_raise() completes, otherwise when current + * task is preempted by task which does unit_alloc(), unit_alloc() may + * return NULL unexpectedly because irq work is already pending but can + * not been triggered and free_llist can not be refilled timely. + */ + local_irq_restore(flags); } static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) @@ -797,10 +877,10 @@ static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) llist_add(llnode, &c->free_llist_extra_rcu); } local_dec(&c->active); - local_irq_restore(flags); if (!atomic_read(&c->call_rcu_in_progress)) irq_work_raise(c); + local_irq_restore(flags); } /* Called from BPF program or from sys_bpf syscall. @@ -812,9 +892,11 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) void *ret; if (!size) - return ZERO_SIZE_PTR; + return NULL; - idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ); + if (!ma->percpu) + size += LLIST_NODE_SZ; + idx = bpf_mem_cache_idx(size); if (idx < 0) return NULL; @@ -824,13 +906,15 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) { + struct bpf_mem_cache *c; int idx; if (!ptr) return; - idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); - if (idx < 0) + c = *(void **)(ptr - LLIST_NODE_SZ); + idx = bpf_mem_cache_idx(c->unit_size); + if (WARN_ON_ONCE(idx < 0)) return; unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); @@ -838,13 +922,15 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) { + struct bpf_mem_cache *c; int idx; if (!ptr) return; - idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); - if (idx < 0) + c = *(void **)(ptr - LLIST_NODE_SZ); + idx = bpf_mem_cache_idx(c->unit_size); + if (WARN_ON_ONCE(idx < 0)) return; unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr); @@ -910,9 +996,21 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) memcg = get_memcg(c); old_memcg = set_active_memcg(memcg); ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT); + if (ret) + *(struct bpf_mem_cache **)ret = c; set_active_memcg(old_memcg); mem_cgroup_put(memcg); } return !ret ? NULL : ret + LLIST_NODE_SZ; } + +int bpf_mem_alloc_check_size(bool percpu, size_t size) +{ + /* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */ + if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) || + (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ)) + return -E2BIG; + + return 0; +} |
