diff options
Diffstat (limited to 'kernel/bpf/syscall.c')
-rw-r--r-- | kernel/bpf/syscall.c | 709 |
1 files changed, 443 insertions, 266 deletions
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c287925471f6..dd5304c6ac3c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,6 +35,8 @@ #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> #include <linux/trace_events.h> +#include <linux/tracepoint.h> +#include <linux/overflow.h> #include <net/netfilter/nf_bpf_link.h> #include <net/netkit.h> @@ -155,6 +157,89 @@ static void maybe_wait_bpf_programs(struct bpf_map *map) synchronize_rcu(); } +static void unpin_uptr_kaddr(void *kaddr) +{ + if (kaddr) + unpin_user_page(virt_to_page(kaddr)); +} + +static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj) +{ + const struct btf_field *field; + void **uptr_addr; + int i; + + for (i = 0, field = rec->fields; i < cnt; i++, field++) { + if (field->type != BPF_UPTR) + continue; + + uptr_addr = obj + field->offset; + unpin_uptr_kaddr(*uptr_addr); + } +} + +static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj) +{ + if (!btf_record_has_field(rec, BPF_UPTR)) + return; + + __bpf_obj_unpin_uptrs(rec, rec->cnt, obj); +} + +static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj) +{ + const struct btf_field *field; + const struct btf_type *t; + unsigned long start, end; + struct page *page; + void **uptr_addr; + int i, err; + + if (!btf_record_has_field(rec, BPF_UPTR)) + return 0; + + for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) { + if (field->type != BPF_UPTR) + continue; + + uptr_addr = obj + field->offset; + start = *(unsigned long *)uptr_addr; + if (!start) + continue; + + t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id); + /* t->size was checked for zero before */ + if (check_add_overflow(start, t->size - 1, &end)) { + err = -EFAULT; + goto unpin_all; + } + + /* The uptr's struct cannot span across two pages */ + if ((start & PAGE_MASK) != (end & PAGE_MASK)) { + err = -EOPNOTSUPP; + goto unpin_all; + } + + err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page); + if (err != 1) + goto unpin_all; + + if (PageHighMem(page)) { + err = -EOPNOTSUPP; + unpin_user_page(page); + goto unpin_all; + } + + *uptr_addr = page_address(page) + offset_in_page(start); + } + + return 0; + +unpin_all: + __bpf_obj_unpin_uptrs(rec, i, obj); + return err; +} + static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, void *key, void *value, __u64 flags) { @@ -199,9 +284,14 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { err = map->ops->map_push_elem(map, value, flags); } else { - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, flags); - rcu_read_unlock(); + err = bpf_obj_pin_uptrs(map->record, value); + if (!err) { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, flags); + rcu_read_unlock(); + if (err) + bpf_obj_unpin_uptrs(map->record, value); + } } bpf_enable_instrumentation(); @@ -385,7 +475,7 @@ void bpf_map_free_id(struct bpf_map *map) spin_unlock_irqrestore(&map_idr_lock, flags); } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG static void bpf_map_save_memcg(struct bpf_map *map) { /* Currently if a map is created by a process belonging to the root @@ -480,32 +570,49 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, +static bool can_alloc_pages(void) +{ + return preempt_count() == 0 && !irqs_disabled() && + !IS_ENABLED(CONFIG_PREEMPT_RT); +} + +static struct page *__bpf_alloc_page(int nid) +{ + if (!can_alloc_pages()) + return alloc_pages_nolock(nid, 0); + + return alloc_pages_node(nid, + GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT + | __GFP_NOWARN, + 0); +} + +int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **pages) { unsigned long i, j; struct page *pg; int ret = 0; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG struct mem_cgroup *memcg, *old_memcg; memcg = bpf_map_get_memcg(map); old_memcg = set_active_memcg(memcg); #endif for (i = 0; i < nr_pages; i++) { - pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0); + pg = __bpf_alloc_page(nid); if (pg) { pages[i] = pg; continue; } for (j = 0; j < i; j++) - __free_page(pages[j]); + free_pages_nolock(pages[j], 0); ret = -ENOMEM; break; } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG set_active_memcg(old_memcg); mem_cgroup_put(memcg); #endif @@ -548,17 +655,21 @@ void btf_record_free(struct btf_record *rec) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: + case BPF_UPTR: if (rec->fields[i].kptr.module) module_put(rec->fields[i].kptr.module); - btf_put(rec->fields[i].kptr.btf); + if (btf_is_kernel(rec->fields[i].kptr.btf)) + btf_put(rec->fields[i].kptr.btf); break; case BPF_LIST_HEAD: case BPF_LIST_NODE: case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: + case BPF_WORKQUEUE: /* Nothing to release */ break; default: @@ -583,7 +694,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) if (IS_ERR_OR_NULL(rec)) return NULL; - size = offsetof(struct btf_record, fields[rec->cnt]); + size = struct_size(rec, fields, rec->cnt); new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); if (!new_rec) return ERR_PTR(-ENOMEM); @@ -595,7 +706,9 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: - btf_get(fields[i].kptr.btf); + case BPF_UPTR: + if (btf_is_kernel(fields[i].kptr.btf)) + btf_get(fields[i].kptr.btf); if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { ret = -ENXIO; goto free; @@ -606,8 +719,10 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: + case BPF_WORKQUEUE: /* Nothing to acquire */ break; default: @@ -634,7 +749,7 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r return false; if (rec_a->cnt != rec_b->cnt) return false; - size = offsetof(struct btf_record, fields[rec_a->cnt]); + size = struct_size(rec_a, fields, rec_a->cnt); /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused * members are zeroed out. So memcmp is safe to do without worrying * about padding/unused fields. @@ -659,6 +774,13 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(obj + rec->timer_off); } +void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE))) + return; + bpf_wq_cancel_and_free(obj + rec->wq_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -675,10 +797,14 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) switch (fields[i].type) { case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: break; case BPF_TIMER: bpf_timer_cancel_and_free(field_ptr); break; + case BPF_WORKQUEUE: + bpf_wq_cancel_and_free(field_ptr); + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -691,15 +817,17 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) if (!btf_is_kernel(field->kptr.btf)) { pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, field->kptr.btf_id); - migrate_disable(); __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? pointee_struct_meta->record : NULL, fields[i].type == BPF_KPTR_PERCPU); - migrate_enable(); } else { field->kptr.dtor(xchgd_field); } break; + case BPF_UPTR: + /* The caller ensured that no one is using the uptr */ + unpin_uptr_kaddr(*(void **)field_ptr); + break; case BPF_LIST_HEAD: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) continue; @@ -721,17 +849,19 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) } } -/* called from workqueue */ -static void bpf_map_free_deferred(struct work_struct *work) +static void bpf_map_free(struct bpf_map *map) { - struct bpf_map *map = container_of(work, struct bpf_map, work); struct btf_record *rec = map->record; struct btf *btf = map->btf; - security_bpf_map_free(map); - bpf_map_release_memcg(map); - /* implementation dependent freeing */ + /* implementation dependent freeing. Disabling migration to simplify + * the free of values or special fields allocated from bpf memory + * allocator. + */ + migrate_disable(); map->ops->map_free(map); + migrate_enable(); + /* Delay freeing of btf_record for maps, as map_free * callback usually needs access to them. It is better to do it here * than require each callback to do the free itself manually. @@ -748,6 +878,16 @@ static void bpf_map_free_deferred(struct work_struct *work) btf_put(btf); } +/* called from workqueue */ +static void bpf_map_free_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_map, work); + + security_bpf_map_free(map); + bpf_map_release_memcg(map); + bpf_map_free(map); +} + static void bpf_map_put_uref(struct bpf_map *map) { if (atomic64_dec_and_test(&map->usercnt)) { @@ -817,7 +957,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp) static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) { - fmode_t mode = f.file->f_mode; + fmode_t mode = fd_file(f)->f_mode; /* Our file permissions may have been overridden by global * map permissions facing syscall side. @@ -916,7 +1056,7 @@ static const struct vm_operations_struct bpf_map_default_vmops = { static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) { struct bpf_map *map = filp->private_data; - int err; + int err = 0; if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; @@ -940,24 +1080,33 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) err = -EACCES; goto out; } + bpf_map_write_active_inc(map); } +out: + mutex_unlock(&map->freeze_mutex); + if (err) + return err; /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; vm_flags_clear(vma, VM_MAYEXEC); + /* If mapping is read-only, then disallow potentially re-mapping with + * PROT_WRITE by dropping VM_MAYWRITE flag. This VM_MAYWRITE clearing + * means that as far as BPF map's memory-mapped VMAs are concerned, + * VM_WRITE and VM_MAYWRITE and equivalent, if one of them is set, + * both should be set, so we can forget about VM_MAYWRITE and always + * check just VM_WRITE + */ if (!(vma->vm_flags & VM_WRITE)) - /* disallow re-mapping with PROT_WRITE */ vm_flags_clear(vma, VM_MAYWRITE); err = map->ops->map_mmap(map, vma); - if (err) - goto out; + if (err) { + if (vma->vm_flags & VM_WRITE) + bpf_map_write_active_dec(map); + } - if (vma->vm_flags & VM_MAYWRITE) - bpf_map_write_active_inc(map); -out: - mutex_unlock(&map->freeze_mutex); return err; } @@ -980,7 +1129,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr if (map->ops->map_get_unmapped_area) return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); #ifdef CONFIG_MMU - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); #else return addr; #endif @@ -1084,8 +1233,8 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, return -EINVAL; map->record = btf_parse_fields(btf, value_type, - BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT, + BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1103,6 +1252,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case 0: continue; case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && @@ -1115,6 +1265,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, } break; case BPF_TIMER: + case BPF_WORKQUEUE: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { @@ -1140,6 +1291,12 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, goto free_map_tab; } break; + case BPF_UPTR: + if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; case BPF_LIST_HEAD: case BPF_RB_ROOT: if (map->map_type != BPF_MAP_TYPE_HASH && @@ -1180,7 +1337,7 @@ static bool bpf_net_capable(void) #define BPF_MAP_CREATE_LAST_FIELD map_token_fd /* called via syscall */ -static int map_create(union bpf_attr *attr) +static int map_create(union bpf_attr *attr, bool kernel) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1370,7 +1527,7 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_create(map, attr, token); + err = security_bpf_map_create(map, attr, token, kernel); if (err) goto free_map_sec; @@ -1398,28 +1555,12 @@ static int map_create(union bpf_attr *attr) free_map_sec: security_bpf_map_free(map); free_map: - btf_put(map->btf); - map->ops->map_free(map); + bpf_map_free(map); put_token: bpf_token_put(token); return err; } -/* if error is returned, fd is released. - * On success caller should complete fd access with matching fdput() - */ -struct bpf_map *__bpf_map_get(struct fd f) -{ - if (!f.file) - return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_map_fops) { - fdput(f); - return ERR_PTR(-EINVAL); - } - - return f.file->private_data; -} - void bpf_map_inc(struct bpf_map *map) { atomic64_inc(&map->refcnt); @@ -1435,31 +1576,23 @@ EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); struct bpf_map *bpf_map_get(u32 ufd) { - struct fd f = fdget(ufd); - struct bpf_map *map; - - map = __bpf_map_get(f); - if (IS_ERR(map)) - return map; + CLASS(fd, f)(ufd); + struct bpf_map *map = __bpf_map_get(f); - bpf_map_inc(map); - fdput(f); + if (!IS_ERR(map)) + bpf_map_inc(map); return map; } -EXPORT_SYMBOL(bpf_map_get); +EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL"); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { - struct fd f = fdget(ufd); - struct bpf_map *map; - - map = __bpf_map_get(f); - if (IS_ERR(map)) - return map; + CLASS(fd, f)(ufd); + struct bpf_map *map = __bpf_map_get(f); - bpf_map_inc_with_uref(map); - fdput(f); + if (!IS_ERR(map)) + bpf_map_inc_with_uref(map); return map; } @@ -1482,11 +1615,8 @@ struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) { - spin_lock_bh(&map_idr_lock); - map = __bpf_map_inc_not_zero(map, false); - spin_unlock_bh(&map_idr_lock); - - return map; + lockdep_assert(rcu_read_lock_held()); + return __bpf_map_inc_not_zero(map, false); } EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); @@ -1524,11 +1654,9 @@ static int map_lookup_elem(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key); void __user *uvalue = u64_to_user_ptr(attr->value); - int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; u32 value_size; - struct fd f; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) @@ -1537,26 +1665,20 @@ static int map_lookup_elem(union bpf_attr *attr) if (attr->flags & ~BPF_F_LOCK) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { - err = -EPERM; - goto err_put; - } + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) + return -EPERM; if ((attr->flags & BPF_F_LOCK) && - !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { - err = -EINVAL; - goto err_put; - } + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) + return -EINVAL; key = __bpf_copy_key(ukey, map->key_size); - if (IS_ERR(key)) { - err = PTR_ERR(key); - goto err_put; - } + if (IS_ERR(key)) + return PTR_ERR(key); value_size = bpf_map_value_size(map); @@ -1587,8 +1709,6 @@ free_value: kvfree(value); free_key: kvfree(key); -err_put: - fdput(f); return err; } @@ -1599,17 +1719,15 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) { bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel); - int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; u32 value_size; - struct fd f; int err; if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -1638,7 +1756,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto free_key; } - err = bpf_map_update_value(map, f.file, key, value, attr->flags); + err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags); if (!err) maybe_wait_bpf_programs(map); @@ -1647,7 +1765,6 @@ free_key: kvfree(key); err_put: bpf_map_write_active_dec(map); - fdput(f); return err; } @@ -1656,16 +1773,14 @@ err_put: static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) { bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); - int ufd = attr->map_fd; struct bpf_map *map; - struct fd f; void *key; int err; if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -1702,7 +1817,6 @@ out: kvfree(key); err_put: bpf_map_write_active_dec(map); - fdput(f); return err; } @@ -1713,30 +1827,24 @@ static int map_get_next_key(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key); void __user *unext_key = u64_to_user_ptr(attr->next_key); - int ufd = attr->map_fd; struct bpf_map *map; void *key, *next_key; - struct fd f; int err; if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { - err = -EPERM; - goto err_put; - } + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) + return -EPERM; if (ukey) { key = __bpf_copy_key(ukey, map->key_size); - if (IS_ERR(key)) { - err = PTR_ERR(key); - goto err_put; - } + if (IS_ERR(key)) + return PTR_ERR(key); } else { key = NULL; } @@ -1768,8 +1876,6 @@ free_next_key: kvfree(next_key); free_key: kvfree(key); -err_put: - fdput(f); return err; } @@ -1890,8 +1996,6 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, return err; } -#define MAP_LOOKUP_RETRIES 3 - int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -1901,8 +2005,8 @@ int generic_map_lookup_batch(struct bpf_map *map, void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); void *buf, *buf_prevkey, *prev_key, *key, *value; - int err, retry = MAP_LOOKUP_RETRIES; u32 value_size, cp, max_count; + int err; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; @@ -1948,14 +2052,8 @@ int generic_map_lookup_batch(struct bpf_map *map, err = bpf_map_copy_value(map, key, value, attr->batch.elem_flags); - if (err == -ENOENT) { - if (retry) { - retry--; - continue; - } - err = -EINTR; - break; - } + if (err == -ENOENT) + goto next_key; if (err) goto free_buf; @@ -1970,12 +2068,12 @@ int generic_map_lookup_batch(struct bpf_map *map, goto free_buf; } + cp++; +next_key: if (!prev_key) prev_key = buf_prevkey; swap(prev_key, key); - retry = MAP_LOOKUP_RETRIES; - cp++; cond_resched(); } @@ -1998,11 +2096,9 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key); void __user *uvalue = u64_to_user_ptr(attr->value); - int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; u32 value_size; - struct fd f; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) @@ -2011,7 +2107,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) if (attr->flags & ~BPF_F_LOCK) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -2081,7 +2177,6 @@ free_key: kvfree(key); err_put: bpf_map_write_active_dec(map); - fdput(f); return err; } @@ -2089,27 +2184,22 @@ err_put: static int map_freeze(const union bpf_attr *attr) { - int err = 0, ufd = attr->map_fd; + int err = 0; struct bpf_map *map; - struct fd f; if (CHECK_ATTR(BPF_MAP_FREEZE)) return -EINVAL; - f = fdget(ufd); + CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) { - fdput(f); + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; - } - if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { - fdput(f); + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) return -EPERM; - } mutex_lock(&map->freeze_mutex); if (bpf_map_write_active(map)) { @@ -2124,7 +2214,6 @@ static int map_freeze(const union bpf_attr *attr) WRITE_ONCE(map->frozen, true); err_put: mutex_unlock(&map->freeze_mutex); - fdput(f); return err; } @@ -2244,6 +2333,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) kvfree(prog->aux->jited_linfo); kvfree(prog->aux->linfo); kfree(prog->aux->kfunc_tab); + kfree(prog->aux->ctx_arg_info); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); @@ -2394,18 +2484,6 @@ int bpf_prog_new_fd(struct bpf_prog *prog) O_RDWR | O_CLOEXEC); } -static struct bpf_prog *____bpf_prog_get(struct fd f) -{ - if (!f.file) - return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_prog_fops) { - fdput(f); - return ERR_PTR(-EINVAL); - } - - return f.file->private_data; -} - void bpf_prog_add(struct bpf_prog *prog, int i) { atomic64_add(i, &prog->aux->refcnt); @@ -2461,20 +2539,19 @@ bool bpf_prog_get_ok(struct bpf_prog *prog, static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, bool attach_drv) { - struct fd f = fdget(ufd); + CLASS(fd, f)(ufd); struct bpf_prog *prog; - prog = ____bpf_prog_get(f); - if (IS_ERR(prog)) - return prog; - if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) { - prog = ERR_PTR(-EINVAL); - goto out; - } + if (fd_empty(f)) + return ERR_PTR(-EBADF); + if (fd_file(f)->f_op != &bpf_prog_fops) + return ERR_PTR(-EINVAL); + + prog = fd_file(f)->private_data; + if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) + return ERR_PTR(-EINVAL); bpf_prog_inc(prog); -out: - fdput(f); return prog; } @@ -2678,7 +2755,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd +#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { @@ -2887,7 +2964,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (err < 0) goto free_prog; - err = security_bpf_prog_load(prog, attr, token); + err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel); if (err) goto free_prog_sec; @@ -2982,16 +3059,33 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, - const struct bpf_link_ops *ops, struct bpf_prog *prog) +/* bpf_link_init_sleepable() allows to specify whether BPF link itself has + * "sleepable" semantics, which normally would mean that BPF link's attach + * hook can dereference link or link's underlying program for some time after + * detachment due to RCU Tasks Trace-based lifetime protection scheme. + * BPF program itself can be non-sleepable, yet, because it's transitively + * reachable through BPF link, its freeing has to be delayed until after RCU + * Tasks Trace GP. + */ +void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + bool sleepable) { + WARN_ON(ops->dealloc && ops->dealloc_deferred); atomic64_set(&link->refcnt, 1); link->type = type; + link->sleepable = sleepable; link->id = 0; link->ops = ops; link->prog = prog; } +void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog) +{ + bpf_link_init_sleepable(link, type, ops, prog, false); +} + static void bpf_link_free_id(int id) { if (!id) @@ -3024,12 +3118,24 @@ void bpf_link_inc(struct bpf_link *link) atomic64_inc(&link->refcnt); } +static void bpf_link_dealloc(struct bpf_link *link) +{ + /* now that we know that bpf_link itself can't be reached, put underlying BPF program */ + if (link->prog) + bpf_prog_put(link->prog); + + /* free bpf_link and its containing memory */ + if (link->ops->dealloc_deferred) + link->ops->dealloc_deferred(link); + else + link->ops->dealloc(link); +} + static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) { struct bpf_link *link = container_of(rcu, struct bpf_link, rcu); - /* free bpf_link and its containing memory */ - link->ops->dealloc_deferred(link); + bpf_link_dealloc(link); } static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) @@ -3043,27 +3149,28 @@ static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { - bool sleepable = false; + const struct bpf_link_ops *ops = link->ops; bpf_link_free_id(link->id); - if (link->prog) { - sleepable = link->prog->sleepable; - /* detach BPF program, clean up used resources */ - link->ops->release(link); - bpf_prog_put(link->prog); - } - if (link->ops->dealloc_deferred) { - /* schedule BPF link deallocation; if underlying BPF program - * is sleepable, we need to first wait for RCU tasks trace - * sync, then go through "classic" RCU grace period + /* detach BPF program, clean up used resources */ + if (link->prog) + ops->release(link); + if (ops->dealloc_deferred) { + /* Schedule BPF link deallocation, which will only then + * trigger putting BPF program refcount. + * If underlying BPF program is sleepable or BPF link's target + * attach hookpoint is sleepable or otherwise requires RCU GPs + * to ensure link and its underlying BPF program is not + * reachable anymore, we need to first wait for RCU tasks + * trace sync, and then go through "classic" RCU grace period */ - if (sleepable) + if (link->sleepable || (link->prog && link->prog->sleepable)) call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); else call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); + } else if (ops->dealloc) { + bpf_link_dealloc(link); } - if (link->ops->dealloc) - link->ops->dealloc(link); } static void bpf_link_put_deferred(struct work_struct *work) @@ -3117,13 +3224,17 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; const struct bpf_prog *prog = link->prog; + enum bpf_link_type type = link->type; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - seq_printf(m, - "link_type:\t%s\n" - "link_id:\t%u\n", - bpf_link_type_strs[link->type], - link->id); + if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) { + seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); + } else { + WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type); + seq_printf(m, "link_type:\t<%u>\n", type); + } + seq_printf(m, "link_id:\t%u\n", link->id); + if (prog) { bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, @@ -3137,6 +3248,13 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) } #endif +static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts) +{ + struct bpf_link *link = file->private_data; + + return link->ops->poll(file, pts); +} + static const struct file_operations bpf_link_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_link_show_fdinfo, @@ -3146,6 +3264,16 @@ static const struct file_operations bpf_link_fops = { .write = bpf_dummy_write, }; +static const struct file_operations bpf_link_fops_poll = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_link_show_fdinfo, +#endif + .release = bpf_link_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, + .poll = bpf_link_poll, +}; + static int bpf_link_alloc_id(struct bpf_link *link) { int id; @@ -3188,7 +3316,9 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) return id; } - file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); + file = anon_inode_getfile("bpf_link", + link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops, + link, O_CLOEXEC); if (IS_ERR(file)) { bpf_link_free_id(id); put_unused_fd(fd); @@ -3216,28 +3346,26 @@ int bpf_link_settle(struct bpf_link_primer *primer) int bpf_link_new_fd(struct bpf_link *link) { - return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); + return anon_inode_getfd("bpf-link", + link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops, + link, O_CLOEXEC); } struct bpf_link *bpf_link_get_from_fd(u32 ufd) { - struct fd f = fdget(ufd); + CLASS(fd, f)(ufd); struct bpf_link *link; - if (!f.file) + if (fd_empty(f)) return ERR_PTR(-EBADF); - if (f.file->f_op != &bpf_link_fops) { - fdput(f); + if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll) return ERR_PTR(-EINVAL); - } - link = f.file->private_data; + link = fd_file(f)->private_data; bpf_link_inc(link); - fdput(f); - return link; } -EXPORT_SYMBOL(bpf_link_get_from_fd); +EXPORT_SYMBOL_NS(bpf_link_get_from_fd, "BPF_INTERNAL"); static void bpf_tracing_link_release(struct bpf_link *link) { @@ -3245,7 +3373,8 @@ static void bpf_tracing_link_release(struct bpf_link *link) container_of(link, struct bpf_tracing_link, link.link); WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, - tr_link->trampoline)); + tr_link->trampoline, + tr_link->tgt_prog)); bpf_trampoline_put(tr_link->trampoline); @@ -3385,7 +3514,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, * in prog->aux * * - if prog->aux->dst_trampoline is NULL, the program has already been - * attached to a target and its initial target was cleared (below) + * attached to a target and its initial target was cleared (below) * * - if tgt_prog != NULL, the caller specified tgt_prog_fd + * target_btf_id using the link_create API. @@ -3460,7 +3589,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, if (err) goto out_unlock; - err = bpf_trampoline_link_prog(&link->link, tr); + err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog); if (err) { bpf_link_cleanup(&link_primer); link = NULL; @@ -3498,17 +3627,12 @@ out_put_prog: return err; } -struct bpf_raw_tp_link { - struct bpf_link link; - struct bpf_raw_event_map *btp; -}; - static void bpf_raw_tp_link_release(struct bpf_link *link) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); - bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); + bpf_probe_unregister(raw_tp->btp, raw_tp); bpf_put_raw_tracepoint(raw_tp->btp); } @@ -3601,15 +3725,16 @@ static void bpf_perf_link_dealloc(struct bpf_link *link) } static int bpf_perf_link_fill_common(const struct perf_event *event, - char __user *uname, u32 ulen, + char __user *uname, u32 *ulenp, u64 *probe_offset, u64 *probe_addr, u32 *fd_type, unsigned long *missed) { const char *buf; - u32 prog_id; + u32 prog_id, ulen; size_t len; int err; + ulen = *ulenp; if (!ulen ^ !uname) return -EINVAL; @@ -3617,10 +3742,17 @@ static int bpf_perf_link_fill_common(const struct perf_event *event, probe_offset, probe_addr, missed); if (err) return err; + + if (buf) { + len = strlen(buf); + *ulenp = len + 1; + } else { + *ulenp = 1; + } if (!uname) return 0; + if (buf) { - len = strlen(buf); err = bpf_copy_to_user(uname, buf, ulen, len); if (err) return err; @@ -3645,7 +3777,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); ulen = info->perf_event.kprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, &missed); if (err) return err; @@ -3653,7 +3785,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_KRETPROBE; else info->perf_event.type = BPF_PERF_EVENT_KPROBE; - + info->perf_event.kprobe.name_len = ulen; info->perf_event.kprobe.offset = offset; info->perf_event.kprobe.missed = missed; if (!kallsyms_show_value(current_cred())) @@ -3668,14 +3800,14 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, static int bpf_perf_link_fill_uprobe(const struct perf_event *event, struct bpf_link_info *info) { + u64 ref_ctr_offset, offset; char __user *uname; - u64 addr, offset; u32 ulen, type; int err; uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &ref_ctr_offset, &type, NULL); if (err) return err; @@ -3684,8 +3816,10 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_URETPROBE; else info->perf_event.type = BPF_PERF_EVENT_UPROBE; + info->perf_event.uprobe.name_len = ulen; info->perf_event.uprobe.offset = offset; info->perf_event.uprobe.cookie = event->bpf_cookie; + info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset; return 0; } #endif @@ -3709,12 +3843,18 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, { char __user *uname; u32 ulen; + int err; uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); ulen = info->perf_event.tracepoint.name_len; + err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL); + if (err) + return err; + info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; + info->perf_event.tracepoint.name_len = ulen; info->perf_event.tracepoint.cookie = event->bpf_cookie; - return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL); + return 0; } static int bpf_perf_link_fill_perf_event(const struct perf_event *event, @@ -3808,7 +3948,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro #endif /* CONFIG_PERF_EVENTS */ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, - const char __user *user_tp_name) + const char __user *user_tp_name, u64 cookie) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; @@ -3852,9 +3992,11 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, - &bpf_raw_tp_link_lops, prog); + bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, + &bpf_raw_tp_link_lops, prog, + tracepoint_is_faultable(btp->tp)); link->btp = btp; + link->cookie = cookie; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -3862,7 +4004,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, goto out_put_btp; } - err = bpf_probe_register(link->btp, prog); + err = bpf_probe_register(link->btp, link); if (err) { bpf_link_cleanup(&link_primer); goto out_put_btp; @@ -3875,11 +4017,13 @@ out_put_btp: return err; } -#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { struct bpf_prog *prog; + void __user *tp_name; + __u64 cookie; int fd; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) @@ -3889,7 +4033,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); - fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name)); + tp_name = u64_to_user_ptr(attr->raw_tracepoint.name); + cookie = attr->raw_tracepoint.cookie; + fd = bpf_raw_tp_link_attach(prog, tp_name, cookie); if (fd < 0) bpf_prog_put(prog); return fd; @@ -3985,6 +4131,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, * check permissions at attach time. */ return -EPERM; + + ptype = attach_type_to_prog_type(attach_type); + if (prog->type != ptype) + return -EINVAL; + return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; @@ -4003,12 +4154,20 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_MULTI) return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION && + attach_type != BPF_TRACE_KPROBE_SESSION) + return -EINVAL; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION && + attach_type != BPF_TRACE_UPROBE_SESSION) + return -EINVAL; if (attach_type != BPF_PERF_EVENT && attach_type != BPF_TRACE_KPROBE_MULTI && - attach_type != BPF_TRACE_UPROBE_MULTI) + attach_type != BPF_TRACE_KPROBE_SESSION && + attach_type != BPF_TRACE_UPROBE_MULTI && + attach_type != BPF_TRACE_UPROBE_SESSION) return -EINVAL; return 0; case BPF_PROG_TYPE_SCHED_CLS: @@ -4031,7 +4190,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, #define BPF_F_ATTACH_MASK_BASE \ (BPF_F_ALLOW_OVERRIDE | \ BPF_F_ALLOW_MULTI | \ - BPF_F_REPLACE) + BPF_F_REPLACE | \ + BPF_F_PREORDER) #define BPF_F_ATTACH_MASK_MPROG \ (BPF_F_REPLACE | \ @@ -4595,6 +4755,8 @@ static int bpf_prog_get_info_by_fd(struct file *file, info.recursion_misses = stats.misses; info.verified_insns = prog->aux->verified_insns; + if (prog->aux->btf) + info.btf_id = btf_obj_id(prog->aux->btf); if (!bpf_capable()) { info.jited_prog_len = 0; @@ -4741,8 +4903,6 @@ static int bpf_prog_get_info_by_fd(struct file *file, } } - if (prog->aux->btf) - info.btf_id = btf_obj_id(prog->aux->btf); info.attach_btf_id = prog->aux->attach_btf_id; if (attach_btf) info.attach_btf_obj_id = btf_obj_id(attach_btf); @@ -4930,33 +5090,25 @@ static int bpf_link_get_info_by_fd(struct file *file, static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, union bpf_attr __user *uattr) { - int ufd = attr->info.bpf_fd; - struct fd f; - int err; - if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) return -EINVAL; - f = fdget(ufd); - if (!f.file) + CLASS(fd, f)(attr->info.bpf_fd); + if (fd_empty(f)) return -EBADFD; - if (f.file->f_op == &bpf_prog_fops) - err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, + if (fd_file(f)->f_op == &bpf_prog_fops) + return bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); - else if (f.file->f_op == &bpf_map_fops) - err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, + else if (fd_file(f)->f_op == &bpf_map_fops) + return bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); - else if (f.file->f_op == &btf_fops) - err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); - else if (f.file->f_op == &bpf_link_fops) - err = bpf_link_get_info_by_fd(f.file, f.file->private_data, + else if (fd_file(f)->f_op == &btf_fops) + return bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); + else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll) + return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); - else - err = -EINVAL; - - fdput(f); - return err; + return -EINVAL; } #define BPF_BTF_LOAD_LAST_FIELD btf_token_fd @@ -4991,15 +5143,34 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_ return btf_new_fd(attr, uattr, uattr_size); } -#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id +#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) { + struct bpf_token *token = NULL; + if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (attr->open_flags & ~BPF_F_TOKEN_FD) + return -EINVAL; + + if (attr->open_flags & BPF_F_TOKEN_FD) { + token = bpf_token_get_from_fd(attr->fd_by_id_token_fd); + if (IS_ERR(token)) + return PTR_ERR(token); + if (!bpf_token_allow_cmd(token, BPF_BTF_GET_FD_BY_ID)) { + bpf_token_put(token); + token = NULL; + } + } + + if (!bpf_token_capable(token, CAP_SYS_ADMIN)) { + bpf_token_put(token); return -EPERM; + } + + bpf_token_put(token); return btf_get_fd_by_id(attr->btf_id); } @@ -5084,7 +5255,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (!file) return -EBADF; - if (file->f_op == &bpf_link_fops) { + if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) { struct bpf_link *link = file->private_data; if (link->ops == &bpf_raw_tp_link_lops) { @@ -5144,14 +5315,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr, cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; struct bpf_map *map; - int err, ufd; - struct fd f; + int err; if (CHECK_ATTR(BPF_MAP_BATCH)) return -EINVAL; - ufd = attr->batch.map_fd; - f = fdget(ufd); + CLASS(fd, f)(attr->batch.map_fd); + map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); @@ -5171,7 +5341,7 @@ static int bpf_map_do_batch(const union bpf_attr *attr, else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr); else if (cmd == BPF_MAP_UPDATE_BATCH) - BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr); + BPF_DO_BATCH(map->ops->map_update_batch, map, fd_file(f), attr, uattr); else BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr); err_put: @@ -5179,7 +5349,6 @@ err_put: maybe_wait_bpf_programs(map); bpf_map_write_active_dec(map); } - fdput(f); return err; } @@ -5227,7 +5396,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) goto out; } if (prog->expected_attach_type == BPF_TRACE_RAW_TP) - ret = bpf_raw_tp_link_attach(prog, NULL); + ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) @@ -5242,6 +5411,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_SK_SKB: + ret = sock_map_link_create(attr, prog); + break; #ifdef CONFIG_NET case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); @@ -5264,9 +5437,11 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_KPROBE: if (attr->link_create.attach_type == BPF_PERF_EVENT) ret = bpf_perf_link_attach(attr, prog); - else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI) + else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || + attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) ret = bpf_kprobe_multi_link_attach(attr, prog); - else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI) + else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI || + attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION) ret = bpf_uprobe_multi_link_attach(attr, prog); break; default: @@ -5389,10 +5564,11 @@ static int link_detach(union bpf_attr *attr) return ret; } -static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) +struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) { return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT); } +EXPORT_SYMBOL(bpf_link_inc_not_zero); struct bpf_link *bpf_link_by_id(u32 id) { @@ -5618,7 +5794,7 @@ static int token_create(union bpf_attr *attr) return bpf_token_create(attr); } -static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) +static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; int err; @@ -5633,13 +5809,13 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; - err = security_bpf(cmd, &attr, size); + err = security_bpf(cmd, &attr, size, uattr.is_kernel); if (err < 0) return err; switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr); + err = map_create(&attr, uattr.is_kernel); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); @@ -5846,7 +6022,7 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) return ____bpf_sys_bpf(cmd, attr, size); } } -EXPORT_SYMBOL(kern_sys_bpf); +EXPORT_SYMBOL_NS(kern_sys_bpf, "BPF_INTERNAL"); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, @@ -5882,6 +6058,7 @@ static const struct bpf_func_proto bpf_sys_close_proto = { BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res) { + *res = 0; if (flags) return -EINVAL; @@ -5902,7 +6079,8 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_LONG, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, + .arg4_size = sizeof(u64), }; static const struct bpf_func_proto * @@ -5933,7 +6111,7 @@ const struct bpf_prog_ops bpf_syscall_prog_ops = { }; #ifdef CONFIG_SYSCTL -static int bpf_stats_handler(struct ctl_table *table, int write, +static int bpf_stats_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; @@ -5968,7 +6146,7 @@ void __weak unpriv_ebpf_notify(int new_state) { } -static int bpf_unpriv_handler(struct ctl_table *table, int write, +static int bpf_unpriv_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, unpriv_enable = *(int *)table->data; @@ -5992,7 +6170,7 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write, return ret; } -static struct ctl_table bpf_syscall_table[] = { +static const struct ctl_table bpf_syscall_table[] = { { .procname = "unprivileged_bpf_disabled", .data = &sysctl_unprivileged_bpf_disabled, @@ -6008,7 +6186,6 @@ static struct ctl_table bpf_syscall_table[] = { .mode = 0644, .proc_handler = bpf_stats_handler, }, - { } }; static int __init bpf_syscall_sysctl_init(void) |