summaryrefslogtreecommitdiff
path: root/kernel/bpf/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf/cgroup.c')
-rw-r--r--kernel/bpf/cgroup.c448
1 files changed, 428 insertions, 20 deletions
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 92a7d0cf8d13..0a00eaca6fae 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -15,19 +15,34 @@
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <net/sock.h>
+#include <net/bpf_sk_storage.h>
+
+#include "../cgroup/cgroup-internal.h"
DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+void cgroup_bpf_offline(struct cgroup *cgrp)
+{
+ cgroup_get(cgrp);
+ percpu_ref_kill(&cgrp->bpf.refcnt);
+}
+
/**
- * cgroup_bpf_put() - put references of all bpf programs
- * @cgrp: the cgroup to modify
+ * cgroup_bpf_release() - put references of all bpf programs and
+ * release all cgroup bpf data
+ * @work: work structure embedded into the cgroup to modify
*/
-void cgroup_bpf_put(struct cgroup *cgrp)
+static void cgroup_bpf_release(struct work_struct *work)
{
+ struct cgroup *cgrp = container_of(work, struct cgroup,
+ bpf.release_work);
enum bpf_cgroup_storage_type stype;
+ struct bpf_prog_array *old_array;
unsigned int type;
+ mutex_lock(&cgroup_mutex);
+
for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog_list *pl, *tmp;
@@ -42,8 +57,29 @@ void cgroup_bpf_put(struct cgroup *cgrp)
kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key);
}
- bpf_prog_array_free(cgrp->bpf.effective[type]);
+ old_array = rcu_dereference_protected(
+ cgrp->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+ bpf_prog_array_free(old_array);
}
+
+ mutex_unlock(&cgroup_mutex);
+
+ percpu_ref_exit(&cgrp->bpf.refcnt);
+ cgroup_put(cgrp);
+}
+
+/**
+ * cgroup_bpf_release_fn() - callback used to schedule releasing
+ * of bpf cgroup data
+ * @ref: percpu ref counter structure
+ */
+static void cgroup_bpf_release_fn(struct percpu_ref *ref)
+{
+ struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
+
+ INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
+ queue_work(system_wq, &cgrp->bpf.release_work);
}
/* count number of elements in the list.
@@ -98,7 +134,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
*/
static int compute_effective_progs(struct cgroup *cgrp,
enum bpf_attach_type type,
- struct bpf_prog_array __rcu **array)
+ struct bpf_prog_array **array)
{
enum bpf_cgroup_storage_type stype;
struct bpf_prog_array *progs;
@@ -136,17 +172,16 @@ static int compute_effective_progs(struct cgroup *cgrp,
}
} while ((p = cgroup_parent(p)));
- rcu_assign_pointer(*array, progs);
+ *array = progs;
return 0;
}
static void activate_effective_progs(struct cgroup *cgrp,
enum bpf_attach_type type,
- struct bpf_prog_array __rcu *array)
+ struct bpf_prog_array *old_array)
{
- struct bpf_prog_array __rcu *old_array;
-
- old_array = xchg(&cgrp->bpf.effective[type], array);
+ rcu_swap_protected(cgrp->bpf.effective[type], old_array,
+ lockdep_is_held(&cgroup_mutex));
/* free prog array after grace period, since __cgroup_bpf_run_*()
* might be still walking the array
*/
@@ -163,8 +198,13 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
* that array below is variable length
*/
#define NR ARRAY_SIZE(cgrp->bpf.effective)
- struct bpf_prog_array __rcu *arrays[NR] = {};
- int i;
+ struct bpf_prog_array *arrays[NR] = {};
+ int ret, i;
+
+ ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
+ GFP_KERNEL);
+ if (ret)
+ return ret;
for (i = 0; i < NR; i++)
INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
@@ -180,6 +220,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
cleanup:
for (i = 0; i < NR; i++)
bpf_prog_array_free(arrays[i]);
+
+ percpu_ref_exit(&cgrp->bpf.refcnt);
+
return -ENOMEM;
}
@@ -193,6 +236,9 @@ static int update_effective_progs(struct cgroup *cgrp,
css_for_each_descendant_pre(css, &cgrp->self) {
struct cgroup *desc = container_of(css, struct cgroup, self);
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
+ continue;
+
err = compute_effective_progs(desc, type, &desc->bpf.inactive);
if (err)
goto cleanup;
@@ -202,6 +248,14 @@ static int update_effective_progs(struct cgroup *cgrp,
css_for_each_descendant_pre(css, &cgrp->self) {
struct cgroup *desc = container_of(css, struct cgroup, self);
+ if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
+ if (unlikely(desc->bpf.inactive)) {
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+ continue;
+ }
+
activate_effective_progs(desc, type, desc->bpf.inactive);
desc->bpf.inactive = NULL;
}
@@ -441,10 +495,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
enum bpf_attach_type type = attr->query.attach_type;
struct list_head *progs = &cgrp->bpf.progs[type];
u32 flags = cgrp->bpf.flags[type];
+ struct bpf_prog_array *effective;
int cnt, ret = 0, i;
+ effective = rcu_dereference_protected(cgrp->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+
if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
- cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+ cnt = bpf_prog_array_length(effective);
else
cnt = prog_list_length(progs);
@@ -461,8 +519,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
}
if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
- return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
- prog_ids, cnt);
+ return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
} else {
struct bpf_prog_list *pl;
u32 id;
@@ -545,8 +602,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* The program type passed in via @type must be suitable for network
* filtering. No further check is performed to assert that.
*
- * This function will return %-EPERM if any if an attached program was found
- * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ * For egress packets, this function can return:
+ * NET_XMIT_SUCCESS (0) - continue with packet output
+ * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
+ * NET_XMIT_CN (2) - continue with packet output and notify TCP
+ * to call cwr
+ * -EPERM - drop packet
+ *
+ * For ingress packets, this function will return -EPERM if any
+ * attached program was found and if it returned != 1 during execution.
+ * Otherwise 0 is returned.
*/
int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
@@ -572,12 +637,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
/* compute pointers for the bpf prog */
bpf_compute_and_save_data_end(skb, &saved_data_end);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
- __bpf_prog_run_save_cb);
+ if (type == BPF_CGROUP_INET_EGRESS) {
+ ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
+ cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
+ } else {
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+ __bpf_prog_run_save_cb);
+ ret = (ret == 1 ? 0 : -EPERM);
+ }
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
skb->sk = save_sk;
- return ret == 1 ? 0 : -EPERM;
+
+ return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
@@ -867,6 +939,190 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
+#ifdef CONFIG_NET
+static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
+ enum bpf_attach_type attach_type)
+{
+ struct bpf_prog_array *prog_array;
+ bool empty;
+
+ rcu_read_lock();
+ prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
+ empty = bpf_prog_array_is_empty(prog_array);
+ rcu_read_unlock();
+
+ return empty;
+}
+
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+{
+ if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
+ return -EINVAL;
+
+ ctx->optval = kzalloc(max_optlen, GFP_USER);
+ if (!ctx->optval)
+ return -ENOMEM;
+
+ ctx->optval_end = ctx->optval + max_optlen;
+ ctx->optlen = max_optlen;
+
+ return 0;
+}
+
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+{
+ kfree(ctx->optval);
+}
+
+int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
+ int *optname, char __user *optval,
+ int *optlen, char **kernel_optval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = *level,
+ .optname = *optname,
+ };
+ int ret;
+
+ /* Opportunistic check to see whether we have any BPF program
+ * attached to the hook so we don't waste time allocating
+ * memory and locking the socket.
+ */
+ if (!cgroup_bpf_enabled ||
+ __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+ return 0;
+
+ ret = sockopt_alloc_buf(&ctx, *optlen);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ lock_sock(sk);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
+ &ctx, BPF_PROG_RUN);
+ release_sock(sk);
+
+ if (!ret) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (ctx.optlen == -1) {
+ /* optlen set to -1, bypass kernel */
+ ret = 1;
+ } else if (ctx.optlen > *optlen || ctx.optlen < -1) {
+ /* optlen is out of bounds */
+ ret = -EFAULT;
+ } else {
+ /* optlen within bounds, run kernel handler */
+ ret = 0;
+
+ /* export any potential modifications */
+ *level = ctx.level;
+ *optname = ctx.optname;
+ *optlen = ctx.optlen;
+ *kernel_optval = ctx.optval;
+ }
+
+out:
+ if (ret)
+ sockopt_free_buf(&ctx);
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
+
+int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
+ int optname, char __user *optval,
+ int __user *optlen, int max_optlen,
+ int retval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = level,
+ .optname = optname,
+ .retval = retval,
+ };
+ int ret;
+
+ /* Opportunistic check to see whether we have any BPF program
+ * attached to the hook so we don't waste time allocating
+ * memory and locking the socket.
+ */
+ if (!cgroup_bpf_enabled ||
+ __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+ return retval;
+
+ ret = sockopt_alloc_buf(&ctx, max_optlen);
+ if (ret)
+ return ret;
+
+ if (!retval) {
+ /* If kernel getsockopt finished successfully,
+ * copy whatever was returned to the user back
+ * into our temporary buffer. Set optlen to the
+ * one that kernel returned as well to let
+ * BPF programs inspect the value.
+ */
+
+ if (get_user(ctx.optlen, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ctx.optlen > max_optlen)
+ ctx.optlen = max_optlen;
+
+ if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ lock_sock(sk);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+ &ctx, BPF_PROG_RUN);
+ release_sock(sk);
+
+ if (!ret) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (ctx.optlen > max_optlen) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* BPF programs only allowed to set retval to 0, not some
+ * arbitrary value.
+ */
+ if (ctx.retval != 0 && ctx.retval != retval) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
+ put_user(ctx.optlen, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = ctx.retval;
+
+out:
+ sockopt_free_buf(&ctx);
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
+#endif
+
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
size_t *lenp)
{
@@ -1127,3 +1383,155 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
const struct bpf_prog_ops cg_sysctl_prog_ops = {
};
+
+static const struct bpf_func_proto *
+cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+#ifdef CONFIG_NET
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+#endif
+#ifdef CONFIG_INET
+ case BPF_FUNC_tcp_sock:
+ return &bpf_tcp_sock_proto;
+#endif
+ default:
+ return cgroup_base_func_proto(func_id, prog);
+ }
+}
+
+static bool cg_sockopt_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct bpf_sockopt))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct bpf_sockopt, retval):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type ==
+ BPF_CGROUP_GETSOCKOPT;
+ case offsetof(struct bpf_sockopt, optname):
+ /* fallthrough */
+ case offsetof(struct bpf_sockopt, level):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type ==
+ BPF_CGROUP_SETSOCKOPT;
+ case offsetof(struct bpf_sockopt, optlen):
+ return size == size_default;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case offsetof(struct bpf_sockopt, sk):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET;
+ break;
+ case offsetof(struct bpf_sockopt, optval):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct bpf_sockopt, optval_end):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case offsetof(struct bpf_sockopt, retval):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
+ default:
+ if (size != size_default)
+ return false;
+ break;
+ }
+ return true;
+}
+
+#define CG_SOCKOPT_ACCESS_FIELD(T, F) \
+ T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F))
+
+static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sockopt, sk):
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
+ break;
+ case offsetof(struct bpf_sockopt, level):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
+ break;
+ case offsetof(struct bpf_sockopt, optname):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
+ break;
+ case offsetof(struct bpf_sockopt, optlen):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
+ break;
+ case offsetof(struct bpf_sockopt, retval):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+ break;
+ case offsetof(struct bpf_sockopt, optval):
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
+ break;
+ case offsetof(struct bpf_sockopt, optval_end):
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
+ bool direct_write,
+ const struct bpf_prog *prog)
+{
+ /* Nothing to do for sockopt argument. The data is kzalloc'ated.
+ */
+ return 0;
+}
+
+const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
+ .get_func_proto = cg_sockopt_func_proto,
+ .is_valid_access = cg_sockopt_is_valid_access,
+ .convert_ctx_access = cg_sockopt_convert_ctx_access,
+ .gen_prologue = cg_sockopt_get_prologue,
+};
+
+const struct bpf_prog_ops cg_sockopt_prog_ops = {
+};