From 74e31ca850c1cddeca03503171dd145b6ce293b6 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 19 Feb 2019 19:53:02 +0100 Subject: bpf: add skb->queue_mapping write access from tc clsact The skb->queue_mapping already have read access, via __sk_buff->queue_mapping. This patch allow BPF tc qdisc clsact write access to the queue_mapping via tc_cls_act_is_valid_access. Also handle that the value NO_QUEUE_MAPPING is not allowed. It is already possible to change this via TC filter action skbedit tc-skbedit(8). Due to the lack of TC examples, lets show one: # tc qdisc add dev ixgbe1 clsact # tc filter add dev ixgbe1 ingress matchall action skbedit queue_mapping 5 # tc filter list dev ixgbe1 ingress The most common mistake is that XPS (Transmit Packet Steering) takes precedence over setting skb->queue_mapping. XPS is configured per DEVICE via /sys/class/net/DEVICE/queues/tx-*/xps_cpus via a CPU hex mask. To disable set mask=00. The purpose of changing skb->queue_mapping is to influence the selection of the net_device "txq" (struct netdev_queue), which influence selection of the qdisc "root_lock" (via txq->qdisc->q.lock) and txq->_xmit_lock. When using the MQ qdisc the txq->qdisc points to different qdiscs and associated locks, and HARD_TX_LOCK (txq->_xmit_lock), allowing for CPU scalability. Due to lack of TC examples, lets show howto attach clsact BPF programs: # tc qdisc add dev ixgbe2 clsact # tc filter add dev ixgbe2 egress bpf da obj XXX_kern.o sec tc_qmap2cpu # tc filter list dev ixgbe2 egress Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- net/core/filter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index b584cb42a803..85749f6ec789 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6279,6 +6279,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, queue_mapping): break; default: return false; @@ -6683,9 +6684,18 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, queue_mapping): - *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, queue_mapping, 2, - target_size)); + if (type == BPF_WRITE) { + *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + queue_mapping, + 2, target_size)); + } else { + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + queue_mapping, + 2, target_size)); + } break; case offsetof(struct __sk_buff, vlan_present): -- cgit From f7c917ba11a67632a8452ea99fe132f626a7a2cc Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:46 -0800 Subject: bpf: add bpf helper bpf_skb_ecn_set_ce This patch adds a new bpf helper BPF_FUNC_skb_ecn_set_ce "int bpf_skb_ecn_set_ce(struct sk_buff *skb)". It is added to BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can be attached to the ingress and egress path. The helper is needed because his type of bpf_prog cannot modify the skb directly. This helper is used to set the ECN field of ECN capable IP packets to ce (congestion encountered) in the IPv6 or IPv4 header of the skb. It can be used by a bpf_prog to manage egress or ingress network bandwdith limit per cgroupv2 by inducing an ECN response in the TCP sender. This works best when using DCTCP. Signed-off-by: Lawrence Brakmo Signed-off-by: Martin KaFai Lau Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 85749f6ec789..558ca72f2254 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5426,6 +5426,32 @@ static const struct bpf_func_proto bpf_tcp_sock_proto = { .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; +BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) +{ + unsigned int iphdr_len; + + if (skb->protocol == cpu_to_be16(ETH_P_IP)) + iphdr_len = sizeof(struct iphdr); + else if (skb->protocol == cpu_to_be16(ETH_P_IPV6)) + iphdr_len = sizeof(struct ipv6hdr); + else + return 0; + + if (skb_headlen(skb) < iphdr_len) + return 0; + + if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) + return 0; + + return INET_ECN_set_ce(skb); +} + +static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { + .func = bpf_skb_ecn_set_ce, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5585,6 +5611,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; + case BPF_FUNC_skb_ecn_set_ce: + return &bpf_skb_ecn_set_ce_proto; #endif default: return sk_filter_func_proto(func_id, prog); -- cgit